"""

DATA ENGINEERING ETL PIPELINE - XETRA DATASET

4: Parameterisation of extraction.

Aim:
Write a production ready ETL pipeline using python and pandas.

Overview:
Xetra is a German stock exchange based in Frankfurt operated by Deutsche Börse Group. 
Data on daily trading activity was published in a public Amazon S3 bucket.
(Update: as of July 2022 the data is no longer available, so an archival S3 bucket is used instead.)

Task:
Use Jupyter Notebook as a prototyping tool to extract and transform source data.
Request and extract source data from cloud based web services.
Use loops and iteration to read and consolidate multiple source files.
Familiarise with pandas package functions to clean and transform output data. 

Below outlines the steps to be performed:
    
    1) Continuation of xetra_3 - transformed data via sorting, grouping and aggregation. 
    2) Parameterise Amazon bucket data filtering using datetime functions.
    3) Create a datetime argument to extract multiple data objects within a single list comprehension.
    4) Filter transformed output by specified argument date. 
    5) Print dataframe object.
    
"""

Data Extraction

In [414]:
import boto3 #AWS service management package.
import pandas as pd #Data analysis library.
from io import StringIO #String buffer to read CSV files.
from datetime import datetime, timedelta #Facilitate calulations relating to day of trade. 


In [424]:
#Date (YYYY-MM-DD) from which the transformed output should start.
arg_date = '2022-12-01'

#The day-over-day closing price change needs the prior day's data, so the
#extraction window opens one calendar day before the argument date.
#NOTE(review): a single calendar day back finds no prior data when arg_date
#follows a weekend or holiday - confirm whether a longer lookback is wanted.
requested_day = datetime.strptime(arg_date, '%Y-%m-%d').date()
arg_dt_obj = requested_day - timedelta(days=1)
arg_dt_obj

datetime.date(2022, 11, 30)

In [425]:
#Connect to Amazon S3 through the boto3 resource interface.
s3 = boto3.resource(service_name='s3')
#Handle to the bucket that holds the Xetra data.
bucket = s3.Bucket('xetra-1234')

In [436]:
#Keep only the objects whose key prefix (the text before the first '/') is a
#date on or after the extraction start date.
bucket_objects = []
for obj in bucket.objects.all():
    key_date = datetime.strptime(obj.key.split('/')[0], '%Y-%m-%d').date()
    if key_date >= arg_dt_obj:
        bucket_objects.append(obj)

In [427]:
#Initialisation step: read the first selected object only to learn the column
#layout, then create an empty master dataframe with those columns.
first_key = bucket_objects[0].key
first_response = bucket.Object(key=first_key).get()
csv_obj_init = first_response.get('Body').read().decode('utf-8') #Body decoded as UTF-8 text.
data = StringIO(csv_obj_init) #Wrap the text in a file-like buffer for pandas.
df_init = pd.read_csv(data, delimiter=',')
df_all = pd.DataFrame(columns=df_init.columns) #No rows yet, only the column names.

In [430]:
#Iteration step: read every selected object into its own dataframe, then build
#the master dataframe with a single concatenation.
#Concatenating inside the loop re-copies the accumulated data on every pass and
#appends duplicate rows if the cell is re-run; collecting the frames first
#avoids both. Rows are kept in listing order (later cells sort and group, so
#they do not rely on row order).
frames = []
for obj in bucket_objects:
    body = bucket.Object(key=obj.key).get().get('Body') #Streaming body of the object.
    csv_obj = body.read().decode('utf-8') #Decoded CSV text; stays bound for inspection in later cells.
    df = pd.read_csv(StringIO(csv_obj), delimiter=',') #Parse the CSV text into a dataframe.
    frames.append(df)

#With no selected objects there is nothing to concatenate (pd.concat raises on
#an empty list), so df_all is left as it was.
if frames:
    df_all = pd.concat(frames, ignore_index=True)

In [431]:
csv_obj #Display the decoded CSV text of the last object read by the loop, to view the column names.

'ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades\n'

Data Cleansing

In [432]:
#Restrict the dataset to the columns needed downstream; every other column is dropped.
columns_use = [
    'ISIN',
    'Date',
    'Time',
    'StartPrice',
    'MaxPrice',
    'MinPrice',
    'EndPrice',
    'TradedVolume',
]
df_all = df_all.loc[:, columns_use]

In [433]:
#Discard rows containing any missing value, then renumber the row index from zero.
df_all = df_all.dropna().reset_index(drop=True)
df_all.shape #Dimensions after cleansing; a lower row count than before means rows were dropped.

(981058, 8)

Data Transformation

In [434]:
#Order the rows by time once; both intraday lookups below use this ordering.
by_time = df_all.sort_values('Time')

#Opening price per ISIN and day: the StartPrice of the earliest row.
df_all['OpeningPrice'] = by_time.groupby(['ISIN','Date'])['StartPrice'].transform('first')

#Closing price per ISIN and day: the EndPrice of the latest row.
df_all['ClosingPrice'] = by_time.groupby(['ISIN','Date'])['EndPrice'].transform('last')

#Collapse to one row per ISIN and day. OpeningPrice and ClosingPrice are
#constant within each group, so 'min' simply picks that single value.
df_all = df_all.groupby(['ISIN','Date'], as_index = False).agg(
    OpeningPriceEUR = ('OpeningPrice', 'min'),
    ClosingPriceEUR = ('ClosingPrice', 'min'),
    MinPriceEUR = ('MinPrice', 'min'),
    MaxPriceEUR = ('MaxPrice', 'max'),
    DailyTradedVolume = ('TradedVolume', 'sum'),
)

#Percentage change in closing price between the current and previous day of trade.
#The previous close is held in a helper column and popped straight back out,
#so it never appears in the output.
df_all['PrevClosingPriceEUR'] = df_all.sort_values(by = 'Date').groupby(['ISIN'])['ClosingPriceEUR'].shift(1)
prev_close = df_all.pop('PrevClosingPriceEUR')
df_all['DeltaPCP%'] = (df_all['ClosingPriceEUR'] - prev_close)/prev_close*100

#Round the numeric columns to two decimal places.
df_all = df_all.round(decimals = 2)

#Keep only the rows dated on or after the argument date; the extra look-back
#day was needed solely for the previous close.
df_all = df_all.loc[df_all['Date'] >= arg_date]


In [435]:
#Display the transformed output: one row per ISIN and date, restricted to dates on or after arg_date.
df_all

Unnamed: 0,ISIN,Date,OpeningPriceEUR,ClosingPriceEUR,MinPriceEUR,MaxPriceEUR,DailyTradedVolume,DeltaPCP%
1,AT000000STR1,2022-12-01,38.10,38.50,38.10,38.60,1336,4.90
2,AT000000STR1,2022-12-02,38.85,38.60,38.55,38.95,5194,0.26
3,AT000000STR1,2022-12-03,39.00,38.60,38.60,39.00,132,0.00
5,AT00000FACC2,2022-12-01,7.82,8.00,7.75,8.12,3660,-6.65
6,AT00000FACC2,2022-12-02,7.85,7.85,7.85,7.85,0,-1.88
...,...,...,...,...,...,...,...,...
12854,XS2314660700,2022-12-02,20.51,20.43,20.42,20.65,634,-0.27
12855,XS2314660700,2022-12-03,20.42,20.20,20.20,20.51,542,-1.15
12857,XS2376095068,2022-12-01,34.08,34.66,34.08,34.67,7840,-5.04
12858,XS2376095068,2022-12-02,33.97,32.88,32.88,34.10,1200,-5.13
