"""

DATA ENGINEERING ETL PIPELINE - XETRA DATASET

5: Saving output to Amazon s3 and code restructuring.

Aim:
Write a production ready ETL pipeline using python and pandas.

Overview:
Xetra is a German stock exchange based in Frankfurt operated by Deutsche Börse Group. 
Data related to daily trading activity is stored publicly on the Amazon S3 database. 
(Update - as of July 2022 the data is no longer available. An archival S3 database will be used) 

Task:
Use jupyter notebook as a prototyping tool to extract and transform source data.
Request and extract source data from cloud based web services.
Use loops and iteration to read and consolidate multiple source files.
Familiarise with pandas package functions to clean and transform output data. 

Below outlines the steps to be performed:
    
    1) Continuation of xetra_4 - parameterising data extraction. 
    2) Load data to Amazon S3 bucket using pyarrow package. 
    3) Check that loaded data can also be extracted within script. 
    4) Code improvements:
        - Define variables at the beginning of script.
        - Add comments and functions to improve code readability.
        - Remove unnecessary lines. 
    
"""

Define Packages

In [87]:
#Packages to be imported
import boto3 #AWS service management package.
import pandas as pd #Data analysis library.
from io import StringIO #String buffer to read CSV files.
from io import BytesIO #Bytes buffer to read PARQUET files.
from datetime import datetime, timedelta #Facilitate calulations relating to day of trade. 


Define Variables

In [88]:
#Run parameters supplied by the user.
arg_date = '2022-12-03' #First day to keep in the final report.
arg_date_format = '%Y-%m-%d' #strptime pattern that arg_date is written in.
src_bucket = 'xetra-1234' #Name of the S3 bucket that is read from.
trg_bucket = 'xetra-probe' #Name of the S3 bucket that is written to.

#Source columns that survive the cleansing step; everything else is dropped.
columns_use = [
    'ISIN',
    'Date',
    'Time',
    'StartPrice',
    'MaxPrice',
    'MinPrice',
    'EndPrice',
    'TradedVolume',
]

#Key of the report object in the target bucket, suffixed with the run timestamp.
run_stamp = datetime.today().strftime('%Y%m%d_%H%M%S')
bucket_key = f'xetra_daily_report{run_stamp}.parquet'

Data Extraction

In [89]:
#Parse the argument date and step back one calendar day in a single expression.
#The day-over-day calculation in the transformation step needs the previous day's data.
arg_dt_obj = datetime.strptime(arg_date, arg_date_format).date() - timedelta(days=1)

#Open the source bucket through the Amazon S3 resource interface.
s3 = boto3.resource('s3')
bucket = s3.Bucket(src_bucket)

#Select the objects whose leading key segment (the text before the first '/')
#parses to a date on or after the cut-off day.
bucket_objects = [
    obj
    for obj in bucket.objects.all()
    if datetime.strptime(obj.key.split('/')[0], '%Y-%m-%d').date() >= arg_dt_obj
]

In [90]:
#Helper that downloads one CSV object from the source bucket as a pandas dataframe.
def csv_to_df(filename):
    """Return the CSV object stored under key *filename* as a dataframe.

    The object is fetched through the module-level ``bucket``, its body is
    decoded as UTF-8 text and parsed as comma-delimited CSV.
    """
    response = bucket.Object(key=filename).get() #Request the object from S3.
    csv_text = response.get('Body').read().decode('utf-8') #Full payload as text.
    return pd.read_csv(StringIO(csv_text), delimiter=',') #Parse from an in-memory text buffer.

#Load each selected object into its own dataframe, then stack them into one master dataframe.
source_frames = [csv_to_df(obj.key) for obj in bucket_objects]
df_all = pd.concat(source_frames, ignore_index=True)

Data Cleansing

In [91]:
#Remove unnecessary columns and missing values from the data.
df_all = (
    df_all.loc[:, columns_use] #Keep only the configured columns.
    .dropna() #Discard every row that contains a missing value.
    .reset_index(drop=True) #Renumber the rows (row index) from zero.
)
df_all.shape #Show (rows, columns) to see how much was filtered out.

(353475, 8)

Data Transformation

In [92]:
#Order the trades by time once and group them per ISIN and day, so that
#'first' and 'last' pick the earliest and the latest trade of each group.
trades_per_isin_day = df_all.sort_values('Time').groupby(['ISIN','Date'])

#Opening price: StartPrice of the earliest trade per ISIN on a particular day.
df_all['OpeningPrice'] = trades_per_isin_day['StartPrice'].transform('first')

#Closing price: EndPrice of the latest trade per ISIN on a particular day.
df_all['ClosingPrice'] = trades_per_isin_day['EndPrice'].transform('last')

#Collapse the trades into one row per ISIN and day.
#Opening/closing prices are constant within a group, so 'min' simply picks that value.
daily_aggregations = {
    'OpeningPriceEUR': ('OpeningPrice', 'min'),
    'ClosingPriceEUR': ('ClosingPrice', 'min'),
    'MinPriceEUR': ('MinPrice', 'min'),
    'MaxPriceEUR': ('MaxPrice', 'max'),
    'DailyTradedVolume': ('TradedVolume', 'sum'),
}
df_all = df_all.groupby(['ISIN','Date'], as_index = False).agg(**daily_aggregations)

#Closing price of the preceding row (by Date) of the same ISIN.
df_all['PrevClosingPriceEUR'] = df_all.sort_values(by = 'Date').groupby(['ISIN'])['ClosingPriceEUR'].shift(1)

#Percentage change in closing price between the current and the previous day of trade.
prev_close = df_all['PrevClosingPriceEUR']
df_all['DeltaPrevClosingPriceEUR%'] = (df_all['ClosingPriceEUR'] - prev_close)/prev_close*100

#Round the aggregated figures to two decimals.
df_all = df_all.round(decimals = 2)

#Drop the extra look-back day: keep only rows dated on or after the argument date.
#NOTE(review): compares Date against the arg_date string - presumably Date holds 'YYYY-MM-DD' text; confirm.
on_or_after_arg_date = df_all['Date']>=arg_date
df_all = df_all[on_or_after_arg_date]

Data Loading

In [93]:
#Serialise the report to parquet inside an in-memory bytes buffer (row index omitted).
output = BytesIO()
df_all.to_parquet(output, index=False)
parquet_payload = output.getvalue()

#Upload the serialised report to the Amazon S3 target bucket under the run-specific key.
target_bucket = s3.Bucket(trg_bucket)
target_bucket.put_object(Key=bucket_key, Body=parquet_payload)

s3.Object(bucket_name='xetra-probe', key='xetra_daily_report20221204_193420.parquet')

Data Reading

In [94]:
#Verify output by reading the report written by this run back into the workflow.
#Only the object stored under bucket_key is fetched. Looping over every object in
#the target bucket would download and parse each one while keeping just the last,
#and would fail on any object that is not a parquet file.
prq_obj = target_bucket.Object(key=bucket_key).get().get('Body').read() #Raw parquet bytes.
data = BytesIO(prq_obj) #Wrap the bytes in a file-like buffer for pandas.
df_report = pd.read_parquet(data)

#Print dataframe output. 
df_report

Unnamed: 0,ISIN,Date,OpeningPriceEUR,ClosingPriceEUR,MinPriceEUR,MaxPriceEUR,DailyTradedVolume,PrevClosingPriceEUR,DeltaPrevClosingPriceEUR%
0,AT000000STR1,2022-12-03,39.00,38.60,38.60,39.00,66,38.60,0.00
1,AT000000STR1,2022-12-04,38.85,38.90,38.80,38.90,250,38.60,0.78
2,AT00000FACC2,2022-12-03,8.19,8.57,8.19,8.57,8013,7.85,9.17
3,AT00000FACC2,2022-12-04,8.46,8.68,8.46,8.78,3183,8.57,1.28
4,AT0000606306,2022-12-03,26.24,26.60,26.24,26.78,4072,26.22,1.45
...,...,...,...,...,...,...,...,...,...
6392,XS2314659447,2022-12-04,8.60,8.56,8.56,8.60,184,8.61,-0.61
6393,XS2314660700,2022-12-03,20.42,20.20,20.20,20.51,271,20.43,-1.15
6394,XS2314660700,2022-12-04,20.01,20.21,20.01,20.27,42,20.20,0.03
6395,XS2376095068,2022-12-03,32.75,32.20,32.20,32.75,0,32.88,-2.07
