In [1]:
"""

DATA ENGINEERING ETL PIPELINE - XETRA DATASET

2. Reading multiple files.

Aim:
Write a production ready ETL pipeline using python and pandas.

Overview:
Xetra is a German stock exchange based in Frankfurt and operated by Deutsche Börse Group.
Data on daily trading activity was published in a public Amazon S3 bucket.
(Update: as of July 2022 the public data is no longer available, so an archival S3 bucket is used instead.)

Task:

The steps to be performed are outlined below:
    
    1) We first import the necessary libraries and functions for our project.
    2) We then create a couple of variables to define the API we're going to call,
    as well as where we're going to export our final transformed file.
    3) We then call the API (with the requests library) about the US population over the years
    and check whether the call was successful (with the api_answer function).
    4) Extract the API answer's text data, convert it to a pandas DataFrame,
    and apply a couple of transformations with the extract_data_into_dataframe function.
    5) Finally, we export our transformed pandas DataFrame to a CSV file in a local folder.
    
"""


"\n\nDATA ENGINEERING ETL PIPELINE - XETRA DATASET\n\n2. Reading multiple files.\n\nAim:\nWrite a production ready ETL pipeline using python and pandas.\n\nOverview:\nXetra is a German stock exchange based in Frankfurt operated by Deutsche Börse Group. \nData related to daily trading activity is stored publicly on the Amazon S3 database. \n(Update - as of July 2022 the data is no longer available. An archival S3 database will be used) \n\nTask:\n\nBelow outlines the steps to be performed:\n    \n    1) We first import the necessary libraries and functions for our project.\n    2) We then create a couple of variables to define the api we're going to call \n    as well as where we're looking to export our final transformed file.\n    3) We then call the API (with the requests library) about the US population over the years\n    and check whether the call has been successfull (with the api_answer function)\n    4) Export the API answer's text data, convert it to a pandas dataframe \n    a

In [2]:
import boto3 # AWS service management package
import pandas as pd # Data analysis library
from io import StringIO # String buffer to read CSV files

In [3]:
# Name of the S3 bucket that every later cell reads from.
BUCKET_NAME = 'xetra-1234'

# Build the boto3 S3 resource handle, then bind the bucket by name.
s3 = boto3.resource(service_name='s3')
bucket = s3.Bucket(name=BUCKET_NAME)



In [4]:
# Select the objects whose keys start with each date prefix ("<date>/").
bucket_obj1 = bucket.objects.filter(Prefix='2022-01-28/')
bucket_obj2 = bucket.objects.filter(Prefix='2022-02-28/')

# Iterate both filtered collections into one list: bucket_obj1's objects first, then bucket_obj2's.
bucket_objects = [*bucket_obj1, *bucket_obj2]

In [17]:
# Initialisation step: download the first listed object and reuse its header as the column template.
first_key = bucket_objects[0].key
csv_obj_init = bucket.Object(key=first_key).get().get('Body').read().decode('utf-8')  # Whole object as utf-8 text.
data = StringIO(csv_obj_init)  # File-like wrapper so pandas can parse the text.
df_init = pd.read_csv(data, sep=',')  # First object parsed as a pandas data frame.
df_all = pd.DataFrame(columns=df_init.columns)  # Empty master frame carrying df_init's columns.

In [28]:
# Iteration step: download every listed object, parse it as CSV and build one master data frame.
#
# The per-file frames are collected in a list and concatenated once after the loop.
# Compared with calling pd.concat inside the loop this:
#   (a) avoids re-copying all accumulated rows on every iteration,
#   (b) makes the cell idempotent - re-running it no longer stacks a second copy of
#       the data on top of the existing df_all, and
#   (c) avoids concatenating empty frames, whose dtype handling recent pandas deprecates.
frames = []
for obj in bucket_objects:
    csv_obj = bucket.Object(key=obj.key).get().get('Body') # Streaming body of this object
    csv_obj = csv_obj.read().decode('utf-8') # Whole object as utf-8 text
    data = StringIO(csv_obj) # File-like wrapper so pandas can parse the text
    df = pd.read_csv(data, delimiter=',') # Parse this object as a pandas data frame
    if not df.empty: # Header-only files contribute no rows, so leave them out
        frames.append(df)

if frames:
    # Last-listed file first: this reproduces the row order that prepending each frame
    # with pd.concat([df, df_all]) used to give. Row labels still restart for every file.
    # NOTE(review): pass ignore_index=True if a unique index is wanted downstream.
    df_all = pd.concat(frames[::-1])
else:
    # Nothing to load: keep only the column template, with no rows.
    df_all = df_all.iloc[0:0]

In [29]:
# Display the master data frame built from all downloaded objects.
df_all

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,US98956P1021,ZIM,ZIMMER BIOMET HLDGS DL-01,Common stock,EUR,4582018,2022-02-28,20:30,113.100,113.100,113.100,113.100,0,1
1,US9224171002,VEO,"VEECO INSTRUMENTS DL-,01",Common stock,EUR,6198311,2022-02-28,20:30,24.600,24.600,24.600,24.600,0,1
2,IT0005143547,EM8,ENERGICA MOTOR CO.S.P.A.,Common stock,EUR,7026075,2022-02-28,20:30,3.100,3.100,3.100,3.100,0,1
0,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2022-02-28,16:00,20.215,20.215,20.185,20.185,60,2
1,CA32076V1031,FMV,FIRST MAJESTIC SILVER,Common stock,EUR,2504197,2022-02-28,16:00,10.060,10.060,10.060,10.060,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16728,DK0061539921,VWSB,"VESTAS WIND SYS. DK -,20",Common stock,EUR,6354987,2022-01-28,08:59,23.270,23.270,23.270,23.270,37,1
16729,DE000A3E5D56,FPE,FUCHS PETROLUB NA ST O.N.,Common stock,EUR,6699157,2022-01-28,08:59,30.060,30.080,30.060,30.080,218,3
16730,DE000A3E5D64,FPE3,FUCHS PETROLUB VZO NA ON,Common stock,EUR,6699158,2022-01-28,08:59,38.100,38.100,38.100,38.100,34,1
16731,FR0000121147,FAU,FAURECIA EU INH EO 7,Common stock,EUR,7026036,2022-01-28,08:59,38.840,38.840,38.840,38.840,40,1


In [30]:
csv_obj # Show the decoded text of the last object read by the loop (in the recorded output it is a header row only)

'ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades\r\n'

In [31]:
# Restrict the master data frame to the columns listed in columns_use, in that order.
columns_use = [
    'ISIN',
    'Date',
    'Time',
    'StartPrice',
    'MaxPrice',
    'MinPrice',
    'EndPrice',
    'TradedVolume',
]
df_all = df_all.loc[:, columns_use]  # All rows, selected columns only.
df_all


Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume
0,US98956P1021,2022-02-28,20:30,113.100,113.100,113.100,113.100,0
1,US9224171002,2022-02-28,20:30,24.600,24.600,24.600,24.600,0
2,IT0005143547,2022-02-28,20:30,3.100,3.100,3.100,3.100,0
0,CA0679011084,2022-02-28,16:00,20.215,20.215,20.185,20.185,60
1,CA32076V1031,2022-02-28,16:00,10.060,10.060,10.060,10.060,11
...,...,...,...,...,...,...,...,...
16728,DK0061539921,2022-01-28,08:59,23.270,23.270,23.270,23.270,37
16729,DE000A3E5D56,2022-01-28,08:59,30.060,30.080,30.060,30.080,218
16730,DE000A3E5D64,2022-01-28,08:59,38.100,38.100,38.100,38.100,34
16731,FR0000121147,2022-01-28,08:59,38.840,38.840,38.840,38.840,40


In [32]:
# Drop every row that has a missing value in any column.
# The result is rebound instead of using inplace=True, so the frame object displayed by
# the previous cell is not mutated behind the reader's back and the step stays chainable.
df_all = df_all.dropna()
df_all.shape # (rows, columns) after filtering; a lower row count than before this cell means rows were dropped




(514496, 8)