#### DATA ENGINEERING PIPELINE - EURO FOREIGN EXCHANGE RATES

Aim:
Write a production ready data engineering pipeline using python and pandas.

Overview:
The European Central Bank provides daily online exchange rates. This script will parse data from an xml file, transform the data and export to csv. 

Task:

Below outlines the steps to be performed:
    
    1) Import the necessary libraries for the project.
    2) Define the functions that will faciliate the data engineering.
    3) Create variables to define the url that will be requested. 
    4) Request ECB exchange rate data xml data from a url. 
    5) Parse the xml content into a pandas dataframe.
    6) Cleanse and transform data using pandas library functions.
    7) Display the content as a pandas data frame.
    8) Export the content to a csv file. 

Import Packages

In [40]:
import pandas as pd # Data analysis library.
import ssl # Secure sockets layer package.
import urllib # Url handling module.
import sys # Runtime environment handling module.
import xml.etree.ElementTree as et # XML parsing library.
import datetime as dt # Datetime parsing library.

Define Methods

In [41]:
def parse_xml(xml_obj, xml_namespaces, column_names):
    # Parse xml content. 
    xml_tree = et.parse(xml_obj)
    xml_root = xml_tree.getroot()
    # Find required tags and store data via list comprehension.
    rows = xml_root.findall('.//ex:Cube', namespaces=xml_namespaces)
    xml_data = [[row.get('time'), row.get('currency'), row.get('rate')] for row in rows]
    # Create columns for dataframe and read in content.
    df = pd.DataFrame(xml_data, columns = column_names)
    return df
    
def create_ex_rate_pivot(df, column_names):
    # Create ex.rate pivot table by date and currency.
    df_out = pd.pivot_table(df, index=column_names[0], columns=column_names[1], values=column_names[2])
    # Add weekend dates missing from period to the table index.
    max_date = df.iloc[1, df.columns.get_loc(column_names[0])]
    min_date = df.iloc[-1, df.columns.get_loc(column_names[0])]
    date_idx = pd.date_range(min_date, max_date)
    df_out.index = pd.DatetimeIndex(df_out.index)
    df_out = df_out.reindex(date_idx)
    # Fill forward missing weekend ex.rate values. 
    df_out = df_out.ffill(axis=0)
    df_out = df_out.sort_index(ascending=0)
    return df_out

def request(url, xml_namespaces, column_names):
    # Disable security certificate checks for url requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        # If URL is valid print confirmation.
        xml_object = urllib.request.urlopen(url)
        print('Message: Requested URL is valid.')
        df = parse_xml(xml_object, xml_namespaces, column_names)

    except urllib.error.URLError:
        # If URL is invalid print error.
        print('Error: Requested URL is invalid.')
        sys.exit()
    return df

def transform(df, column_names):
    # Fill forward rows with missing dates. 
    df = df.ffill(axis=0)
    # Drop all other empty rows.
    df = df.dropna()
    # Create ex.rate pivot table.
    df[column_names[2]] = pd.to_numeric(df[column_names[2]])
    df_out = create_ex_rate_pivot(df,column_names)
    return df_out

def export(file_name, df, column_names):
    # Export to csv file 
    df.iloc[:,0] = df.iloc[:,0].astype("str")
    df.index.name = column_names[0]
    df.to_csv(file_name, index=True)
    
def rte_process(url, xml_namespaces, column_names, file_name):
    # Run full rte process and display csv output file.
    df_request = request(url, xml_namespaces, column_names)
    df_transform = transform(df_request, column_names)
    export(file_name,df_transform, column_names)
    display(pd.read_csv(file_name, dtype=str))


Define Main Function

In [42]:
def main():
    # Define variables.
    url = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist-90d.xml'
    xml_namespaces = {'ex': 'http://www.ecb.int/vocabulary/2002-08-01/eurofxref'}
    column_names = ['Date', 'Currency', 'Rate']
    file_name = 'ECB FX Rates.csv'
 
    # Request xml data from URL, transform and export as CSV.
    rte_process(url, xml_namespaces, column_names, file_name)

    
# Define main as program entry point if script is running as standalone and not as module.
if __name__=="__main__":
    main()

Message: Requested URL is valid.


Unnamed: 0,Date,AUD,BGN,BRL,CAD,CHF,CNY,CZK,DKK,GBP,...,NZD,PHP,PLN,RON,SEK,SGD,THB,TRY,USD,ZAR
0,2023-01-13,1.5586,1.9558,5.5512,1.4494,1.0051,7.2729,24.011,7.4387,0.888,...,1.7014,59.44,4.6888,4.9423,11.2528,1.4311,35.751,20.3196,1.0814,18.2482
1,2023-01-12,1.557,1.9558,5.5556,1.4439,1.0056,7.27,24.036,7.4385,0.8869,...,1.6937,59.292,4.692,4.944,11.273,1.4309,35.849,20.2312,1.0772,18.19495
2,2023-01-11,1.5588,1.9558,5.584,1.4429,0.9967,7.2807,24.027,7.4375,0.88673,...,1.6912,59.013,4.6819,4.9335,11.2783,1.4316,35.895,20.1793,1.0747,18.2122
3,2023-01-10,1.5616,1.9558,5.6471,1.4382,0.9908,7.2732,23.984,7.4375,0.8833,...,1.6879,58.751,4.695,4.9338,11.1963,1.429,36.024,20.1356,1.0723,18.29405
4,2023-01-09,1.5446,1.9558,5.6475,1.4299,0.9865,7.2546,23.99,7.4374,0.88048,...,1.6741,58.946,4.6963,4.9253,11.196,1.4244,35.789,20.0824,1.0696,18.25565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2022-10-21,1.5646,1.9558,5.1117,1.3465,0.9855,7.0504,24.511,7.4382,0.87728,...,1.7347,57.287,4.7885,4.9125,11.0868,1.3917,37.349,18.0988,0.973,18.0323
85,2022-10-20,1.5554,1.9558,5.1387,1.3461,0.9836,7.0858,24.525,7.4389,0.87258,...,1.7206,57.742,4.7728,4.9203,10.982,1.3959,37.36,18.2257,0.9811,17.95635
86,2022-10-19,1.5568,1.9558,5.1755,1.3479,0.981,7.0672,24.563,7.439,0.86993,...,1.7264,57.741,4.7878,4.9248,10.9448,1.3931,37.469,18.1793,0.9778,17.87225
87,2022-10-18,1.5557,1.9558,5.1795,1.3495,0.9792,7.0805,24.593,7.4393,0.86928,...,1.7251,57.897,4.804,4.9359,10.906,1.3963,37.422,18.2813,0.9835,17.812150000000003
