## DATA ENGINEERING PIPELINE - EURO FOREIGN EXCHANGE RATES

Aim:
Write a production ready data engineering pipeline using python and pandas.

Overview:
The European Central Bank provides daily online exchange rates. This script will parse data from an xml file, transform the data and export to csv. 

Task:

Below outlines the steps to be performed:
    
    1) Import the necessary libraries for the project.
    2) Define the functions that will faciliate the data engineering.
    3) Create variables to define the url that will be requested. 
    4) Request ECB exchange rate data xml data from a url. 
    5) Parse the xml content into a pandas dataframe.
    6) Cleanse and transform data using pandas library functions.
    7) Display the content as a pandas data frame.
    8) Export the content to a csv file. 

#### Import Packages

In [60]:
import pandas as pd # Data analysis library.
import ssl # Secure sockets layer package.
import urllib # Url handling module.
import xml.etree.ElementTree as et # XML parsing library.
import datetime as dt # Datetime parsing library.

#### Define Methods

In [61]:
def parse_xml(xml_obj, xml_namespaces, column_names):
    # Parse xml content. 
    try:
        xml_tree = et.parse(xml_obj)
        xml_root = xml_tree.getroot()
        # Find required tags and store data via list comprehension.
        rows = xml_root.findall('.//ex:Cube', namespaces=xml_namespaces)
        xml_data = [[row.get('time'), row.get('currency'), row.get('rate')] for row in rows]
        # Create columns for dataframe and read in content.
        df = pd.DataFrame(xml_data, columns = column_names)
        print('Message: Xml data parsing successful.')
    except et.ParseError:
            # Return empty dataframe if parse error.
            df = pd.DataFrame()
            print('Error: Xml data parsing failed.')
    return df
    
def create_ex_rate_pivot(df, column_names):
    # Create ex.rate pivot table by date and currency.
    df_out = pd.pivot_table(df, index=column_names[0], columns=column_names[1], values=column_names[2])
    # Add weekend dates missing from period to the table index.
    date_idx = pd.date_range(df['Date'].min(), df['Date'].max())
    df_out.index = pd.DatetimeIndex(df_out.index)
    df_out = df_out.reindex(date_idx)
    # Fill forward missing weekend ex.rate values. 
    df_out = df_out.ffill(axis=0)
    df_out = df_out.sort_index(ascending=0)
    return df_out

def read(url, xml_namespaces, column_names):
    # Disable security certificate checks for url requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        # Parse the requested url content and read into dataframe. 
        xml_object = urllib.request.urlopen(url)
        df = parse_xml(xml_object, xml_namespaces, column_names)
    except urllib.error.HTTPError as e:
        if e.code == '404':
            # If URL is invalid create empty dataframe and print error.
            print('Error: Requested URL is invalid.')
        else:
            # If URL is valid read content into dataframe and print confirmation. 
            print('Message: Requested URL is valid.')
    return df

def transform(df, column_names):
    # Fill forward rows with missing dates. 
    df= df.ffill(axis=0)
    # Drop all other empty rows.
    df= df.dropna()
    # Create ex.rate pivot table.
    df[column_names[2]] = pd.to_numeric(df[column_names[2]])
    df_out = create_ex_rate_pivot(df,column_names)
    return df_out

def export(file_name, df, column_names):
    # Export to csv file 
    df.iloc[:,0] = df.iloc[:,0].astype("str")
    df.index.name = column_names[0]
    df.to_csv(file_name, index=True)
    
def rte_process(url, xml_namespaces, column_names, file_name):
    # Run full rte process and display csv output file.
    df_request = read(url, xml_namespaces, column_names)
    df_transform = transform(df_request, column_names)
    export(file_name,df_transform, column_names)
    display(pd.read_csv(file_name, dtype=str))


#### Define Main Function

In [62]:
def main():
    # Define variables.
    url = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist-90d.xml'
    xml_namespaces = {'ex': 'http://www.ecb.int/vocabulary/2002-08-01/eurofxref'}
    column_names = ['Date', 'Currency', 'Rate']
    file_name = 'ECB FX Rates.csv'
 
    # Request xml data from URL, transform and export as CSV.
    rte_process(url, xml_namespaces, column_names, file_name)

# Define main as program entry point if script is running as standalone and not as module.
if __name__=="__main__":
    main()

Message: Xml data parsing successful.


Unnamed: 0,Date,AUD,BGN,BRL,CAD,CHF,CNY,CZK,DKK,GBP,...,NZD,PHP,PLN,RON,SEK,SGD,THB,TRY,USD,ZAR
0,2023-01-27,1.5289,1.9558,5.5104,1.4479,1.0017,7.369,23.826,7.4378,0.87885,...,1.6759,59.187,4.7085,4.8965,11.2108,1.4277,35.702,20.4365,1.0865,18.7185
1,2023-01-26,1.5308,1.9558,5.5572,1.4568,1.0002,7.3893,23.818,7.4383,0.87945,...,1.6799,59.301,4.7195,4.8818,11.1763,1.4292,35.687,20.4961,1.0895,18.665599999999998
2,2023-01-25,1.536,1.9558,5.569,1.4544,1.002,7.3778,23.808,7.4381,0.88248,...,1.6792,59.35,4.7158,4.9035,11.1335,1.4307,35.718,20.4658,1.0878,18.6436
3,2023-01-24,1.547,1.9558,5.6164,1.4517,1.0053,7.3642,23.874,7.4399,0.88368,...,1.6743,59.122,4.713,4.9171,11.0995,1.4337,35.609,20.4234,1.0858,18.7216
4,2023-01-23,1.5529,1.9558,5.6372,1.4523,1.0013,7.373,23.881,7.4393,0.8797,...,1.6778,59.345,4.7113,4.9202,11.1183,1.4328,35.63,20.4478,1.0871,18.714199999999998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2022-11-04,1.5311,1.9558,4.9682,1.3351,0.9863,7.0894,24.422,7.4419,0.87478,...,1.6769,57.672,4.6825,4.8893,10.8538,1.3891,36.906,18.3845,0.9872,17.7783
85,2022-11-03,1.5517,1.9558,5.0262,1.3452,0.9889,7.1367,24.539,7.4433,0.87228,...,1.6957,57.463,4.709,4.9013,10.932,1.3878,37.091,18.1602,0.9753,17.9078
86,2022-11-02,1.5426,1.9558,5.0964,1.347,0.9861,7.2156,24.506,7.4431,0.861,...,1.6844,57.841,4.7035,4.912,10.9065,1.3983,37.314,18.4488,0.9908,17.98905
87,2022-11-01,1.5409,1.9558,5.1337,1.3469,0.9878,7.2165,24.484,7.4438,0.86058,...,1.6876,57.786,4.7053,4.9138,10.874,1.4017,37.45,18.5216,0.9947,17.9705
