## DATA ENGINEERING PIPELINE - EURO FOREIGN EXCHANGE RATES

Overview:
The European Central Bank provides daily online exchange rates. This script parses the data from an XML file and transforms it into a lookup table.

Task:

The steps to be performed are outlined below:
    
    1) Import the necessary libraries for the project.
    2) Define the functions that will facilitate the data engineering.
    3) Create variables to define the url that will be requested. 
    4) Request the ECB exchange rate XML data from a url.
    5) Parse the xml content into a pandas dataframe.
    6) Cleanse and transform data using pandas library functions.
    7) Display the content as a pandas data frame.

In [149]:
import datetime as dt # Datetime parsing library.
import ssl # Secure sockets layer package.
import urllib # Url handling module.
import urllib.error # HTTPError raised by failed url requests.
import urllib.request # Opener and urlopen used to request urls.
import xml.etree.ElementTree as et # XML parsing library.

import pandas as pd # Data analysis library.

# Url requested by _parse; judging by the file name it serves the historical rate feed.
ECB_URL = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.xml'
# Prefix-to-namespace map passed to ElementTree so the 'ex:' prefix in XML_CHILD resolves.
XML_NAMESPACES = {'ex': 'http://www.ecb.int/vocabulary/2002-08-01/eurofxref'}
# Search path matching every Cube element at any depth below the document root.
XML_CHILD = './/ex:Cube'
# Default filter bounds used by get_rates. Both are evaluated once, when this
# code is executed, and both carry the time of day.
# NOTE(review): START_DATE is the *later* bound (today) and END_DATE the
# *earlier* one (one year back); this matches the newest-first index that
# _pivot produces, but the names read the other way round.
START_DATE = dt.datetime.today()
END_DATE = dt.datetime.today() - pd.DateOffset(years=1)

# Disable security certificate checks for url requests.
# NOTE(review): this swaps the default HTTPS context factory through private
# ssl attributes, so it is not limited to the ECB request and leaves HTTPS
# requests open to man-in-the-middle tampering. Confirm it is really needed.
ssl._create_default_https_context = ssl._create_unverified_context

def _parse():
    """Request the ECB xml document and flatten it into a dataframe.

    Every element matched by XML_CHILD yields one row holding its 'time',
    'currency' and 'rate' attributes. An attribute that the element does not
    carry is stored as None, so rows can be partially empty and are expected
    to be cleansed by the caller.

    Returns:
        pandas.DataFrame with the columns 'Date', 'Currency' and 'Rate';
        values are the raw attribute strings (or None).

    Raises:
        ValueError: If the requested content is not well-formed xml.
        urllib.error.URLError: Propagated unchanged when the request itself
            fails (HTTPError is a subclass).
    """
    # Configure request parameters.
    opener = urllib.request.build_opener()
    try:
        # The context manager closes the http response once parsing is done.
        with opener.open(ECB_URL) as xml_object:
            xml_root = et.parse(xml_object).getroot()
    except et.ParseError as err:
        # A plain string cannot be raised; wrap the failure in a real
        # exception and keep the parser error as its cause.
        raise ValueError('Error: Xml data parsing failed.') from err
    # Find required child element instances and store content via list comprehension.
    rows = xml_root.findall(XML_CHILD, namespaces=XML_NAMESPACES)
    xml_data = [[row.get('time'), row.get('currency'), row.get('rate')] for row in rows]
    # Create columns for dataframe and read in content.
    return pd.DataFrame(xml_data, columns=['Date', 'Currency', 'Rate'])

def _pivot(df):
    """Reshape long-format rates into a date-by-currency lookup table.

    Args:
        df: Dataframe with the columns 'Date', 'Currency' and 'Rate'.

    Returns:
        Dataframe indexed by every calendar day between the smallest and the
        largest 'Date' value, one column per currency, newest day first.
        Days without a published rate repeat the latest earlier rate.
    """
    dates = df['Date']
    # Continuous daily calendar spanning the whole period, so days missing
    # from the source (e.g. weekends) get a row of their own.
    calendar = pd.date_range(start=dates.min(), end=dates.max())
    table = df.pivot_table(index='Date', columns='Currency', values='Rate')
    table.index = pd.DatetimeIndex(table.index)
    # Fill forward in chronological order first, then flip to newest-first.
    filled = table.reindex(calendar).ffill(axis=0)
    return filled.sort_index(ascending=False)

def _read(url, xml_namespaces, column_names):
    """Request a url and parse its xml content into a dataframe.

    Args:
        url: Address of the xml document to request.
        xml_namespaces: Forwarded unchanged to _parse_xml.
        column_names: Forwarded unchanged to _parse_xml; also used as the
            columns of the empty dataframe returned on failure.

    Returns:
        The dataframe produced by _parse_xml, or an empty dataframe when the
        server answers the request with an http error.
    """
    try:
        # Parse the requested url content and read into dataframe.
        # NOTE(review): assumes _parse_xml (not visible here) reads the
        # response completely before returning, since it is closed afterwards.
        with urllib.request.urlopen(url) as xml_object:
            df = _parse_xml(xml_object, xml_namespaces, column_names)
    except urllib.error.HTTPError as e:
        # HTTPError.code is an int, so it must be compared to 404, not '404'.
        if e.code == 404:
            # If URL is invalid create empty dataframe and print error.
            print('Error: Requested URL is invalid.')
        else:
            # Any other http error also means no content could be read.
            print('Error: Request failed with HTTP status ' + str(e.code) + '.')
        # Without this assignment the return below would hit an unbound name.
        df = pd.DataFrame(columns=column_names)
    return df

def _transform():
    """Read the fx rate data, cleanse it and return the pivoted rate table.

    Returns:
        The dataframe built by _pivot from the cleansed rows.

    Raises:
        ValueError: If the fx rate url answers with http status 404.
        urllib.error.HTTPError: Any other http error, re-raised unchanged.
    """
    # Read data into pandas dataframe.
    try:
        df = _parse()
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # A plain string cannot be raised; use a real exception type.
            raise ValueError('Error: Requested fx rate url is invalid.') from e
        # Do not swallow other http errors: df would be unbound below.
        raise
    # Cleanse rows.
    # Only the date is carried forward. _parse emits rows whose missing
    # attributes are None; presumably a date-only row precedes the rate rows
    # of that day (verify against the feed). Filling the whole frame would
    # also copy the previous row's currency and rate into such a date-only
    # row, creating a duplicate (date, currency) pair that the pivot averages.
    df['Date'] = df['Date'].ffill()
    # Drop rows that still lack a date, currency or rate, and convert the
    # rate strings to numbers on the resulting copy.
    df = df.dropna().assign(Rate=lambda frame: pd.to_numeric(frame['Rate']))
    # Create fx rate pivot.
    return _pivot(df)
    
def get_rates(start_date=None, end_date=None):
    """Return the fx rate table restricted to a date range.

    Args:
        start_date: Bound of the range as a 'YYYY-MM-DD' string, or None to
            use START_DATE.
        end_date: Other bound of the range as a 'YYYY-MM-DD' string, or None
            to use END_DATE.

    Returns:
        The rows of the transformed rate table that lie between the two
        bounds, newest day first. The bounds may be given in either order.

    Raises:
        ValueError: If a supplied date string does not match '%Y-%m-%d'.
    """
    # Check if default start date needs applying, else parse input date.
    if start_date is None:
        start_date = START_DATE
    else:
        start_date = dt.datetime.strptime(start_date, '%Y-%m-%d')
    # Check if default end date needs applying, else parse input date.
    if end_date is None:
        end_date = END_DATE
    else:
        end_date = dt.datetime.strptime(end_date, '%Y-%m-%d')
    # Apply date filter range to transformed xml data.
    df_out = _transform()
    # The table index is sorted newest-first, so a label slice must run from
    # the later bound to the earlier one. Ordering the bounds here keeps the
    # defaults working and stops a chronological (start < end) pair from
    # silently returning an empty table.
    newest = max(start_date, end_date)
    oldest = min(start_date, end_date)
    return df_out.loc[newest:oldest]

def convert_from_eur(df, eur_amount, f_currency, ref_date, start_date=None, end_date=None):
    """Append a column with EUR amounts converted into a foreign currency.

    Args:
        df: Dataframe holding the amounts to convert.
        eur_amount: Name of the column in df with the EUR amounts.
        f_currency: Currency column to pick from the rate table.
        ref_date: Name of the column in df matched against the rate dates.
        start_date: Forwarded to get_rates.
        end_date: Forwarded to get_rates.

    Returns:
        A left merge of df with the selected rate column, plus a column named
        'Amount ' + f_currency holding amount multiplied by rate.
    """
    rate_table = get_rates(start_date, end_date)
    currency_rates = rate_table[f_currency]
    # Left join keeps every input row; rows without a matching date get NaN.
    merged = pd.merge(df, currency_rates, how='left', left_on=ref_date, right_index=True)
    amounts = merged[eur_amount].astype(float)
    rates = merged[f_currency].astype(float)
    merged['Amount ' + f_currency] = amounts * rates
    return merged

def convert_to_eur(df, loc_amount, f_currency, ref_date, start_date=None, end_date=None):
    """Append a column with foreign currency amounts converted into EUR.

    Args:
        df: Dataframe holding the amounts to convert.
        loc_amount: Name of the column in df with the foreign currency amounts.
        f_currency: Currency column to pick from the rate table.
        ref_date: Name of the column in df matched against the rate dates.
        start_date: Forwarded to get_rates.
        end_date: Forwarded to get_rates.

    Returns:
        A left merge of df with the selected rate column, plus an
        'Amount EUR' column holding amount divided by rate.
    """
    rate_table = get_rates(start_date, end_date)
    currency_rates = rate_table[f_currency]
    # Left join keeps every input row; rows without a matching date get NaN.
    merged = pd.merge(df, currency_rates, how='left', left_on=ref_date, right_index=True)
    amounts = merged[loc_amount].astype(float)
    rates = merged[f_currency].astype(float)
    merged['Amount EUR'] = amounts / rates
    return merged

def main():
    """Show the fx rate table for the default date range."""
    import builtins

    # 'display' is normally only available inside IPython/Jupyter sessions
    # (presumably injected as a builtin there); fall back to print so the
    # script also runs under a plain Python interpreter.
    show = globals().get('display') or getattr(builtins, 'display', print)
    show(get_rates())

if __name__ == '__main__':
    main()


Unnamed: 0,net,idate,NOK,Amount EUR
2023-10-06,11.6045,2023-10-06,11.6090,0.999612
2023-10-05,11.5975,2023-10-05,11.5703,1.002351
2023-10-04,11.5855,2023-10-04,11.5135,1.006254
2023-10-03,11.6375,2023-10-03,11.4258,1.018528
2023-10-02,11.5833,2023-10-02,11.3225,1.023034
...,...,...,...,...
2022-10-13,11.0098,2022-10-13,10.3525,1.063492
2022-10-12,11.0200,2022-10-12,10.4145,1.058140
2022-10-11,11.0015,2022-10-11,10.4235,1.055452
2022-10-10,10.9502,2022-10-10,10.3378,1.059239
