#### DATA ENGINEERING PIPELINE - INTRASTAT DECLARATION

Aim:
Write a production ready data engineering pipeline using python and pandas.

Overview:
Intrastat is a system that collects information relating to the trade of goods. This script will transform sample invoice data from a fictious company into a submissable Swedish intrastat declaration.

Task:

Below outlines the steps to be performed:
    
    01) Import the necessary libraries for the project.
    02) Define the functions that will faciliate the data engineering.
    03) Read the sample intrastat data into a pd dataframe.
    04) Request intrastat commodity code list URL and read content into pd dataframe.
    05) Verify sample data using commodity code list.
    06) Request ECB FX rates from URL, parse xml file and read data into pd dataframe.
    07) Cleanse and transform ex.rate data into pivot table.
    08) Apply daily exchange rate calculation on sample invoice values.
    09) Apply final transformations to intrastat output file. 
    10) Display the content of the prepared file.
    11) Export the content as an excel file, submissible to the Swedish stats authority.

Import Packages

In [74]:
import pandas as pd # Data analysis library.
import numpy as np # Array and matrice libary
import ssl # Secure sockets layer package.
import urllib # Url handling module.
import sys # Runtime environment handling module.
import xml.etree.ElementTree as et # XML parsing library.
import datetime as dt # Datetime parsing library.

Define Methods

In [75]:
def cc_request(url):
    # Disable security certificate checks for url requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        # If URL is valid print confirmation.
        urllib.request.urlopen(url)
        print('Message: Requested commodity code url is valid.')
        # Read data into pandas dataframe.
        df = pd.read_excel(url)
    except urllib.error.URLError:
        # If URL is invalid print error.
        print('Error: Requested commodity code url is invalid.')
        sys.exit()
    return df

def cc_transform(df, column_rename):
    # Pad left first column to CN8 format
    df.iloc[:,0] = df.iloc[:,0].astype("str").str.pad(8, side='left', fillchar='0')
    # Rename first column to CN8. 
    df.columns.values[0] = column_rename
    return df

def cc_export(file_name, df):
    # Export to csv file 
    df.iloc[:,0] = df.iloc[:,0].astype("str")
    df.to_csv(file_name, index=False)
    
def cc_rte_process(url, column_rename, file_name):
    # Run full rte process and display csv output file.
    df_request = cc_request(url)
    df_transform = cc_transform(df_request, column_rename)
    cc_export(file_name,df_transform)
    #display(pd.read_csv(file_name, dtype=str))
    return df_transform

def fx_parse_xml(xml_obj, xml_namespaces, column_names):
    # Parse xml content. 
    xml_tree = et.parse(xml_obj)
    xml_root = xml_tree.getroot()
    # Find required tags and store data via list comprehension.
    rows = xml_root.findall('.//ex:Cube', namespaces=xml_namespaces)
    xml_data = [[row.get('time'), row.get('currency'), row.get('rate')] for row in rows]
    # Create columns for dataframe and read in content.
    df = pd.DataFrame(xml_data, columns = column_names)
    return df
    
def fx_create_pivot(df, column_names):
    # Create fx rate pivot table by date and currency.
    df_out = pd.pivot_table(df, index=column_names[0], columns=column_names[1], values=column_names[2])
    # Add weekend dates missing from period to the table index.
    max_date = df.iloc[1, df.columns.get_loc(column_names[0])]
    min_date = df.iloc[-1, df.columns.get_loc(column_names[0])]
    date_idx = pd.date_range(min_date, max_date)
    df_out.index = pd.DatetimeIndex(df_out.index)
    df_out = df_out.reindex(date_idx)
    # Fill forward missing weekend fx rate values. 
    df_out = df_out.ffill(axis=0)
    df_out = df_out.sort_index(ascending=0)
    return df_out

def fx_request(url, xml_namespaces, column_names):
    # Disable security certificate checks for url requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        # If URL is valid print confirmation.
        xml_object = urllib.request.urlopen(url)
        print('Message: Requested fx rate url is valid.')
        df = fx_parse_xml(xml_object, xml_namespaces, column_names)

    except urllib.error.URLError:
        # If URL is invalid print error.
        print('Error: Requested fx rate url is invalid.')
        sys.exit()
    return df

def fx_transform(df, column_names):
    # Fill forward rows with missing dates. 
    df = df.ffill(axis=0)
    # Drop all other empty rows.
    df = df.dropna()
    # Create ex.rate pivot table.
    df[column_names[2]] = pd.to_numeric(df[column_names[2]])
    df_out = fx_create_pivot(df,column_names)
    return df_out

def fx_export(file_name, df, column_names):
    # Export to csv file 
    df.iloc[:,0] = df.iloc[:,0].astype("str")
    df.index.name = column_names[0]
    df.to_csv(file_name, index=True)
    
    
def fx_rte_process(url, xml_namespaces, column_names, file_name):
    # Run full rte process and display csv output file.
    df_request = fx_request(url, xml_namespaces, column_names)
    df_transform = fx_transform(df_request, column_names)
    fx_export(file_name,df_transform, column_names)
    #display(pd.read_csv(file_name, dtype=str))
    return df_transform

def src_return_mot(mode):
    mot_switch={
    'Sea':'1', 
    'Rail':'2',
    'Road':'3',
    'Air':'4',
    }
    return mot_switch.get(mode,"Invalid mode of transport")

def src_checks(df_src, df_cc):
    df_out = pd.merge(df_src, df_cc, how='left', left_on='Commodity Code', right_on='CN8')
    df_out['CC Check'] = np.where(df_out['Commodity Code'] == df_out['CN8'], 'OK', '`ERROR')
    df_out['Partner VAT'] = np.where(df_out['Transaction'] == 'B2C', 'QV999999999999', df_out['Partner VAT'])
    return df_out

def src_fx_convert(df_src, df_fx):
    df_src['Shipping Date'] = df_src['Shipping Date'].astype("string")
    df_src['Shipping Date'] = pd.to_datetime(df_src['Shipping Date'], format="%d-%m-%Y")
    df_out = pd.merge(df_src, df_fx, how='left', left_on='Shipping Date', right_on=df_fx.index)
    df_out.rename(columns = {'SEK':'EUR to SEK'}, inplace = True)
    df_out['Net (SEK)'] = df_out['Net (EUR)'].astype('float').multiply(df_out['EUR to SEK'].astype('float'), axis='index')    
    return df_out

def src_transform(df_src):
    df_src['Mode of Transport'] = [src_return_mot(mode) for mode in df_src['Mode of Transport']] 
    df_src['Mass (KG)'] = [float(g)*1000 for g in df_src['Mass (grams)']] 
    df_src["County of Origin"] = 'CN'
    df_src = df_src.drop(['Description_x', 'Mass (grams)', 'Shipping Date', 'Ship From', 'Incoterms', 'Transaction','CN8','SU', 'Description_y', 'CC Check', 'Net (EUR)', 'EUR to SEK' ], axis = 1)
    df_src = df_src[['Ship To', 'Commodity Code','Net (SEK)', 'Quantity', 'Mass (KG)', 'County of Origin', 'Mode of Transport', 'Partner VAT'  ]]    
    return df_src

def src_export(file_name, df):
    # Export to excel file 
    df.iloc[:,0] = df.iloc[:,0].astype("str")
    df.to_excel(file_name, index=False)

def src_cfte_process(df_src, df_cc, df_fx, file_name):
    df_check = src_checks(df_src, df_cc)
    df_fx_convert = src_fx_convert(df_check, df_fx)
    df_transform = src_transform(df_fx_convert)
    src_export(file_name, df_transform)
    display(df_transform)

In [76]:
def main():
    
    # Define variables.
    cc_url = 'https://www.cbs.nl/-/media/cbsvooruwbedrijf/international-trade-in-goods/commoditycodes-2023.xlsx'
    cc_column_rename = 'CN8'
    cc_file_name = 'CN8 Codes.csv'
    
    fx_url = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist-90d.xml'
    xml_namespaces = {'ex': 'http://www.ecb.int/vocabulary/2002-08-01/eurofxref'}
    fx_column_names = ['Date', 'Currency', 'Rate']
    fx_file_name = 'ECB FX Rates.csv'
    
    output_file_name = 'Intrastat Submission Sample.xlsx'
    
    #Read in intrastat data.  
    df_source = pd.read_excel('Intrastat Dispatches Data Sample.xlsx', dtype = str)
    
    #Read in URL resources to assist analysis of source data. Export local copies.
    df_cn8_codes = cc_rte_process(cc_url, cc_column_rename, cc_file_name)
    df_fx_rates = fx_rte_process(fx_url, xml_namespaces, fx_column_names, fx_file_name)
    
    #Run intrastat data check, fx conversion, transformation and export process.
    src_cfte_process(df_source, df_cn8_codes, df_fx_rates, output_file_name)
    
# Define main as program entry point if script is running as standalone and not as module.
if __name__=="__main__":
    main()


Message: Requested commodity code url is valid.
Message: Requested fx rate url is valid.


Unnamed: 0,Ship To,Commodity Code,Net (SEK),Quantity,Mass (KG),County of Origin,Mode of Transport,Partner VAT
0,DE,61012010,2121.407,1,595.0,CN,2,QV999999999999
1,NL,61012090,1668.27,1,678.0,CN,1,NL999999999999
2,ES,61013010,1957.34,1,704.0,CN,3,QV999999999999
3,FR,61019080,1519.83,1,844.0,CN,4,FR999999999999
4,NL,61021010,1619.1135,1,461.0,CN,3,NL999999999999
5,BE,61021090,1782.88,1,589.0,CN,4,BE999999999999
6,PT,61022090,1733.644,1,533.0,CN,1,QV999999999999
7,IT,61029010,2026.44,1,406.0,CN,2,QV999999999999
