# Initialize ECB data lake bronze layer 



In [1]:
# import all dependencies needed for this notebook
import requests
import zipfile
import io
import pandas as pd

StatementMeta(, 0e90b6c4-6885-4ada-b770-100d38748572, 3, Finished, Available)

###### Download the ECB exchange rate zip file directly from the website, unzip it, and load it into a pandas DataFrame

In [2]:
# URL of the ECB historical euro reference-rate zip archive
zip_url = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.zip'

# Download the archive. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() fails fast on an HTTP error instead of
# passing an error response body on to zipfile.
response = requests.get(zip_url, timeout=60)
response.raise_for_status()

# Open the archive from memory and read its CSV member into a pandas DataFrame.
# The context managers close both handles once the frame has been loaded.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    with zip_file.open('eurofxref-hist.csv') as csv_file:
        df = pd.read_csv(csv_file)

# Display the first rows of the DataFrame
df.head()

StatementMeta(, 0e90b6c4-6885-4ada-b770-100d38748572, 4, Finished, Available)

Unnamed: 0,Date,USD,JPY,BGN,CYP,CZK,DKK,EEK,GBP,HUF,...,INR,KRW,MXN,MYR,NZD,PHP,SGD,THB,ZAR,Unnamed: 42
0,2024-04-09,1.0867,164.97,1.9558,,25.38,7.459,,0.85663,389.2,...,90.4135,1470.98,17.7268,5.1591,1.7938,61.301,1.4623,39.529,20.0861,
1,2024-04-08,1.0823,164.43,1.9558,,25.354,7.4588,,0.85795,390.78,...,90.159,1468.37,17.8053,5.1425,1.7984,61.204,1.4607,39.769,20.1574,
2,2024-04-05,1.0841,164.1,1.9558,,25.286,7.459,,0.85773,390.1,...,90.3283,1464.09,17.9104,5.1462,1.8001,61.268,1.4608,39.743,20.171,
3,2024-04-04,1.0852,164.69,1.9558,,25.322,7.4589,,0.85788,391.55,...,90.5116,1462.46,17.9675,5.1433,1.7998,61.243,1.4628,39.881,20.2704,
4,2024-04-03,1.0783,163.66,1.9558,,25.352,7.4589,,0.85713,393.2,...,90.0055,1456.04,17.8782,5.1273,1.8054,60.817,1.4571,39.584,20.2667,


###### Save the raw data as a CSV file in the Files folder

In [11]:
# Convert the raw pandas frame to a Spark DataFrame so it can be written with Spark
df2 = spark.createDataFrame(df)

# coalesce(1) reduces the frame to a single partition before the write;
# "overwrite" replaces whatever a previous run left at the target path.
(
    df2.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv("Files/eurofxref-hist")
)

StatementMeta(, 0e90b6c4-6885-4ada-b770-100d38748572, 13, Finished, Available)

###### Unpivot data and perform data cleanup as needed
###### Usually we leave raw data untouched in the bronze layer, but in this case unpivoting it and performing some basic cleanup here benefits the downstream silver layer

In [4]:
# Columns that pandas auto-named "Unnamed: N" had no header in the CSV and are not
# currencies. Exclude them explicitly before unpivoting rather than relying on
# them being all-NaN and disappearing in the dropna() step.
currency_columns = [
    column
    for column in df.columns
    if column != 'Date' and not str(column).startswith('Unnamed:')
]

# Unpivot to one row per (Date, Currency), drop rows without a rate, then set dtypes:
# Currency -> str, Rate -> float, Date -> datetime.
# Built as a single chain from the raw `df`, so re-running the cell is idempotent.
unpivoted_df = (
    df.melt(
        id_vars=['Date'],
        value_vars=currency_columns,
        var_name='Currency',
        value_name='Rate',
    )
    .dropna(subset=['Rate'])
    .astype({'Currency': str, 'Rate': float})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

unpivoted_df.head()

StatementMeta(, ae387733-4fa6-4049-8285-b4674bfb7518, 6, Finished, Available)

Unnamed: 0,Date,Currency,Rate
0,2024-04-09,USD,1.0867
1,2024-04-08,USD,1.0823
2,2024-04-05,USD,1.0841
3,2024-04-04,USD,1.0852
4,2024-04-03,USD,1.0783


###### Convert unpivoted pandas dataframe to spark df and write to bronze table called bronze_fxdata

In [6]:
# Convert unpivoted pandas dataframe to spark df
# Name of the bronze table; it is also the folder name under Tables/
table_name = "bronze_fxdata"

# Build a Spark DataFrame from the unpivoted pandas frame
spark_df = spark.createDataFrame(unpivoted_df)

# Write in Delta format to the table's path, replacing any existing contents
delta_writer = spark_df.write.format("delta").mode("overwrite")
delta_writer.save(f"Tables/{table_name}")

# Report which table was written
print(f"Spark DataFrame saved to Delta table: {table_name}")

StatementMeta(, ae387733-4fa6-4049-8285-b4674bfb7518, 8, Finished, Available)

Spark DataFrame saved to Delta table: bronze_fxdata
