Import libraries

In [128]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import numpy as np
from datetime import datetime

Set variables

In [129]:
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
csv_file = './Largest_banks_data.csv'

Define the log function

In [130]:
def log_progress(message):
    now = datetime.now().strftime('%Y-%h-%d-%H:%M:%S')
    with open("./code_log.txt","a") as f:
        f.write(now + ' : ' + message + '\n')

Define the extract function
The function receives the url and the columns of the table and returns a dataframe

In [131]:
def extract(url, table_attribs):
    page = requests.get(url).text
    data = BeautifulSoup(page,'html.parser')
    df = pd.DataFrame(columns=table_attribs)
    tables = data.find_all('tbody')
    rows = tables[0].find_all('tr')
    for row in rows:
        col = row.find_all('td')
        if len(col) != 0:
            data_dict = {"Rank" : int(col[0].text.strip('\n')),
                         "Bank Name" : col[1].find_all('a')[1].text,
                         "MC_USD_Billion" : float(col[2].text.strip('\n'))}
            df1 = pd.DataFrame(data_dict, index=[0])
            df = pd.concat([df,df1], ignore_index=True)
    return df


Load exchange rates from CSV file

In [132]:
def load_exchange_rates():
    # load the csv file with the exchange rates and returns a dictionary with the values
    rates = {}
    df = pd.read_csv("./exchange_rate.csv")
    for index, row in df.iterrows():
        rates[row['Currency']] = row['Rate']
    return rates

Transform the data into other currencies

In [133]:
def transform(df):
    df1 = df
    df1['MC_GBP_Billion'] = [np.round(x*rates['GBP'],2) for x in df['MC_USD_Billion']]
    df1['MC_EUR_Billion'] = [np.round(x*rates['EUR'],2) for x in df['MC_USD_Billion']]
    df1['MC_INR_Billion'] = [np.round(x*rates['INR'],2) for x in df['MC_USD_Billion']]
    return df.add(df1)

Save the data to a CSV file

In [135]:
def load_to_csv(df, csv_path):
    df.to_csv(csv_path)

In [134]:
log_progress('ETL progress started')

df = extract(url, ['Rank','Bank Name','MC_USD_Billion'])
log_progress('Data extraction complete')

rates = load_exchange_rates()
log_progress('Exchange Rates loaded')

df = transform(df)
log_progress('Data transformation complete')

load_to_csv(df, csv_file)
log_progress('Dataframe saved to CSV file')

print(df)
print(df['MC_EUR_Billion'][4])

  Rank                                          Bank Name  MC_USD_Billion  \
0    2                       JPMorgan ChaseJPMorgan Chase          865.84   
1    4                     Bank of AmericaBank of America          463.04   
2    6  Industrial and Commercial Bank of ChinaIndustr...          389.12   
3    8  Agricultural Bank of ChinaAgricultural Bank of...          321.36   
4   10                                 HDFC BankHDFC Bank          315.82   
5   12                             Wells FargoWells Fargo          311.74   
6   14                 HSBC Holdings PLCHSBC Holdings PLC          297.80   
7   16                       Morgan StanleyMorgan Stanley          281.66   
8   18     China Construction BankChina Construction Bank          279.64   
9   20                         Bank of ChinaBank of China          273.62   

   MC_GBP_Billion  MC_EUR_Billion  MC_INR_Billion  
0          692.68          805.24        71821.42  
1          370.44          430.62        38409.1