# Data ETL Pipeline for the Concentration of Banking Project

## Importing key packages

### Importation Summary
1. Web Scraping: 
<span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">bs4</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">re</span>

2. Parsing: <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">lxml</span> +  <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">BeautifulSoup</span>

3. Data Processing: <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">pandas</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">numpy</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">os</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">datetime</span>

3. File Upload: <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">json</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">base64</span> + <span style="background-color:rgb(79, 78, 78); padding: 2px 5px; border-radius: 3px;">requests</span>





In [None]:
import requests #Handles HTTP requests to fetch web contents from the federal reserve
from bs4 import BeautifulSoup #Parses HTML content for web web scraping. It extracts the quarterly bank data tables from the downloaded HTML pages. 
import re #performs string pattern and substitution. It sanitizes filenames
from datetime import datetime #Manages date/time operations to generated quarterly ranges and formats timestamps for filenames. 
import os #Interfaces with the operating system. 
import pandas as pd #Data manipulation and analysis from the dataframes converted from the HTML file tables. Cleans and transforms data. Converts dataframes into csv files. 
import lxml #Optimal parser for beautiful soup. 
import numpy as np #Numerical computing and NaN handling. 
import json #Serializes and deserializes Python objects to JSON format. Formats payloads for Github API requests during file uploads. 
import base64 #Converts file content to base64 for Github API uploads.


In [3]:
#Setting the relative paths to save HTML and CSV files
html_dir="./html_files"
csv_dir="./csv_files"

## Webscraping & File Conversion

### Quarterly dates generation function

In [4]:
"Develops a list of quarterly end dates from a given start year up to the current year."
def generate_quarterly_dates(start_year):
    """Generates a list of quarterly end dates from a given start year up to the current year."""
    end_year = datetime.now().year  # Current year
    quarters = ["March 31,", "June 30,", "September 30,", "December 31,"]
    formatted_dates = []
    for year in range(start_year, end_year + 1):
        for quarter in quarters:
            formatted_dates.append(quarter + " " + str(year))
    return formatted_dates

generate_quarterly_dates(2003)

['March 31, 2003',
 'June 30, 2003',
 'September 30, 2003',
 'December 31, 2003',
 'March 31, 2004',
 'June 30, 2004',
 'September 30, 2004',
 'December 31, 2004',
 'March 31, 2005',
 'June 30, 2005',
 'September 30, 2005',
 'December 31, 2005',
 'March 31, 2006',
 'June 30, 2006',
 'September 30, 2006',
 'December 31, 2006',
 'March 31, 2007',
 'June 30, 2007',
 'September 30, 2007',
 'December 31, 2007',
 'March 31, 2008',
 'June 30, 2008',
 'September 30, 2008',
 'December 31, 2008',
 'March 31, 2009',
 'June 30, 2009',
 'September 30, 2009',
 'December 31, 2009',
 'March 31, 2010',
 'June 30, 2010',
 'September 30, 2010',
 'December 31, 2010',
 'March 31, 2011',
 'June 30, 2011',
 'September 30, 2011',
 'December 31, 2011',
 'March 31, 2012',
 'June 30, 2012',
 'September 30, 2012',
 'December 31, 2012',
 'March 31, 2013',
 'June 30, 2013',
 'September 30, 2013',
 'December 31, 2013',
 'March 31, 2014',
 'June 30, 2014',
 'September 30, 2014',
 'December 31, 2014',
 'March 31, 2015

### Date to quarter conversion function

In [5]:
'''Converts a date to a quarter.'''
def date_to_quarter(month, day):
    if month == 3 and day == 31:
        return 'Q1'
    elif month == 6 and day == 30:
        return 'Q2'
    elif month == 9 and day == 30:
        return 'Q3'
    elif month == 12 and day == 31:
        return 'Q4'
    else:
        return None

#Set example date by picking March 31
date_to_quarter(3, 31)

'Q1'

### Webscrape, update and rename files function
Webscrapes the Federal Reserve website for large commercial bank data, updates the files, 
and renames them according to a specified format.

In [6]:
"""Downloads data, uploads to local directory, and renames files according to a specified format."""
def update_and_rename_files():
    """Downloads data, uploads to local directory, and renames files according to a specified format."""
    URL = "https://www.federalreserve.gov/releases/lbr/" #URL of the Federal Reserve website
    quarterly_dates = generate_quarterly_dates(2003) #Recalling the function that generates a list of quarterly end dates
    response = requests.get(URL) #Sending a GET request to the URL
    soup = BeautifulSoup(response.text, "html.parser") #Parsing the HTML content
    all_links = soup.find_all('a', href=True) #Finding all 'a' elements with 'href' attribute
    links = [(link['href'], link.text) for link in all_links if any(date in link.text for date in quarterly_dates)] #Extracting 'href' and 'text' attributes from 'a' elements

    for href, date in links: #Looping through the extracted 'href' and 'text' attributes
        full_url = URL + href #Constructing the full URL
        table_response = requests.get(full_url) #Sending a GET request to the full URL
        filename = re.sub(r'\W+', '_', href) + ".html" #Sanitizing the filename
        date_obj = datetime.strptime(date, "%B %d, %Y") #Converting the 'text' attribute to a datetime object
        datenum = date_obj.strftime("%Y%m%d") #Converting the datetime object to a string
        quarter = date_to_quarter(date_obj.month, date_obj.day) #Calling the function that converts a date to a quarter
        new_filename = f'{datenum}_{quarter}_{date_obj.year}_large_commercial_banks.html' #Constructing the new filename
        current_path = os.path.join(html_dir, filename) # Path to current file
        new_path = os.path.join(html_dir, new_filename) # Path to new file

        #Write the content of the respons to a file
        with open(new_path, 'w', encoding='utf-8') as f:
            f.write(table_response.text)

update_and_rename_files()

In [7]:
html_dir.replace('/', '\\')

'.\\html_files'

### Convert HTML file to CSV 

In [8]:
" Reads a specific table from an HTML file."
def read_specific_table(html_file): 
    with open(html_file, 'r', encoding='utf8') as file:  # It's a good practice to define encoding
        contents = file.read()

        # Use BeautifulSoup to parse the HTML content
        soup = BeautifulSoup(contents, 'html.parser')  # You can switch back to 'lxml' or use 'html.parser'

        '''Create OR statements to account for differences in the attributes in the html code for the html files'''
        def matches_table_attribute(tag):
            is_table = tag.name == 'table'
            cellpadding_match = tag.get('cellpadding') in ['1', '7']
            border_match = tag.get('border') in ['1', '1px']
            frame_match = tag.get('frame') in ['BOX', 'box']
            return is_table and cellpadding_match and border_match and frame_match 
        
        #Find all tables that match our custom criteria
        tables=soup.find_all(matches_table_attribute)

        #Return the first matching table
        if tables:
            #Parse the table with Pandas
            df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table
            return df
        else:
            #Handle the case where no table is found
            print(f"No table found in file: {html_file}")
            return None

for file in os.listdir(html_dir): 
    if file.endswith('.html'):
        #Read the aspecific table from the HTML file
        df=read_specific_table(os.path.join(html_dir,file))

        #Write the DataFrame to a CSV file
        csv_file=os.path.splitext(file)[0]+'.csv' #[0] is the file name without the extension
        #Save the csv file in the csv directory
        df.to_csv(os.path.join(csv_dir,csv_file),index=False)
        print(f"File {csv_file} has been saved.")  

  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20030930_Q3_2003_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20031231_Q4_2003_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20040331_Q1_2004_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20040630_Q2_2004_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20040930_Q3_2004_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20041231_Q4_2004_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20050331_Q1_2005_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20050630_Q2_2005_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20050930_Q3_2005_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20051231_Q4_2005_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20060331_Q1_2006_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20060630_Q2_2006_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20060930_Q3_2006_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20061231_Q4_2006_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20070331_Q1_2007_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20070630_Q2_2007_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20070930_Q3_2007_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20071231_Q4_2007_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20080331_Q1_2008_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20080630_Q2_2008_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20080930_Q3_2008_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20081231_Q4_2008_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20090331_Q1_2009_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20090630_Q2_2009_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20090930_Q3_2009_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20091231_Q4_2009_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20100331_Q1_2010_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20100630_Q2_2010_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20100930_Q3_2010_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20101231_Q4_2010_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20110331_Q1_2011_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20110630_Q2_2011_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20110930_Q3_2011_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20111231_Q4_2011_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20120331_Q1_2012_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20120630_Q2_2012_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20120930_Q3_2012_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20121231_Q4_2012_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20130331_Q1_2013_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20130630_Q2_2013_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20130930_Q3_2013_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20131231_Q4_2013_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20140331_Q1_2014_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20140630_Q2_2014_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20140930_Q3_2014_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20141231_Q4_2014_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20150331_Q1_2015_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20150630_Q2_2015_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20150930_Q3_2015_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20151231_Q4_2015_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20160331_Q1_2016_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20160630_Q2_2016_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20160930_Q3_2016_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20161231_Q4_2016_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20170331_Q1_2017_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20170630_Q2_2017_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20170930_Q3_2017_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20171231_Q4_2017_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20180331_Q1_2018_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20180630_Q2_2018_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20180930_Q3_2018_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20181231_Q4_2018_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20190331_Q1_2019_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20190630_Q2_2019_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20190930_Q3_2019_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20191231_Q4_2019_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20200331_Q1_2020_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20200630_Q2_2020_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20200930_Q3_2020_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20201231_Q4_2020_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20210331_Q1_2021_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20210630_Q2_2021_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20210930_Q3_2021_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20211231_Q4_2021_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20220331_Q1_2022_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20220630_Q2_2022_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20220930_Q3_2022_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20221231_Q4_2022_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20230331_Q1_2023_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20230630_Q2_2023_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20230930_Q3_2023_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20231231_Q4_2023_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20240331_Q1_2024_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20240630_Q2_2024_large_commercial_banks.csv has been saved.


  df=pd.read_html(str(tables[0]))[0] #Read the table into a dataframe. The [0] is to get the first table


File 20240930_Q3_2024_large_commercial_banks.csv has been saved.


## Data Cleaning

### Standardization of columns and filling NaN

In [9]:
#Defining the path to the csv files
csv_files=os.listdir(csv_dir) #List of all the csv files in the directory

#Define the uniform column names
uniform_columns= [
    'Name',
    'Natl Rank',
    'Bank ID',
    'Bank Location',
    'Charter',
    'Consolidated Assets',
    'Domestic Assets',
    'Percentage Domestic Assets',
    'Percentage Cumulative Assets',
    'Domestic Branches',
    'Foreign Branches',
    'IBF',
    'Percentage Foreign Owned',
]

#Loop through each CSV file and append to the uniform_columns
for csv_file in csv_files:
    try:
        #construct the full path to the CSV file
        csv_path=os.path.join(csv_dir,csv_file)

        #Read the CSV file into a Dataframe
        df=pd.read_csv(csv_path)

        #Check if the DataFrame has the same number of columns as 'uniform_columns'
        if len(df.columns) < len(uniform_columns):
            #Add the missing columns and fill them with NaN
            missing_cols=len(uniform_columns)-len(df.columns)
            for _ in range(missing_cols):
                df[uniform_columns[-missing_cols]]=np.nan

        #Read the columns
        df.columns=uniform_columns

        #Write the DataFrame back to the CSV file
        df.to_csv(csv_path,index=False)

        print(f"Successfully converted {csv_file}")
    except Exception as e:
        print(f"Error converting {csv_file}: {e}")

Successfully converted 20030930_Q3_2003_large_commercial_banks.csv
Successfully converted 20031231_Q4_2003_large_commercial_banks.csv
Successfully converted 20040331_Q1_2004_large_commercial_banks.csv
Successfully converted 20040630_Q2_2004_large_commercial_banks.csv
Successfully converted 20040930_Q3_2004_large_commercial_banks.csv
Successfully converted 20041231_Q4_2004_large_commercial_banks.csv
Successfully converted 20050331_Q1_2005_large_commercial_banks.csv
Successfully converted 20050630_Q2_2005_large_commercial_banks.csv
Successfully converted 20050930_Q3_2005_large_commercial_banks.csv
Successfully converted 20051231_Q4_2005_large_commercial_banks.csv
Successfully converted 20060331_Q1_2006_large_commercial_banks.csv
Successfully converted 20060630_Q2_2006_large_commercial_banks.csv
Successfully converted 20060930_Q3_2006_large_commercial_banks.csv
Successfully converted 20061231_Q4_2006_large_commercial_banks.csv
Successfully converted 20070331_Q1_2007_large_commercial_banks

### Setting datatypes

In [10]:
'''Change datatypes for columns and add new columns'''
#Define the columns to covnert the integer
integer_columns=['Natl Rank', 'Consolidated Assets', 'Domestic Assets', 'Domestic Branches', 'Foreign Branches']

#Define the columns to convert to floats
float_columns=['Percentage Domestic Assets', 'Percentage Cumulative Assets', 'Percentage Foreign Owned']

#Loop through each CSV file and convert the columns to the correct data types
for csv_file in csv_files:
    try: #Error handling
        print(f"Processing file: {csv_file}")
        csv_path=os.path.join(csv_dir,csv_file)
        df=pd.read_csv(csv_path)

        #Convert to float columns
        for col in integer_columns:
            df[col]=pd.to_numeric(df[col],errors='coerce').fillna(0).astype(float)

        #Mutiply 'Consolidated Assets' and 'Domestic Assets' by 1,000,000
        df['Consolidated Assets']=df['Consolidated Assets']*1000000 #*1000000 use to multiply the 'Consolidated Assets', 'Domestic Assets' columns
        df['Domestic Assets']=df['Domestic Assets']*1000000 #*1000000 use to multiply the 'Consolidated Assets', 'Domestic Assets' columns

        #Convert float columns and divide by 100 to get decimal representation
        for col in float_columns:
            df[col]=pd.to_numeric(df[col],errors='coerce').div(100) #.div(100) use to divide 'Percentage Domestic Assets', 'Percentage Cumulative Assets', 'Percentage Foreign Owned'columns

        #Save the cleaned DataFrame back to the CSV file
        df.to_csv(csv_path,index=False)

        print(f"Successfully cleaned file: {csv_file}")
    except Exception as e:
        print(f"Error processing file: {csv_file}: {e}")


Processing file: 20030930_Q3_2003_large_commercial_banks.csv
Successfully cleaned file: 20030930_Q3_2003_large_commercial_banks.csv
Processing file: 20031231_Q4_2003_large_commercial_banks.csv
Successfully cleaned file: 20031231_Q4_2003_large_commercial_banks.csv
Processing file: 20040331_Q1_2004_large_commercial_banks.csv
Successfully cleaned file: 20040331_Q1_2004_large_commercial_banks.csv
Processing file: 20040630_Q2_2004_large_commercial_banks.csv
Successfully cleaned file: 20040630_Q2_2004_large_commercial_banks.csv
Processing file: 20040930_Q3_2004_large_commercial_banks.csv
Successfully cleaned file: 20040930_Q3_2004_large_commercial_banks.csv
Processing file: 20041231_Q4_2004_large_commercial_banks.csv
Successfully cleaned file: 20041231_Q4_2004_large_commercial_banks.csv
Processing file: 20050331_Q1_2005_large_commercial_banks.csv
Successfully cleaned file: 20050331_Q1_2005_large_commercial_banks.csv
Processing file: 20050630_Q2_2005_large_commercial_banks.csv
Successfully cl

## Data Transformation

### Creating New Columns- Percentage Column
Create another column in each one of the csv files that would allow me to see the percentage of the consolidated assets of each bank compared to the sum of the consolidated assets of all banks

In [11]:
#Loop through all the CSV files to create a percentage column based on the consolidated assets:
for csv_file in csv_files:
    csv_path=os.path.join(csv_dir,csv_file)
    df=pd.read_csv(csv_path)

    #Calculate the sum of consolidated assets for each bank
    total_assets=df['Consolidated Assets'].sum()

    #Calculate the percentage of consolidated assets for each bank
    df['Percentage of Total Cosolidated Assets']=(df['Consolidated Assets']/total_assets)

    #Round the percentage to 4 decimal places
    df['Percentage of Total Cosolidated Assets']=df['Percentage of Total Cosolidated Assets'].round(4)

    #Convert 'Percentage of Total Cosolidated Assets' column into a float column
    df['Percentage of Total Cosolidated Assets']=df['Percentage of Total Cosolidated Assets'].astype(float)

    # Save the updated dataframe back to the CSV file
    df.to_csv(csv_path,index=False)

### Creating New Columns- Date and Quarter
Crete date and quarter column in each one of the csv files
based on the file name format in which '20170331_Q1_2017_large_commercial_banks.csv' 
stands for ''yyyymmdd_Qx_yyyy_large_commercial_banks.csv'. 

The date column should be in the format 'mm/dd/yyyy' (in date data type fomat) and the quarter column 
should be in the format 'Qx-yyyy' format in date data type fomat.



In [12]:
#Loop through all the CSV files to create the date and quarter columns
for csv_file in csv_files:
    csv_path=os.path.join(csv_dir,csv_file)
    df=pd.read_csv(csv_path)

    #Extract the date from the file name
    date_str=csv_file.split('_')[0] #split('_')[0] use to extract the date from the file name. The [0] is the first element in the list
    date=pd.to_datetime(date_str,format='%Y%m%d')
    df['Date']=date

    #Extract the quarter from the file name
    quarter_str=csv_file.split('_')[1] #split('_')[1] use to extract the quarter from the file name. The [1] is the second element in the list
    year_str=csv_file.split('_')[2] #split('_')[2] use to extract the year from the file name. The [2] is the third element in the list
    quarter=f"{quarter_str}-{year_str}" #f"{quarter_str}-{year_str}" use to format the quarter and year
    df['Quarter']=quarter #Add the quarter column to the DataFrame

    #Save the updated DataFrame back to the CSV file
    df.to_csv(csv_path,index=False)

    print(f"Successfully processed file: {csv_file}")

Successfully processed file: 20030930_Q3_2003_large_commercial_banks.csv
Successfully processed file: 20031231_Q4_2003_large_commercial_banks.csv
Successfully processed file: 20040331_Q1_2004_large_commercial_banks.csv
Successfully processed file: 20040630_Q2_2004_large_commercial_banks.csv
Successfully processed file: 20040930_Q3_2004_large_commercial_banks.csv
Successfully processed file: 20041231_Q4_2004_large_commercial_banks.csv
Successfully processed file: 20050331_Q1_2005_large_commercial_banks.csv
Successfully processed file: 20050630_Q2_2005_large_commercial_banks.csv
Successfully processed file: 20050930_Q3_2005_large_commercial_banks.csv
Successfully processed file: 20051231_Q4_2005_large_commercial_banks.csv
Successfully processed file: 20060331_Q1_2006_large_commercial_banks.csv
Successfully processed file: 20060630_Q2_2006_large_commercial_banks.csv
Successfully processed file: 20060930_Q3_2006_large_commercial_banks.csv
Successfully processed file: 20061231_Q4_2006_large

## Data Wrangling

In [13]:
#Define the path to save the plot ready dataframes in the form of csv files
dataset_dir='./datasets'

### Master dataset
Function reads multiple CSV files and concatenates them into a single DataFrame.
 * It sorts the DataFrame by 'Date' in the descending order.
 * It groups the DataFrame by 'Bank ID' and gets the first 'Name' for each group.
 * It maps the bank_names_series to the 'Bank ID' column in the concatenated_df.
 * It sums the consolidated assets of all banks per quarter.
 * It turns 'total_assets_per_quarter' into a dataframe.
 * It saves the concatenated_df dataframe to a CSV file.

In [14]:
def process_data(csv_files, csv_dir):
    # Create an empty list to store the dataframes
    dfs = []

    # Iterate over each CSV file
    for file in csv_files:
        # Read the CSV file into a dataframe
        df = pd.read_csv(os.path.join(csv_dir, file))
        # Append the dataframe to the list
        dfs.append(df)

    # Concatenate all dataframes into a single dataframe
    concatenated_df = pd.concat(dfs)

    # Sort the dataframe by 'Date' in the descending order
    concatenated_df = concatenated_df.sort_values('Date', ascending=False)

    # Group by 'Bank ID' and get the first 'Name' for each group
    bank_names_series = concatenated_df.groupby(by='Bank ID')['Name'].first()

    # Map the bank_names_series to the 'Bank ID' column in the concatenated_df
    concatenated_df['Bank Name'] = concatenated_df['Bank ID'].map(bank_names_series)

    # Sum the consolidated assets of all banks per quarter
    total_assets_per_quarter = concatenated_df.groupby('Quarter')['Consolidated Assets'].sum()

    # Turn 'total_assets_per_quarter' into a dataframe
    total_assets_per_quarter = total_assets_per_quarter.to_frame()

    #Save the concatenated_df dataframe to a CSV file
    concatenated_df.to_csv(os.path.join(dataset_dir, 'concatenated_df.csv'), index=False)

    return concatenated_df, total_assets_per_quarter


    # Call the process_data function
concatenated_df, total_assets_per_quarter = process_data(csv_files, csv_dir)

In [15]:
total_assets_per_quarter

Unnamed: 0_level_0,Consolidated Assets
Quarter,Unnamed: 1_level_1
Q1-2004,6.982157e+12
Q1-2005,7.739072e+12
Q1-2006,8.473557e+12
Q1-2007,9.222430e+12
Q1-2008,1.053344e+13
...,...
Q4-2019,1.691051e+13
Q4-2020,1.991083e+13
Q4-2021,2.157899e+13
Q4-2022,2.169659e+13


### Creation of the Dataframe for the line plot

Function creates a pivot table of the concatenated_df dataframe to set 'Date' as the index, 'Commercial Name' as the columns, and 'Total Assets' as the values.
 * It adds 'Total Assets' column to the big_four_pivot dataframe.
 * It sums the consolidated assets of the Big Four banks for each quarter and adds the result to the 'big_four_pivot' dataframe as 'Big Four Assets'.
 * It adds 'Other Banks' column to the 'big_four_pivot' dataframe by subtracting 'Total Assets' from 'Big Four Assets'.
 * It adds 'Share of Consolidated Assets' column to the 'big_four_pivot' dataframe by dividing 'Big Four Assets' by 'Total Assets'.
 * It saves the 'big_four_pivot' dataframe to a CSV file.

In [16]:
def create_big_four_line(concatenated_df, total_assets_per_quarter):
    big_four_banks = {
        'JPMORGAN CHASE BK NA/JPMORGAN CHASE & CO': 852218,
        'BANK OF AMER NA/BANK OF AMER CORP': 480228,
        'CITIBANK NA/CITIGROUP': 476810,
        'WELLS FARGO BK NA/WELLS FARGO & CO': 451965
    }

    # Filter the dataframe to only include the Big Four banks
    big_four_df = concatenated_df[concatenated_df['Bank ID'].isin(big_four_banks.values())]

    # Create a dictionary to map the Big Four banks to their commercial names
    big_four_commercial_names = {
        852218: 'Chase',
        480228: 'Bank of America',
        476810: 'Citibank',
        451965: 'Wells Fargo'
    }

    # Add a 'Commercial Name' column to the big_four_df dataframe
    big_four_df['Commercial Name'] = big_four_df['Bank ID'].map(big_four_commercial_names)

    # Pivot the big_four_df dataframe to set 'Date' as the index, 'Commercial Name' as the columns, and 'Total Assets' as the values
    big_four_pivot = big_four_df.pivot_table(index=['Date', 'Quarter'], columns='Commercial Name', values='Consolidated Assets').reset_index()

    # Add 'Total Assets' column to the big_four_pivot dataframe
    big_four_pivot['Total Assets'] = big_four_pivot['Quarter'].map(total_assets_per_quarter['Consolidated Assets'])

    # Sum the consolidated assets of the Big Four banks for each quarter and add the result to the 'big_four_pivot' dataframe as 'Big Four Assets'
    big_four_pivot['Big Four Assets'] = big_four_pivot['Chase'] + big_four_pivot['Bank of America'] + big_four_pivot['Citibank'] + big_four_pivot['Wells Fargo']

    # Divide the 'Total Assets' column by the 'Big Four Assets' column to get the percentage of the total assets held by the Big Four banks and add the result to the 'big_four_pivot' dataframe as 'Percentage of Total Assets'
    big_four_pivot['Share of Consolidated Assets'] = big_four_pivot['Big Four Assets'] / big_four_pivot['Total Assets']

    # Display the 'Share of Consolidated Assets' column in percentage format with two decimal places
    big_four_pivot['Share of Consolidated Assets'] = big_four_pivot['Share of Consolidated Assets'].map(lambda x: ' {:.2%}'.format(x))

    # Save the big_four_pivot DataFrame to a CSV file
    big_four_pivot.to_csv(os.path.join(dataset_dir, 'bank_asset_line.csv'), index=False)

    #big_four_pivot.to_csv(os.path.join(csv_dir, 'line_plot.csv'), index=False)

    return big_four_pivot

# Call the create_big_four_line function

big_four_pivot = create_big_four_line(concatenated_df, total_assets_per_quarter)
big_four_pivot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_four_df['Commercial Name'] = big_four_df['Bank ID'].map(big_four_commercial_names)


Commercial Name,Date,Quarter,Bank of America,Chase,Citibank,Wells Fargo,Total Assets,Big Four Assets,Share of Consolidated Assets
0,2003-09-30,Q3-2003,6.247230e+11,6.381200e+11,5.545400e+11,2.243760e+11,6.664665e+12,2.041759e+12,30.64%
1,2003-12-31,Q4-2003,6.179620e+11,6.286620e+11,5.821230e+11,2.504740e+11,6.773933e+12,2.079221e+12,30.69%
2,2004-03-31,Q1-2004,6.905730e+11,6.486920e+11,6.061910e+11,3.475600e+11,6.982157e+12,2.293016e+12,32.84%
3,2004-06-30,Q2-2004,7.068880e+11,6.546410e+11,6.482430e+11,3.646980e+11,7.222725e+12,2.374470e+12,32.87%
4,2004-09-30,Q3-2004,7.406950e+11,6.617720e+11,6.513450e+11,3.629730e+11,7.411606e+12,2.416785e+12,32.61%
...,...,...,...,...,...,...,...,...,...
80,2023-09-30,Q3-2023,2.465234e+12,3.385581e+12,1.657372e+12,1.704891e+12,2.159586e+13,9.213078e+12,42.66%
81,2023-12-31,Q4-2023,2.540116e+12,3.395126e+12,1.684710e+12,1.733244e+12,2.184642e+13,9.353196e+12,42.81%
82,2024-03-31,Q1-2024,2.550363e+12,3.503360e+12,1.698856e+12,1.743283e+12,2.216126e+13,9.495862e+12,42.85%
83,2024-06-30,Q2-2024,2.550584e+12,3.510536e+12,1.678936e+12,1.719839e+12,2.209674e+13,9.459895e+12,42.81%


### Creation of the dataframe for the scatter plot

 * The function 'create_melted_scatter_df' melts the 'big_four_pivot' dataframe to create a new dataframe called 'melted_line_df'.
 * The 'melted_line_df' dataframe is then saved to a CSV file in the 'dataset_dir' directory.
 * It also defines a function 'quarter_to_num' to convert the 'Quarter' column to a numerical format.

In [17]:
def create_melted_scatter_df(big_four_pivot, dataset_dir):
    # Melt the DataFrame
    melted_line_df = big_four_pivot.melt(id_vars=['Quarter','Date'], value_vars=['Chase', 'Bank of America', 'Citibank', 'Wells Fargo'], var_name='Bank', value_name='Assets')

    # Define function to convert the quarter string to a numerical format
    def quarter_to_num(quarter_string):
        # Split the quarter string into year and quarter parts
        parts = quarter_string.split('-')
        year = int(parts[1])
        quarter = parts[0]

        # Map the quarter part to a fraction of the year
        quarter_mapping = {'q1': 0.25, 'q2': 0.5, 'q3': 0.75, 'q4': 1.0}
        numerical_quarter = year + quarter_mapping[quarter.lower()]

        return numerical_quarter

    # Apply the quarter_to_num function to the 'Quarter' column
    melted_line_df['Numeric_Quarter'] = melted_line_df['Quarter'].apply(quarter_to_num)

    # Create the 'Quarter_Ordinal' column
    melted_line_df['Quarter_Ordinal'] = melted_line_df['Numeric_Quarter'].rank(method='dense').astype(int)

    # Save melted_line_df to a CSV file to the dataset_dir
    melted_line_df.to_csv(os.path.join(dataset_dir, 'bank_asset_scatter.csv'))


    return melted_line_df

# Call the create_melted_scatter_df function
create_melted_scatter_df(big_four_pivot, dataset_dir)

Unnamed: 0,Quarter,Date,Bank,Assets,Numeric_Quarter,Quarter_Ordinal
0,Q3-2003,2003-09-30,Chase,6.381200e+11,2003.75,1
1,Q4-2003,2003-12-31,Chase,6.286620e+11,2004.00,2
2,Q1-2004,2004-03-31,Chase,6.486920e+11,2004.25,3
3,Q2-2004,2004-06-30,Chase,6.546410e+11,2004.50,4
4,Q3-2004,2004-09-30,Chase,6.617720e+11,2004.75,5
...,...,...,...,...,...,...
335,Q3-2023,2023-09-30,Wells Fargo,1.704891e+12,2023.75,81
336,Q4-2023,2023-12-31,Wells Fargo,1.733244e+12,2024.00,82
337,Q1-2024,2024-03-31,Wells Fargo,1.743283e+12,2024.25,83
338,Q2-2024,2024-06-30,Wells Fargo,1.719839e+12,2024.50,84


### Creation of the dataframe for the racing pie chart

'transform_and_save' function transforms the 'big_four_pivot' dataframe to calculate the percentage share of each bank's assets and the percentage of the total assets held by the rest of the other banks.
It then saves the 'percentage_df' dataframe to a Excel file in the 'dataset_dir' directory.

In [18]:
def transform_and_save(big_four_pivot, dataset_dir):
    # Create a copy of the original dataframe
    percentage_df = big_four_pivot.copy()

    columns_to_convert = ['Bank of America', 'Chase', 'Citibank', 'Wells Fargo']

    # List of columns to drop
    columns_to_drop = ['Quarter', 'Total Assets', 'Big Four Assets', 'Share of Consolidated Assets']

    for column in columns_to_convert:
        # Calculate the percentage share of each bank's assets
        percentage_df[column] = percentage_df[column] / percentage_df['Total Assets']

        # Add a column that calculates the percentage of the total assets of the rest of the other banks
        percentage_df['Other Banks'] = (percentage_df['Total Assets'] - percentage_df['Big Four Assets']) / percentage_df['Total Assets']

        #Reduce the decimal places to two and maintain the datatype as a float
        percentage_df[column] = percentage_df[column].map(lambda x: round(x, 4))

        percentage_df['Other Banks'] = percentage_df['Other Banks'].map(lambda x: round(x, 4))

        
    # Remove the name of the index
    percentage_df.columns.name = None

    #Set the 'date' column as datetime datatype
    percentage_df['Date'] = pd.to_datetime(percentage_df['Date'])

    # Reset the index without keeping old index
    percentage_df.reset_index(drop=True, inplace=True)

    # Drop the columns in the 'columns_to_drop' list
    percentage_df = percentage_df.drop(columns=columns_to_drop)

    # Save the percentage_df dataframe to a Excel file
    percentage_df.to_excel(os.path.join(dataset_dir, 'bank_asset_percentage.xlsx'), index=False)

    return percentage_df

percentage_df=transform_and_save(big_four_pivot, dataset_dir)
percentage_df

Unnamed: 0,Date,Bank of America,Chase,Citibank,Wells Fargo,Other Banks
0,2003-09-30,0.0937,0.0957,0.0832,0.0337,0.6936
1,2003-12-31,0.0912,0.0928,0.0859,0.0370,0.6931
2,2004-03-31,0.0989,0.0929,0.0868,0.0498,0.6716
3,2004-06-30,0.0979,0.0906,0.0898,0.0505,0.6713
4,2004-09-30,0.0999,0.0893,0.0879,0.0490,0.6739
...,...,...,...,...,...,...
80,2023-09-30,0.1142,0.1568,0.0767,0.0789,0.5734
81,2023-12-31,0.1163,0.1554,0.0771,0.0793,0.5719
82,2024-03-31,0.1151,0.1581,0.0767,0.0787,0.5715
83,2024-06-30,0.1154,0.1589,0.0760,0.0778,0.5719


### Creation of the dataframe for the treemap
The function 'create_treemap_df' creates a new dataframe called 'treemap_df' from the 'percentage_df' dataframe.
It then saves the 'treemap_df' dataframe to a CSV file in the 'dataset_dir' directory.

In [19]:
def create_treemap_df(percentage_df, big_four_pivot, dataset_dir):
    # Create a copy of the original dataframe
    assets_df = big_four_pivot.copy()

    columns_of_interest = ['Bank of America', 'Chase', 'Citibank', 'Wells Fargo']

    # List of columns to drop
    columns_to_drop = ['Quarter', 'Share of Consolidated Assets']

    # Remove the name of the index
    assets_df.columns.name = None

    assets_df['Date'] = pd.to_datetime(assets_df['Date'])

    # Drop the columns in the 'columns_to_drop' list
    assets_df = assets_df.drop(columns=columns_to_drop)

    # Subtract 'Big Four Assets' from 'Total Assets' to get the 'Other Banks' column
    assets_df['Other Banks'] = assets_df['Total Assets'] - assets_df['Big Four Assets']

    # Convert the 'Date' column to datetime
    last_date_index = percentage_df['Date'].idxmax()

    # Select the row with the latest date
    latest_date = percentage_df.loc[last_date_index]

    # Convert the series into a dataframe and transpose it
    treemap_df = latest_date.to_frame()

    # Reset the index for treemap_df
    if not treemap_df.index.equals(pd.RangeIndex(start=0, stop=len(treemap_df))):
        treemap_df = treemap_df.reset_index()

    # Name the columns of the treemap_df dataframe
    treemap_df = treemap_df.rename(columns={treemap_df.columns[0]: 'Bank', treemap_df.columns[1]: 'Percentage'})

    # Save the 'Date' value from the first row as a variable
    date_note = treemap_df.loc[0, 'Percentage']

    # Drop the first row of the dataframe
    treemap_df = treemap_df.drop(0)

    # Convert the Timestamp into a string and slice it to only include the date part
    date_note = str(date_note)[:10]

    # In the 'date_note variable, re-arrange string to place month first, then day, and finally year as in 'mm-dd-yyyy' format
    date_note = date_note[5:7] + '-' + date_note[8:10] + '-' + date_note[:4]

    # Set the name of the Dataframe to the 'date_note'
    treemap_df.name = date_note

    # Convert the 'Percentage' column into % format
    treemap_df['Percentage'] = treemap_df['Percentage'].apply(lambda x: ' {:.2%}'.format(x) if pd.notnull(x) and isinstance(x, (int, float)) else x)

    # Create a new 'Parent' column and place it as the first column in the dataframe
    treemap_df['Parent'] = 'Big Four'

    # For the 5th row in the dataframe, I need to replace the 'Parent' value with 'Other Banks'
    treemap_df.loc[5, 'Parent'] = 'Other Banks'
    # leave the 'Bank' value as NaN
    treemap_df.loc[5, 'Bank'] = np.nan

    # Convert the 'Date' column to datetime
    last_date_assets = assets_df['Date'].idxmax()

    # Select the row with the latest date
    last_date_assets = assets_df.loc[last_date_index]

    # Drop first row of the dataframe
    last_date_assets = last_date_assets.drop('Date')

    # Drop the 'Total Assets' column
    last_date_assets = last_date_assets.drop('Total Assets')

    # Drop the 'Big Four Assets' column
    last_date_assets = last_date_assets.drop('Big Four Assets')

    # Reset the index for last_date_assets
    last_date_assets = last_date_assets.reset_index()

    # Name the first column 'Banks'
    last_date_assets = last_date_assets.rename(columns={last_date_assets.columns[0]: 'Bank'})

    # Rename the second columns as 'Assets'
    last_date_assets = last_date_assets.rename(columns={last_date_assets.columns[1]: 'Consolidated Assets'})

    # Create a variable named 'Total Assets' that has the value of the 'Total Assets' row under 'Consolidated Assets'
    total_assets = last_date_assets.loc[4, 'Consolidated Assets']

    # Map the values of 'last_date_assets; to the 'treemap_df' dataframe based on the 'Bank' column
    treemap_df['Consolidated Assets'] = treemap_df['Bank'].map(last_date_assets.set_index('Bank')['Consolidated Assets'])

    # Rearrange order of the columns for the 'Parent' column to be first after the index
    treemap_df = treemap_df[['Parent', 'Bank', 'Percentage', 'Consolidated Assets']]

    # Assign the 'Total Assets' variable to the 'Consolidated Assets' column for the 'Other Banks' row under the 'Parent' column
    treemap_df.loc[treemap_df['Parent'] == 'Other Banks', 'Consolidated Assets'] = total_assets
     
    #Download the treemap_df dataframe as a csv file
    treemap_df.to_csv(os.path.join(dataset_dir, 'bank_asset_treemap.csv'))

    return treemap_df

treemap_df=create_treemap_df(percentage_df, big_four_pivot, dataset_dir)
treemap_df

Unnamed: 0,Parent,Bank,Percentage,Consolidated Assets
1,Big Four,Bank of America,11.45%,2565878000000.0
2,Big Four,Chase,15.99%,3584105000000.0
3,Big Four,Citibank,7.73%,1733111000000.0
4,Big Four,Wells Fargo,7.58%,1698675000000.0
5,Other Banks,,57.26%,12834615000000.0


## Data Loading

 * 'get_file_sha' function retrieves the SHA of an existing file on GitHub to allow updates.

 * 'upload_to_github' function uploads a file to GitHub.


In [None]:

def get_file_sha(repo, path, filename, token):
    """
    Retrieve the SHA of an existing file in the specified repository/folder.
    """
    url = f"https://api.github.com/repos/{repo}/contents/{path}/{filename}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json().get('sha')
    return None

def upload_to_github(repo, path, token, file_path):
    """
    Uploads (or updates) a single file to the specified repository and path.
    """
    # Use os.path.basename for cross-platform filename extraction
    filename = os.path.basename(file_path)
    file_sha = get_file_sha(repo, path, filename, token)  # Get SHA if file exists
    api_url = f"https://api.github.com/repos/{repo}/contents/{path}/{filename}"
    
    # Read and encode file content
    with open(file_path, 'rb') as file:
        file_content = base64.b64encode(file.read()).decode('utf-8')
    
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
    }
    data = {
        "message": f"Update {filename}",
        "content": file_content,
        "branch": "main",
    }
    if file_sha:  # Include SHA if updating an existing file
        data['sha'] = file_sha
    
    response = requests.put(api_url, headers=headers, data=json.dumps(data))
    print(f"Response Status Code: {response.status_code} for {filename}")
    print("Response:", response.json())

def upload_multiple_files(repo, path, token, file_paths):
    """
    Loops through a list of file paths and uploads each file to the specified repository and path.
    """
    for file_path in file_paths:
        upload_to_github(repo, path, token, file_path)

# Example usage:

# 1. Upload multiple dataset files to one repository:
file_paths = [
    './datasets/bank_asset_line.csv',
    './datasets/bank_asset_scatter.csv',
    './datasets/bank_asset_treemap.csv',
    './datasets/bank_asset_percentage.xlsx',
]
upload_multiple_files('juanchok12/Concentration-of-Banking', 'datasets', 'key_github_here', file_paths)

# 2. Upload the concatenated_df.csv file to a different repository and folder ("deploy"):
# Define the path as a string rather than a list.
concatenated_file_path = './datasets/concatenated_df.csv'
upload_to_github('juanchok12/Consoldiated-Assets-for-Banks-and-AI', 'deploy', 'key_github_here', concatenated_file_path)


Response Status Code: 201 for bank_asset_line.csv
Response: {'content': {'name': 'bank_asset_line.csv', 'path': 'datasets/bank_asset_line.csv', 'sha': '7269af6016d8aaa81193b24fcb8121c3bd151919', 'size': 10669, 'url': 'https://api.github.com/repos/juanchok12/Concentration-of-Banking/contents/datasets/bank_asset_line.csv?ref=main', 'html_url': 'https://github.com/juanchok12/Concentration-of-Banking/blob/main/datasets/bank_asset_line.csv', 'git_url': 'https://api.github.com/repos/juanchok12/Concentration-of-Banking/git/blobs/7269af6016d8aaa81193b24fcb8121c3bd151919', 'download_url': 'https://raw.githubusercontent.com/juanchok12/Concentration-of-Banking/main/datasets/bank_asset_line.csv', 'type': 'file', '_links': {'self': 'https://api.github.com/repos/juanchok12/Concentration-of-Banking/contents/datasets/bank_asset_line.csv?ref=main', 'git': 'https://api.github.com/repos/juanchok12/Concentration-of-Banking/git/blobs/7269af6016d8aaa81193b24fcb8121c3bd151919', 'html': 'https://github.com/ju