## Financial Dataset: FR - Y 9C

### Import Libraries:

In [4]:
import os ## System Library
import pandas as pd
import numpy as np

### Read and combine the txt files:

In [7]:
# Path to the folder containing the text files
folder_path = r"/Users/harshitaagrawal/Desktop/Capstone Project/Financial Data Download 2014-2024"

# List all files in the folder
file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')] ## List Comprehension

# Initialize an empty list to store DataFrames
dataframes = []

# Read each file and append the DataFrame to the list
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, sep='^', header=0, engine='python', encoding='latin1', on_bad_lines='skip')
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined DataFrame
combined_df.head()

Unnamed: 0,RSSD9001,RSSD9999,RSSD9007,RSSD9008,RSSD9132,RSSD9032,RSSD9146,BHBC3368,BHBC3402,BHBC3516,...,BHSPF842,TEXT4769,BHCKLG24,BHCKLG25,BHCKLG26,BHCKLG27,BHCKLG28,BHCKLL57,BHCWKX78,BHCWKX83
0,1020180,20180331,20151231,20230521,551111,9,1.0,,,,...,,,,,,,,,,
1,1020201,20180331,20151231,99991231,551111,7,2.0,,,,...,,,,,,,,,,
2,1020676,20180331,20150906,99991231,551111,7,1.0,,,,...,,,,,,,,,,
3,1020902,20180331,20140805,99991231,551111,10,1.0,,,,...,,,,,,,,,,
4,1022764,20180331,20170517,99991231,551111,12,1.0,,,,...,,,,,,,,,,


### Subset of Columns:

In [12]:
smalldf = combined_df[['RSSD9001', 'RSSD9007', 'RSSD9017', 'RSSD9032', 'RSSD9146', 'RSSD9999', 'BHCK2170', 'BHCK4107', 'BHCK4073', 'BHCK4079', 'BHCK4093']]
smalldf.head()

Unnamed: 0,RSSD9001,RSSD9007,RSSD9017,RSSD9032,RSSD9146,RSSD9999,BHCK2170,BHCK4107,BHCK4073,BHCK4079,BHCK4093
0,1020180,20151231,BREMER FINANCIAL CORPORATION,9,1.0,20180331,11959875.0,108983.0,15686.0,30819.0,79566.0
1,1020201,20151231,HSBC USA INC.,7,2.0,20180331,,,,,
2,1020676,20150906,AMALGAMATED INVESTMENTS COMPANY,7,1.0,20180331,778831.0,6679.0,244.0,4839.0,9376.0
3,1020902,20140805,"FIRST NATIONAL OF NEBRASKA, INC.",10,1.0,20180331,19950623.0,322364.0,24664.0,99888.0,224265.0
4,1022764,20170517,CENTRAL PACIFIC FINANCIAL CORP.,12,1.0,20180331,5651287.0,47310.0,4988.0,8692.0,33259.0


In [16]:
name_dict = {'RSSD9001':'RSSD ID',
             'RSSD9007':'Start Date',
             'RSSD9008':'End Date',
             'RSSD9017':'Firm Legal Name',
             'RSSD9032':'FR District Code',
             'RSSD9146':'Bank Count',
             'RSSD9999':'Reporting Date',
             'BHCK2170':'Total Assets', 
             'BHCK4073':'Interest Expense',
             'BHCK4107':'Interest Income Year-to-Date',
             'BHCK4079':'Non-Interest Income Year-to-Date',
             'BHCK4093':'Non-Interest Expense'}
smalldf = smalldf.rename(columns = name_dict)
smalldf.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense
0,1020180,20151231,BREMER FINANCIAL CORPORATION,9,1.0,20180331,11959875.0,108983.0,15686.0,30819.0,79566.0
1,1020201,20151231,HSBC USA INC.,7,2.0,20180331,,,,,
2,1020676,20150906,AMALGAMATED INVESTMENTS COMPANY,7,1.0,20180331,778831.0,6679.0,244.0,4839.0,9376.0
3,1020902,20140805,"FIRST NATIONAL OF NEBRASKA, INC.",10,1.0,20180331,19950623.0,322364.0,24664.0,99888.0,224265.0
4,1022764,20170517,CENTRAL PACIFIC FINANCIAL CORP.,12,1.0,20180331,5651287.0,47310.0,4988.0,8692.0,33259.0


### Change Date Format:

In [19]:
smalldf['Reporting Date'] = pd.to_datetime(smalldf['Reporting Date'].astype(str), infer_datetime_format=True , utc=True)
smalldf['Start Date'] = pd.to_datetime(smalldf['Start Date'].astype(str), infer_datetime_format=True , utc=True)
smalldf.head()

  smalldf['Reporting Date'] = pd.to_datetime(smalldf['Reporting Date'].astype(str), infer_datetime_format=True , utc=True)
  smalldf['Start Date'] = pd.to_datetime(smalldf['Start Date'].astype(str), infer_datetime_format=True , utc=True)


Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense
0,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-03-31 00:00:00+00:00,11959875.0,108983.0,15686.0,30819.0,79566.0
1,1020201,2015-12-31 00:00:00+00:00,HSBC USA INC.,7,2.0,2018-03-31 00:00:00+00:00,,,,,
2,1020676,2015-09-06 00:00:00+00:00,AMALGAMATED INVESTMENTS COMPANY,7,1.0,2018-03-31 00:00:00+00:00,778831.0,6679.0,244.0,4839.0,9376.0
3,1020902,2014-08-05 00:00:00+00:00,"FIRST NATIONAL OF NEBRASKA, INC.",10,1.0,2018-03-31 00:00:00+00:00,19950623.0,322364.0,24664.0,99888.0,224265.0
4,1022764,2017-05-17 00:00:00+00:00,CENTRAL PACIFIC FINANCIAL CORP.,12,1.0,2018-03-31 00:00:00+00:00,5651287.0,47310.0,4988.0,8692.0,33259.0


In [21]:
smalldf.shape

(101940, 11)

### Define Quarters and Year:

In [26]:
smalldf['Quarter'] = smalldf['Reporting Date'].dt.quarter
smalldf.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense,Quarter
0,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-03-31 00:00:00+00:00,11959875.0,108983.0,15686.0,30819.0,79566.0,1
1,1020201,2015-12-31 00:00:00+00:00,HSBC USA INC.,7,2.0,2018-03-31 00:00:00+00:00,,,,,,1
2,1020676,2015-09-06 00:00:00+00:00,AMALGAMATED INVESTMENTS COMPANY,7,1.0,2018-03-31 00:00:00+00:00,778831.0,6679.0,244.0,4839.0,9376.0,1
3,1020902,2014-08-05 00:00:00+00:00,"FIRST NATIONAL OF NEBRASKA, INC.",10,1.0,2018-03-31 00:00:00+00:00,19950623.0,322364.0,24664.0,99888.0,224265.0,1
4,1022764,2017-05-17 00:00:00+00:00,CENTRAL PACIFIC FINANCIAL CORP.,12,1.0,2018-03-31 00:00:00+00:00,5651287.0,47310.0,4988.0,8692.0,33259.0,1


In [46]:
smalldf['Year'] = smalldf['Reporting Date'].dt.year
smalldf.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense,Quarter,Year
0,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-03-31 00:00:00+00:00,11959875.0,108983.0,15686.0,30819.0,79566.0,1,2018
1,1020201,2015-12-31 00:00:00+00:00,HSBC USA INC.,7,2.0,2018-03-31 00:00:00+00:00,,,,,,1,2018
2,1020676,2015-09-06 00:00:00+00:00,AMALGAMATED INVESTMENTS COMPANY,7,1.0,2018-03-31 00:00:00+00:00,778831.0,6679.0,244.0,4839.0,9376.0,1,2018
3,1020902,2014-08-05 00:00:00+00:00,"FIRST NATIONAL OF NEBRASKA, INC.",10,1.0,2018-03-31 00:00:00+00:00,19950623.0,322364.0,24664.0,99888.0,224265.0,1,2018
4,1022764,2017-05-17 00:00:00+00:00,CENTRAL PACIFIC FINANCIAL CORP.,12,1.0,2018-03-31 00:00:00+00:00,5651287.0,47310.0,4988.0,8692.0,33259.0,1,2018


### Quarterization:

In [48]:
def f(val):
    global last_val
    new_val = val - last_val
    last_val = val
    return new_val

In [50]:
# Quarterization for Interest Income year-to-date

new_dfs = []
for i in smalldf['RSSD ID'].unique():
  new_df_i = smalldf.loc[smalldf['RSSD ID']==i,:]
  for j in smalldf['Year'].unique():
    new_df_i_j = new_df_i.loc[smalldf['Year']==j,:]
    new_df_i_j = new_df_i_j.sort_values('Quarter')
    last_val = 0
    new_df_i_j['Interest Income per Quarter'] = new_df_i_j['Interest Income Year-to-Date'].apply(f)
    new_dfs.append(new_df_i_j)

Quarterized_dfs = pd.concat(new_dfs, ignore_index=True)
Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense,Quarter,Year,Interest Income per Quarter
0,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-03-31 00:00:00+00:00,11959875.0,108983.0,15686.0,30819.0,79566.0,1,2018,108983.0
1,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-06-30 00:00:00+00:00,12077601.0,224623.0,36271.0,60958.0,159789.0,2,2018,115640.0
2,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-09-30 00:00:00+00:00,12119987.0,342266.0,58552.0,91838.0,239242.0,3,2018,117643.0
3,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-12-31 00:00:00+00:00,12192038.0,462684.0,83107.0,124334.0,325762.0,4,2018,120418.0
4,1020180,2014-12-15 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2015-03-31 00:00:00+00:00,9896279.0,87326.0,5714.0,29135.0,70481.0,1,2015,87326.0


### Normalization:

In [54]:
Quarterized_dfs['Normalized Interest Income per Quarter'] = Quarterized_dfs['Interest Income per Quarter']/Quarterized_dfs['Total Assets']
Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense,Quarter,Year,Interest Income per Quarter,Normalized Interest Income per Quarter
0,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-03-31 00:00:00+00:00,11959875.0,108983.0,15686.0,30819.0,79566.0,1,2018,108983.0,0.009112
1,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-06-30 00:00:00+00:00,12077601.0,224623.0,36271.0,60958.0,159789.0,2,2018,115640.0,0.009575
2,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-09-30 00:00:00+00:00,12119987.0,342266.0,58552.0,91838.0,239242.0,3,2018,117643.0,0.009707
3,1020180,2015-12-31 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2018-12-31 00:00:00+00:00,12192038.0,462684.0,83107.0,124334.0,325762.0,4,2018,120418.0,0.009877
4,1020180,2014-12-15 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,1.0,2015-03-31 00:00:00+00:00,9896279.0,87326.0,5714.0,29135.0,70481.0,1,2015,87326.0,0.008824
