# Capstone Project Final Dataset

## Import packages

In [3]:
import os ## System Library
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib import rc
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Financial dataset

In [4]:
# Path to the folder containing the text files
folder_path = r"C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\Financial Data Download 2004-2024"

# List all files in the folder
file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')] ## List Comprehension

# Initialize an empty list to store DataFrames
dataframes = []

# Read each file and append the DataFrame to the list
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, sep='^', header=0, engine='python', encoding='latin1', on_bad_lines='skip')
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined DataFrame
combined_df.head()

Unnamed: 0,RSSD9001,RSSD9999,RSSD9007,RSSD9008,RSSD9132,RSSD9032,RSSD9146,BHBC3368,BHBC3402,BHBC3516,...,BHCANC99,BHCKFT42,BHCKFT43,BHCKFT44,BHSPFT42,BHSPFT43,BHSPFT44,BHSPMZ36,BHSPNK60,BHCKMG95
0,1020180,20040331,20020401,20040822,551111,9,11.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1020201,20040331,20001028,20051230,551111,2,1.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1020340,20040331,20040301,20040531,551111,7,29.0,0.0,0.0,0.0,...,,,,,,,,,,
3,1020395,20040331,20040211,20081230,551111,6,1.0,,,,...,,,,,,,,,,
4,1020582,20040331,19980101,20060228,551111,7,1.0,,,,...,,,,,,,,,,


### Taking a subset containing only the columns we are interested in

In [5]:
smalldf = combined_df[['RSSD9001', 'RSSD9007', 'RSSD9017', 'RSSD9032', 'RSSD9146', 'RSSD9999', 'BHCK2170', 'BHCK4107', 'BHCK4074', 'BHCK4073', 'BHCK4079', 'BHCK4093']]

Re-nameing the columns

In [6]:
name_dict = {'RSSD9001':'RSSD ID',
             'RSSD9007':'Start Date',
             'RSSD9008':'End Date',
             'RSSD9017':'Firm Legal Name',
             'RSSD9032':'FR District Code',
             'RSSD9146':'Bank Count',
             'RSSD9999':'Reporting Date',
             'BHCK2170':'Total Assets', 
             'BHCK4073':'Interest Expense',
             'BHCK4107':'Interest Income Year-to-Date',
             'BHCK4074':'Net Interest Income',
             'BHCK4079':'Non-Interest Income Year-to-Date',
             'BHCK4093':'Non-Interest Expense'}
smalldf = smalldf.rename(columns = name_dict)

Changing the data format

In [7]:
smalldf['Reporting Date'] = pd.to_datetime(smalldf['Reporting Date'].astype(str), utc=True)
smalldf['Start Date'] = pd.to_datetime(smalldf['Start Date'].astype(str), utc=True)

Defining Quarters and Years

In [8]:
smalldf['Quarter'] = smalldf['Reporting Date'].dt.quarter
smalldf['Year'] = smalldf['Reporting Date'].dt.year

### Quarterizing and Normalizing income data

In [9]:
def f(val):
    global last_val
    new_val = val - last_val
    last_val = val
    return new_val

In [10]:
# Quarterization for Interest Income year-to-date
new_dfs = []
for i in smalldf['RSSD ID'].unique():
  new_df_i = smalldf.loc[smalldf['RSSD ID']==i,:]
  for j in smalldf['Year'].unique():
    new_df_i_j = new_df_i.loc[smalldf['Year']==j,:]
    new_df_i_j = new_df_i_j.sort_values('Quarter')
    last_val = 0
    new_df_i_j['Interest Income per Quarter'] = new_df_i_j['Interest Income Year-to-Date'].apply(f)
    new_dfs.append(new_df_i_j)

Quarterized_dfs = pd.concat(new_dfs, ignore_index=True)
Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Start Date,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Total Assets,Interest Income Year-to-Date,Net Interest Income,Interest Expense,Non-Interest Income Year-to-Date,Non-Interest Expense,Quarter,Year,Interest Income per Quarter
0,1020180,2002-04-01 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31 00:00:00+00:00,5721282.0,68233.0,46844.0,21389.0,17200.0,41353.0,1,2004,68233.0
1,1020180,2002-04-01 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30 00:00:00+00:00,5870480.0,138086.0,95452.0,42634.0,35351.0,84535.0,2,2004,69853.0
2,1020180,2004-08-23 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30 00:00:00+00:00,5963700.0,212489.0,147314.0,65175.0,53242.0,127342.0,3,2004,74403.0
3,1020180,2004-08-23 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31 00:00:00+00:00,6141519.0,291916.0,203077.0,88839.0,72570.0,172413.0,4,2004,79427.0
4,1020180,2004-08-23 00:00:00+00:00,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31 00:00:00+00:00,6230236.0,81656.0,54637.0,27019.0,17515.0,44033.0,1,2005,81656.0


In [11]:
Quarterized_dfs['Normalized Interest Income per Quarter'] = Quarterized_dfs['Interest Income per Quarter']/Quarterized_dfs['Total Assets']

In [12]:
Quarterized_dfs.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\Quarterized_dfs.csv', encoding='utf-8')

In [13]:
len(Quarterized_dfs)

243757

## Yield Curve Dataset

In [14]:
rates_df = pd.read_csv(r"C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\yield-curve-rates-2004-2024.csv")

In [15]:
# Changing Date to Date-time Format
rates_df['Date'] = pd.to_datetime(rates_df['Date'].astype(str), format='mixed', utc=True)

In [16]:
# Giving the Year and Quarter each entry
rates_df['Year'] = rates_df['Date'].dt.year
rates_df['Quarter'] = rates_df['Date'].dt.quarter

In [17]:
finalrates = rates_df.groupby(['Year', 'Quarter']).mean()
finalrates.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004,1,2004-02-16 08:30:58.064516096+00:00,0.909677,,0.93371,,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,
2004,2,2004-05-16 03:29:01.935483904+00:00,0.954516,,1.096129,,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,
2004,3,2004-08-16 05:15:00+00:00,1.37,,1.513906,,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,
2004,4,2004-11-16 05:01:56.129032320+00:00,1.841774,,2.047419,,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,
2005,1,2005-02-15 01:58:01.967213056+00:00,2.360984,,2.58918,,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,
2005,2,2005-05-16 09:22:30+00:00,2.709687,,2.926719,,3.179688,3.337344,3.645938,3.732344,3.872969,3.984688,4.158594,4.54875,
2005,3,2005-08-16 15:45:00+00:00,3.228594,,3.438281,,3.707031,3.791094,3.957969,3.989219,4.039375,4.108437,4.215156,4.507656,
2005,4,2005-11-15 21:38:21.639344256+00:00,3.702131,,3.913607,,4.250492,4.288525,4.36459,4.372295,4.391148,4.425738,4.488197,4.767541,


In [19]:
finalrates.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\finalrates.csv')

In [18]:
finalrates_df = finalrates.copy()
finalrates_df = finalrates_df.drop(columns=["Date"])
finalrates_df.reset_index(inplace=True)

In [57]:
finalrates_df.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\finalrates_df.csv')

## Spreads Data

In [19]:
columns_to_consider = finalrates.columns[3:]
columns_to_consider

Index(['3 Mo', '4 Mo', '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr',
       '20 Yr', '30 Yr'],
      dtype='object')

In [20]:
column_pairs = list(itertools.combinations(columns_to_consider, 2)) ## Gives possible combinations of Spread

In [25]:
# Data frame to store the spreads
spreads_df = pd.DataFrame()

for (col1, col2) in column_pairs:
    spread_col_name = f"{col2}-{col1}_spread"
    spreads_df[spread_col_name] = finalrates[col2] - finalrates[col1]
        
# Add the Year-Quarter columns: Must have 80 columns in total
result_df = pd.concat([finalrates.iloc[:, :3], spreads_df], axis=1)

result_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,1 Mo,2 Mo,4 Mo-3 Mo_spread,6 Mo-3 Mo_spread,1 Yr-3 Mo_spread,2 Yr-3 Mo_spread,3 Yr-3 Mo_spread,5 Yr-3 Mo_spread,7 Yr-3 Mo_spread,...,7 Yr-5 Yr_spread,10 Yr-5 Yr_spread,20 Yr-5 Yr_spread,30 Yr-5 Yr_spread,10 Yr-7 Yr_spread,20 Yr-7 Yr_spread,30 Yr-7 Yr_spread,20 Yr-10 Yr_spread,30 Yr-10 Yr_spread,30 Yr-20 Yr_spread
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2004,1,2004-02-16 08:30:58.064516096+00:00,0.909677,,,0.069839,0.287742,0.752903,1.229355,2.047258,2.567742,...,0.520484,1.029032,1.896935,,0.508548,1.376452,,0.867903,,
2004,2,2004-05-16 03:29:01.935483904+00:00,0.954516,,,0.263548,0.679355,1.358548,1.875806,2.624839,3.087742,...,0.462903,0.876129,1.634677,,0.413226,1.171774,,0.758548,,
2004,3,2004-08-16 05:15:00+00:00,1.37,,,0.274531,0.561094,1.042656,1.405625,1.991719,2.408281,...,0.416562,0.795781,1.562656,,0.379219,1.146094,,0.766875,,
2004,4,2004-11-16 05:01:56.129032320+00:00,1.841774,,,0.250161,0.425323,0.775,1.011452,1.44629,1.807581,...,0.36129,0.682097,1.381129,,0.320806,1.019839,,0.699032,,
2005,1,2005-02-15 01:58:01.967213056+00:00,2.360984,,,0.287705,0.483279,0.865738,1.034754,1.301639,1.509672,...,0.208033,0.412787,0.873279,,0.204754,0.665246,,0.460492,,


In [26]:
result_df.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\results_df.csv')

## GDP and Unemployment Dataset

In [21]:
gdp_df = pd.read_csv(r"C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\GDP1.csv", parse_dates=True) ## GDP
gdp_df['DATE'] = pd.to_datetime(gdp_df['DATE'])
print(gdp_df.shape)
gdp_df.head()

(309, 2)


Unnamed: 0,DATE,GDP
0,1947-01-01,243.164
1,1947-04-01,245.968
2,1947-07-01,249.585
3,1947-10-01,259.745
4,1948-01-01,265.742


In [22]:
uner_df = pd.read_csv(r"C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\UNRATE.csv") ## Unemployment Rate
uner_df['DATE'] = pd.to_datetime(uner_df['DATE'])
print(uner_df.shape)
uner_df.head()

(82, 2)


Unnamed: 0,DATE,UNRATE
0,2004-01-01,5.7
1,2004-04-01,5.6
2,2004-07-01,5.433333
3,2004-10-01,5.433333
4,2005-01-01,5.3


In [23]:
df_gdp_uner = pd.merge(gdp_df,uner_df,on='DATE')
print(df_gdp_uner.shape)
df_gdp_uner.head()

(81, 3)


Unnamed: 0,DATE,GDP,UNRATE
0,2004-01-01,11923.447,5.7
1,2004-04-01,12112.815,5.6
2,2004-07-01,12305.307,5.433333
3,2004-10-01,12527.214,5.433333
4,2005-01-01,12767.286,5.3


In [24]:
df_gdp_uner['DATE'] = pd.to_datetime(df_gdp_uner['DATE'].astype(str), format='mixed', utc=True)
df_gdp_uner.head()

Unnamed: 0,DATE,GDP,UNRATE
0,2004-01-01 00:00:00+00:00,11923.447,5.7
1,2004-04-01 00:00:00+00:00,12112.815,5.6
2,2004-07-01 00:00:00+00:00,12305.307,5.433333
3,2004-10-01 00:00:00+00:00,12527.214,5.433333
4,2005-01-01 00:00:00+00:00,12767.286,5.3


In [25]:
def update_quart_enddate_shift(df):
    # Define a dictionary to map the month to the new date
    quarter_end_dates = {1: (12, 31), 4: (3, 31), 7: (6, 30), 10: (9, 30)}

    # Apply the mapping to update the dates
    for month, (new_month, new_day) in quarter_end_dates.items():
        df.loc[df['DATE'].dt.month == month, 'DATE'] = df['DATE'].apply(
            lambda x: x.replace(month=new_month, day=new_day) if x.month == month else x
        )

    # offset the years by 1 for those whose month is december
    tmp_ids = df[df["DATE"].dt.month == 12].index
    df.loc[tmp_ids, "DATE"] = df.loc[tmp_ids, "DATE"] - pd.DateOffset(years=1)

    # drop the first row df_gdp_uner
    df.drop(df.index[0], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [26]:
df_gdp_uner =  update_quart_enddate_shift(df_gdp_uner)
df_gdp_uner.head()

Unnamed: 0,DATE,GDP,UNRATE
0,2004-03-31 00:00:00+00:00,12112.815,5.6
1,2004-06-30 00:00:00+00:00,12305.307,5.433333
2,2004-09-30 00:00:00+00:00,12527.214,5.433333
3,2004-12-31 00:00:00+00:00,12767.286,5.3
4,2005-03-31 00:00:00+00:00,12922.656,5.1


In [27]:
name_dict = {'DATE':'Date',
             'GDP' : 'GDP',
             'UNRATE': 'UNRATE'}
df_gdp_uner = df_gdp_uner.rename(columns = name_dict)
df_gdp_uner.head()

Unnamed: 0,Date,GDP,UNRATE
0,2004-03-31 00:00:00+00:00,12112.815,5.6
1,2004-06-30 00:00:00+00:00,12305.307,5.433333
2,2004-09-30 00:00:00+00:00,12527.214,5.433333
3,2004-12-31 00:00:00+00:00,12767.286,5.3
4,2005-03-31 00:00:00+00:00,12922.656,5.1


In [28]:
df_gdp_uner['Year'] = df_gdp_uner['Date'].dt.year
df_gdp_uner['Quarter'] = df_gdp_uner['Date'].dt.quarter
df_gdp_uner.head()

Unnamed: 0,Date,GDP,UNRATE,Year,Quarter
0,2004-03-31 00:00:00+00:00,12112.815,5.6,2004,1
1,2004-06-30 00:00:00+00:00,12305.307,5.433333,2004,2
2,2004-09-30 00:00:00+00:00,12527.214,5.433333,2004,3
3,2004-12-31 00:00:00+00:00,12767.286,5.3,2004,4
4,2005-03-31 00:00:00+00:00,12922.656,5.1,2005,1


In [53]:
df_gdp_uner.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\df_gdp_uner.csv')

## Combine Macroencomoic Variables: Interest Rates, GDP, Unemployment Rates

In [29]:
cmev = pd.concat([finalrates_df, df_gdp_uner], axis=1)
cmev.head()

Unnamed: 0,Year,Quarter,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr,Date,GDP,UNRATE,Year.1,Quarter.1
0,2004,1,0.909677,,0.93371,,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,,2004-03-31 00:00:00+00:00,12112.815,5.6,2004.0,1.0
1,2004,2,0.954516,,1.096129,,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,,2004-06-30 00:00:00+00:00,12305.307,5.433333,2004.0,2.0
2,2004,3,1.37,,1.513906,,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,,2004-09-30 00:00:00+00:00,12527.214,5.433333,2004.0,3.0
3,2004,4,1.841774,,2.047419,,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,,2004-12-31 00:00:00+00:00,12767.286,5.3,2004.0,4.0
4,2005,1,2.360984,,2.58918,,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,,2005-03-31 00:00:00+00:00,12922.656,5.1,2005.0,1.0


In [59]:
cmev.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\cmev.csv')

## Combine Fincial Dataset with the Macroeconomic variables

In [30]:
# selecting some columns of Quater
small_Quaterized_dfs = Quarterized_dfs[['RSSD ID', 'Firm Legal Name', 'Bank Count', 'Reporting Date', 'Total Assets', 'Year', 'Quarter',
                                        'Normalized Interest Income per Quarter']]

In [1]:
small_Quaterized_dfs.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\small_Quaterized_dfs.csv')

NameError: name 'small_Quaterized_dfs' is not defined

In [31]:
# Localize datetime columns to None
small_Quaterized_dfs.loc[:, 'Reporting Date'] = small_Quaterized_dfs['Reporting Date'].dt.tz_localize(None)
cmev.loc[:, 'Date'] = cmev['Date'].dt.tz_localize(None)

# Merge the DataFrames
df = small_Quaterized_dfs.merge(cmev, left_on='Reporting Date', right_on='Date', how='left')
df.drop(columns=['Date'], inplace=True)

# Drop unwanted columns
df.drop(columns=['Year_y', 'Quarter_y'], inplace=True)

# Rename columns
df.rename(columns={'Year_x': 'Year', 'Quarter_x': 'Quarter'}, inplace=True)

df.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Bank Count,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,2 Mo,...,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr,GDP,UNRATE
0,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-03-31,5721282.0,2004,1,0.011926,0.909677,,...,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,,12112.815,5.6
1,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-06-30,5870480.0,2004,2,0.011899,0.954516,,...,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,,12305.307,5.433333
2,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-09-30,5963700.0,2004,3,0.012476,1.37,,...,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,,12527.214,5.433333
3,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-12-31,6141519.0,2004,4,0.012933,1.841774,,...,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,,12767.286,5.3
4,1020180,BREMER FINANCIAL CORPORATION,10.0,2005-03-31,6230236.0,2005,1,0.013106,2.360984,,...,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,,12922.656,5.1


In [64]:
df.to_csv(r'C:\Users\meagh\Documents\UNH\Conferences and Workshops\Math to Industry Bootcamp 2024\Capstone Project\data\df.csv')