In [150]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

import os

### Read and Combine the txt files

In [2]:
# Path to the folder containing the text files
folder_path = r"C:/Users/odoms001/Downloads/IMA_Project/CS_Finance_Data"

# List all files in the folder
file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')] ## List Comprehension

# Initialize an empty list to store DataFrames
dataframes = []

# Read each file and append the DataFrame to the list
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, sep='^', header=0, engine='python', encoding='latin1', on_bad_lines='skip')
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the combined DataFrame
combined_df.head()

Unnamed: 0,RSSD9001,RSSD9999,RSSD9007,RSSD9008,RSSD9132,RSSD9032,RSSD9146,BHBC3368,BHBC3402,BHBC3516,...,BHCANC99,BHCKFT42,BHCKFT43,BHCKFT44,BHSPFT42,BHSPFT43,BHSPFT44,BHSPMZ36,BHSPNK60,BHCKMG95
0,1020180,20040331,20020401,20040822,551111,9,11.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1020201,20040331,20001028,20051230,551111,2,1.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1020340,20040331,20040301,20040531,551111,7,29.0,0.0,0.0,0.0,...,,,,,,,,,,
3,1020395,20040331,20040211,20081230,551111,6,1.0,,,,...,,,,,,,,,,
4,1020582,20040331,19980101,20060228,551111,7,1.0,,,,...,,,,,,,,,,


### Subset some columns:

In [3]:
smalldf = combined_df[['RSSD9001','RSSD9017', 'RSSD9032', 'RSSD9146', 'RSSD9999', 'BHCK4107', 'BHCK2170']]
smalldf.head()

Unnamed: 0,RSSD9001,RSSD9017,RSSD9032,RSSD9146,RSSD9999,BHCK4107,BHCK2170
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,20040331,68233.0,5721282.0
1,1020201,HSBC USA INC.,2,1.0,20040331,883035.0,102501934.0
2,1020340,"HARRIS BANKCORP, INC.",7,29.0,20040331,276513.0,32344528.0
3,1020395,SOUTHERN NATIONAL CORPORATION,6,1.0,20040331,2980.0,217529.0
4,1020582,"WCN BANCORP, INC.",7,1.0,20040331,2692.0,240470.0


In [4]:
name_dict = {'RSSD9001':'RSSD ID',
             'RSSD9017':'Firm Legal Name',
             'RSSD9032':'FR District Code',
             'RSSD9146':'Bank Count',
             'RSSD9999':'Reporting Date',
             'BHCK4107':'Interest Income',
             'BHCK2170':'Total Assets',
             }
smalldf = smalldf.rename(columns = name_dict)
print(smalldf.shape)
smalldf.head()

(243757, 7)


Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,20040331,68233.0,5721282.0
1,1020201,HSBC USA INC.,2,1.0,20040331,883035.0,102501934.0
2,1020340,"HARRIS BANKCORP, INC.",7,29.0,20040331,276513.0,32344528.0
3,1020395,SOUTHERN NATIONAL CORPORATION,6,1.0,20040331,2980.0,217529.0
4,1020582,"WCN BANCORP, INC.",7,1.0,20040331,2692.0,240470.0


In [5]:
smalldf['Reporting Date'] = pd.to_datetime(smalldf['Reporting Date'], format = "%Y%m%d")

In [6]:
smalldf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243757 entries, 0 to 243756
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   RSSD ID           243757 non-null  int64         
 1   Firm Legal Name   243757 non-null  object        
 2   FR District Code  243757 non-null  int64         
 3   Bank Count        234638 non-null  float64       
 4   Reporting Date    243757 non-null  datetime64[ns]
 5   Interest Income   73843 non-null   float64       
 6   Total Assets      73843 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 13.0+ MB


In [7]:
smalldf.isnull().sum()

RSSD ID                  0
Firm Legal Name          0
FR District Code         0
Bank Count            9119
Reporting Date           0
Interest Income     169914
Total Assets        169914
dtype: int64

In [8]:
smalldf.shape

(243757, 7)

In [9]:
smalldf = smalldf[smalldf['Interest Income'].notna()]

In [10]:
smalldf.shape

(73843, 7)

### Define Quarters and Years


In [11]:
# Define Quarter

smalldf['Quarter'] = smalldf['Reporting Date'].dt.quarter
# smalldf.head()

In [12]:
# Define Year

smalldf['Year'] = smalldf['Reporting Date'].dt.year
smalldf.head()

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004
1,1020201,HSBC USA INC.,2,1.0,2004-03-31,883035.0,102501934.0,1,2004
2,1020340,"HARRIS BANKCORP, INC.",7,29.0,2004-03-31,276513.0,32344528.0,1,2004
3,1020395,SOUTHERN NATIONAL CORPORATION,6,1.0,2004-03-31,2980.0,217529.0,1,2004
4,1020582,"WCN BANCORP, INC.",7,1.0,2004-03-31,2692.0,240470.0,1,2004


## Quaterization

In [13]:
def quarterize(df, cols_to_process, dt_column):
    
  # indices is considered to be default
    fq_ids = df[(df[dt_column].dt.month == 3) & (df[dt_column].dt.day == 31)].index
    for col in cols_to_process:
        
        df[f"Q{col}"] = df[col].diff()
        df.loc[fq_ids, f"Q{col}"] = df.loc[fq_ids, col]
    return df

In [14]:
ls_dfs = []
for bank in smalldf['RSSD ID'].unique():
    OneBank = smalldf[smalldf["RSSD ID"] == bank].sort_values(by=['Reporting Date'])
    tmp_df = quarterize(OneBank, ["Interest Income"], dt_column="Reporting Date")
    ls_dfs.append(tmp_df)

Quarterized_dfs = pd.concat(ls_dfs, ignore_index=True)
Quarterized_dfs.head(10)

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004,68233.0
1,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30,138086.0,5870480.0,2,2004,69853.0
2,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30,212489.0,5963700.0,3,2004,74403.0
3,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31,291916.0,6141519.0,4,2004,79427.0
4,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31,81656.0,6230236.0,1,2005,81656.0
5,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-06-30,170175.0,6450716.0,2,2005,88519.0
6,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-09-30,265290.0,6336699.0,3,2005,95115.0
7,1020180,BREMER FINANCIAL CORPORATION,9,9.0,2005-12-31,362347.0,6555895.0,4,2005,97057.0
8,1020180,BREMER FINANCIAL CORPORATION,9,9.0,2006-03-31,100170.0,6480680.0,1,2006,100170.0
9,1020180,BREMER FINANCIAL CORPORATION,9,9.0,2006-06-30,206822.0,6638952.0,2,2006,106652.0


In [15]:
# pd.set_option('display.max_rows', None)

In [16]:
Quarterized_dfs.head(200)

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004,68233.0
1,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30,138086.0,5870480.0,2,2004,69853.0
2,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30,212489.0,5963700.0,3,2004,74403.0
3,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31,291916.0,6141519.0,4,2004,79427.0
4,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31,81656.0,6230236.0,1,2005,81656.0
...,...,...,...,...,...,...,...,...,...,...
195,1020902,"FIRST NATIONAL OF NEBRASKA, INC.",10,13.0,2007-06-30,500692.0,14954127.0,2,2007,263559.0
196,1020902,"FIRST NATIONAL OF NEBRASKA, INC.",10,13.0,2007-09-30,779039.0,15284709.0,3,2007,278347.0
197,1020902,"FIRST NATIONAL OF NEBRASKA, INC.",10,12.0,2007-12-31,1046824.0,16020746.0,4,2007,267785.0
198,1020902,"FIRST NATIONAL OF NEBRASKA, INC.",10,10.0,2008-03-31,253546.0,16296348.0,1,2008,253546.0


In [17]:
Quarterized_dfs.isnull().sum()

RSSD ID                0
Firm Legal Name        0
FR District Code       0
Bank Count          2989
Reporting Date         0
Interest Income        0
Total Assets           0
Quarter                0
Year                   0
QInterest Income     341
dtype: int64

In [18]:
q_null_rows = Quarterized_dfs[Quarterized_dfs["QInterest Income"].isnull()]

In [19]:
q_null_rows

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income
60893,1059827,"DREXEL BANCSHARES, INC.",10,1.0,2004-06-30,4180.0,167439.0,2,2004,
60897,1067000,"FSC BANCSHARES, INC.",10,1.0,2004-06-30,4455.0,152129.0,2,2004,
60904,1082786,RIVERHILLS CAPITAL CORPORATION,6,1.0,2004-06-30,3696.0,151752.0,2,2004,
60911,1083318,FIRST NATIONAL CORPORATION OF PICAYUNE,6,1.0,2004-06-30,5043.0,156760.0,2,2004,
60918,1084203,"MERCHANTS & FARMERS BANCSHARES, INC.",6,1.0,2004-06-30,3742.0,155968.0,2,2004,
...,...,...,...,...,...,...,...,...,...,...
73744,1100028,UNITY CAPITAL CORPORATION,8,1.0,2021-09-30,3426.0,272750.0,3,2021,
73755,5478903,"CSBH, LLC",5,1.0,2021-12-31,3544.0,125232.0,4,2021,
73792,5756704,BURKE & HERBERT FINANCIAL SERVICES CORP.,5,1.0,2022-12-31,112957.0,3562898.0,4,2022,
73834,3828661,OHIO FARMERS INSURANCE COMPANY,4,,2023-09-30,156582.0,10246242.0,3,2023,


In [20]:
quotient = round(Quarterized_dfs['Interest Income']/ Quarterized_dfs['Quarter'], 1)
quotient

0        68233.0
1        69043.0
2        70829.7
3        72979.0
4        81656.0
          ...   
73838    68203.0
73839    42530.0
73840    51929.0
73841    90475.0
73842    39223.0
Length: 73843, dtype: float64

In [21]:
Quarterized_dfs["QInterest Income"] = Quarterized_dfs["QInterest Income"].fillna(quotient)
Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004,68233.0
1,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30,138086.0,5870480.0,2,2004,69853.0
2,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30,212489.0,5963700.0,3,2004,74403.0
3,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31,291916.0,6141519.0,4,2004,79427.0
4,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31,81656.0,6230236.0,1,2005,81656.0


In [22]:
Quarterized_dfs.iloc[60932, :]

RSSD ID                                    1099784
Firm Legal Name     CITIZENS FINANCIAL GROUP, INC.
FR District Code                                 8
Bank Count                                     1.0
Reporting Date                 2004-06-30 00:00:00
Interest Income                             4607.0
Total Assets                              152719.0
Quarter                                          2
Year                                          2004
QInterest Income                            2303.5
Name: 60932, dtype: object

### Normalization

In [23]:
# Normalize Components
Quarterized_dfs['Normalized Interest Income per Quarter'] = Quarterized_dfs['QInterest Income']/Quarterized_dfs['Total Assets']

# Quarterized_dfs['Normalized Net Interest Income per Quarter'] = Quarterized_dfs['QNet Interest Income']/Quarterized_dfs['Total Assets']

Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income,Normalized Interest Income per Quarter
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004,68233.0,0.011926
1,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30,138086.0,5870480.0,2,2004,69853.0,0.011899
2,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30,212489.0,5963700.0,3,2004,74403.0,0.012476
3,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31,291916.0,6141519.0,4,2004,79427.0,0.012933
4,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31,81656.0,6230236.0,1,2005,81656.0,0.013106


### We bring our Macroeconomic Variables

In [24]:
gdp_df = pd.read_csv(r"/Users/odoms001/Downloads/IMA_Project/CS_Finance_Data/MacroEconomic_Variables/GDP.csv", parse_dates=True)
gdp_df['DATE'] = pd.to_datetime(gdp_df['DATE'])
print(gdp_df.shape)
gdp_df.head()

(81, 2)


Unnamed: 0,DATE,GDP
0,2004-01-01,11923.447
1,2004-04-01,12112.815
2,2004-07-01,12305.307
3,2004-10-01,12527.214
4,2005-01-01,12767.286


In [25]:
uner_df = pd.read_csv(r"/Users/odoms001/Downloads/IMA_Project/CS_Finance_Data/MacroEconomic_Variables/UNRATE.csv")
uner_df['DATE'] = pd.to_datetime(uner_df['DATE'])
print(uner_df.shape)
uner_df.head()

(82, 2)


Unnamed: 0,DATE,UNRATE
0,2004-01-01,5.7
1,2004-04-01,5.6
2,2004-07-01,5.433333
3,2004-10-01,5.433333
4,2005-01-01,5.3


In [26]:
df_gdp_uner = pd.merge(gdp_df,uner_df,on='DATE')
print(df_gdp_uner.shape)
df_gdp_uner.head()

(81, 3)


Unnamed: 0,DATE,GDP,UNRATE
0,2004-01-01,11923.447,5.7
1,2004-04-01,12112.815,5.6
2,2004-07-01,12305.307,5.433333
3,2004-10-01,12527.214,5.433333
4,2005-01-01,12767.286,5.3


In [27]:
df_gdp_uner.rename(columns={'DATE': 'date'}, inplace=True)

In [28]:
def update_quart_enddate_shift(df):
    # Define a dictionary to map the month to the new date
    quarter_end_dates = {1: (12, 31), 4: (3, 31), 7: (6, 30), 10: (9, 30)}

    # Apply the mapping to update the dates
    for month, (new_month, new_day) in quarter_end_dates.items():
        df.loc[df['date'].dt.month == month, 'date'] = df['date'].apply(
            lambda x: x.replace(month=new_month, day=new_day) if x.month == month else x
        )

    # offset the years by 1 for those whose month is december
    tmp_ids = df[df["date"].dt.month == 12].index
    df.loc[tmp_ids, "date"] = df.loc[tmp_ids, "date"] - pd.DateOffset(years=1)

    # drop the first row df_gdp_uner
    df.drop(df.index[0], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [29]:
df_gdp_uner =  update_quart_enddate_shift(df_gdp_uner)
df_gdp_uner.head()

Unnamed: 0,date,GDP,UNRATE
0,2004-03-31,12112.815,5.6
1,2004-06-30,12305.307,5.433333
2,2004-09-30,12527.214,5.433333
3,2004-12-31,12767.286,5.3
4,2005-03-31,12922.656,5.1


### US Department of Treasury Dataset

In [30]:
rates_df = pd.read_csv(r"/Users/odoms001/Downloads/IMA_Project/CS_Finance_Data/MacroEconomic_Variables/yield-curve-rates-2004-2024.csv", encoding='latin1')
rates_df.head()

Unnamed: 0,Date,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,07/15/2024,5.48,5.51,5.43,5.4,5.23,4.85,4.44,4.23,4.13,4.16,4.23,4.56,4.46
1,07/12/2024,5.47,5.52,5.43,5.41,5.23,4.87,4.45,4.22,4.1,4.13,4.18,4.5,4.39
2,07/11/2024,5.48,5.53,5.44,5.41,5.25,4.91,4.5,4.26,4.13,4.15,4.2,4.51,4.41
3,07/10/2024,5.46,5.5,5.46,5.46,5.33,5.01,4.62,4.38,4.24,4.24,4.28,4.58,4.47
4,07/09/2024,5.45,5.51,5.46,5.46,5.34,5.02,4.62,4.37,4.24,4.25,4.3,4.59,4.49


In [31]:
# Changing Date to Date-time Format
rates_df['Date'] = pd.to_datetime(rates_df['Date'].astype(str), utc = True)
rates_df.head()

Unnamed: 0,Date,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,2024-07-15 00:00:00+00:00,5.48,5.51,5.43,5.4,5.23,4.85,4.44,4.23,4.13,4.16,4.23,4.56,4.46
1,2024-07-12 00:00:00+00:00,5.47,5.52,5.43,5.41,5.23,4.87,4.45,4.22,4.1,4.13,4.18,4.5,4.39
2,2024-07-11 00:00:00+00:00,5.48,5.53,5.44,5.41,5.25,4.91,4.5,4.26,4.13,4.15,4.2,4.51,4.41
3,2024-07-10 00:00:00+00:00,5.46,5.5,5.46,5.46,5.33,5.01,4.62,4.38,4.24,4.24,4.28,4.58,4.47
4,2024-07-09 00:00:00+00:00,5.45,5.51,5.46,5.46,5.34,5.02,4.62,4.37,4.24,4.25,4.3,4.59,4.49


In [32]:
# Giving the Year and Quarter each entry
rates_df['Year'] = rates_df['Date'].dt.year
rates_df['Quarter'] = rates_df['Date'].dt.quarter
rates_df.sort_values('Date').head()

Unnamed: 0,Date,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr,Year,Quarter
5138,2004-01-02 00:00:00+00:00,0.88,,0.93,,1.02,1.31,1.94,2.47,3.36,3.9,4.38,5.21,,2004,1
5137,2004-01-05 00:00:00+00:00,0.88,,0.91,,1.05,1.35,1.95,2.51,3.39,3.92,4.41,5.23,,2004,1
5136,2004-01-06 00:00:00+00:00,0.88,,0.91,,1.03,1.3,1.84,2.38,3.26,3.8,4.29,5.13,,2004,1
5135,2004-01-07 00:00:00+00:00,0.88,,0.91,,1.02,1.29,1.84,2.36,3.25,3.76,4.27,5.11,,2004,1
5134,2004-01-08 00:00:00+00:00,0.87,,0.88,,1.01,1.29,1.85,2.37,3.24,3.76,4.27,5.12,,2004,1


In [33]:
finalrates = rates_df.groupby(['Year', 'Quarter']).mean()
# finalrates = finalrates.drop(columns=["Date"], axis=1)
finalrates.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004,1,0.909677,,0.93371,,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,
2004,2,0.954516,,1.096129,,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,
2004,3,1.37,,1.513906,,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,
2004,4,1.841774,,2.047419,,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,
2005,1,2.360984,,2.58918,,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,
2005,2,2.709687,,2.926719,,3.179688,3.337344,3.645938,3.732344,3.872969,3.984688,4.158594,4.54875,
2005,3,3.228594,,3.438281,,3.707031,3.791094,3.957969,3.989219,4.039375,4.108437,4.215156,4.507656,
2005,4,3.702131,,3.913607,,4.250492,4.288525,4.36459,4.372295,4.391148,4.425738,4.488197,4.767541,
2006,1,4.35871,,4.506613,,4.657258,4.640323,4.604677,4.582581,4.552419,4.555484,4.577097,4.774032,4.663056
2006,2,4.674921,,4.830159,,5.033651,5.021429,4.997937,4.987302,4.993175,5.018095,5.072698,5.290635,5.143016


In [34]:
finalrates_df = finalrates.copy()
finalrates_df.reset_index(inplace=True)
finalrates_df.head()

Unnamed: 0,Year,Quarter,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr
0,2004,1,0.909677,,0.93371,,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,
1,2004,2,0.954516,,1.096129,,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,
2,2004,3,1.37,,1.513906,,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,
3,2004,4,1.841774,,2.047419,,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,
4,2005,1,2.360984,,2.58918,,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,


In [35]:
cmev = pd.concat([finalrates_df, df_gdp_uner], axis = 1)
cmev.drop(columns=['Year', 'Quarter'], inplace=True)
cmev.head()

Unnamed: 0,1 Mo,2 Mo,3 Mo,4 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr,date,GDP,UNRATE
0,0.909677,,0.93371,,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,,2004-03-31,12112.815,5.6
1,0.954516,,1.096129,,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,,2004-06-30,12305.307,5.433333
2,1.37,,1.513906,,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,,2004-09-30,12527.214,5.433333
3,1.841774,,2.047419,,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,,2004-12-31,12767.286,5.3
4,2.360984,,2.58918,,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,,2005-03-31,12922.656,5.1


In [36]:
Quarterized_dfs.head()

Unnamed: 0,RSSD ID,Firm Legal Name,FR District Code,Bank Count,Reporting Date,Interest Income,Total Assets,Quarter,Year,QInterest Income,Normalized Interest Income per Quarter
0,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-03-31,68233.0,5721282.0,1,2004,68233.0,0.011926
1,1020180,BREMER FINANCIAL CORPORATION,9,11.0,2004-06-30,138086.0,5870480.0,2,2004,69853.0,0.011899
2,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-09-30,212489.0,5963700.0,3,2004,74403.0,0.012476
3,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2004-12-31,291916.0,6141519.0,4,2004,79427.0,0.012933
4,1020180,BREMER FINANCIAL CORPORATION,9,10.0,2005-03-31,81656.0,6230236.0,1,2005,81656.0,0.013106


In [37]:
# selecting some columns of Quater
small_Quaterized_dfs = Quarterized_dfs[['RSSD ID', 'Firm Legal Name', 'Bank Count', 'Reporting Date', "Total Assets", 'Year', 'Quarter',
                                        'Normalized Interest Income per Quarter']]

small_Quaterized_dfs.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Bank Count,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter
0,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-03-31,5721282.0,2004,1,0.011926
1,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-06-30,5870480.0,2004,2,0.011899
2,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-09-30,5963700.0,2004,3,0.012476
3,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-12-31,6141519.0,2004,4,0.012933
4,1020180,BREMER FINANCIAL CORPORATION,10.0,2005-03-31,6230236.0,2005,1,0.013106


In [38]:
small_Quaterized_dfs.isnull().sum()

RSSD ID                                      0
Firm Legal Name                              0
Bank Count                                2989
Reporting Date                               0
Total Assets                                 0
Year                                         0
Quarter                                      0
Normalized Interest Income per Quarter       0
dtype: int64

In [39]:
df = small_Quaterized_dfs.merge(cmev, left_on='Reporting Date', right_on='date', how='left')
df.drop(columns=['date'], inplace=True)
df.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Bank Count,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,2 Mo,...,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,30 Yr,GDP,UNRATE
0,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-03-31,5721282.0,2004,1,0.011926,0.909677,,...,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,,12112.815,5.6
1,1020180,BREMER FINANCIAL CORPORATION,11.0,2004-06-30,5870480.0,2004,2,0.011899,0.954516,,...,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,,12305.307,5.433333
2,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-09-30,5963700.0,2004,3,0.012476,1.37,,...,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,,12527.214,5.433333
3,1020180,BREMER FINANCIAL CORPORATION,10.0,2004-12-31,6141519.0,2004,4,0.012933,1.841774,,...,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,,12767.286,5.3
4,1020180,BREMER FINANCIAL CORPORATION,10.0,2005-03-31,6230236.0,2005,1,0.013106,2.360984,,...,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,,12922.656,5.1


In [40]:
df.shape

(73843, 23)

In [41]:
df.isnull().sum()

RSSD ID                                       0
Firm Legal Name                               0
Bank Count                                 2989
Reporting Date                                0
Total Assets                                  0
Year                                          0
Quarter                                       0
Normalized Interest Income per Quarter        0
1 Mo                                        387
2 Mo                                      66265
3 Mo                                        387
4 Mo                                      71966
6 Mo                                        387
1 Yr                                        387
2 Yr                                        387
3 Yr                                        387
5 Yr                                        387
7 Yr                                        387
10 Yr                                       387
20 Yr                                       387
30 Yr                                   

In [42]:
df_dropped = df.drop(columns=["Bank Count","2 Mo", "4 Mo", "30 Yr"])
df_dropped.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE
0,1020180,BREMER FINANCIAL CORPORATION,2004-03-31,5721282.0,2004,1,0.011926,0.909677,0.93371,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6
1,1020180,BREMER FINANCIAL CORPORATION,2004-06-30,5870480.0,2004,2,0.011899,0.954516,1.096129,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333
2,1020180,BREMER FINANCIAL CORPORATION,2004-09-30,5963700.0,2004,3,0.012476,1.37,1.513906,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,12527.214,5.433333
3,1020180,BREMER FINANCIAL CORPORATION,2004-12-31,6141519.0,2004,4,0.012933,1.841774,2.047419,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,12767.286,5.3
4,1020180,BREMER FINANCIAL CORPORATION,2005-03-31,6230236.0,2005,1,0.013106,2.360984,2.58918,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,12922.656,5.1


In [None]:
df_final.shape

In [None]:
df_final.isnull().sum()

In [43]:
df_dropped.isnull().sum()

RSSD ID                                     0
Firm Legal Name                             0
Reporting Date                              0
Total Assets                                0
Year                                        0
Quarter                                     0
Normalized Interest Income per Quarter      0
1 Mo                                      387
3 Mo                                      387
6 Mo                                      387
1 Yr                                      387
2 Yr                                      387
3 Yr                                      387
5 Yr                                      387
7 Yr                                      387
10 Yr                                     387
20 Yr                                     387
GDP                                       387
UNRATE                                    387
dtype: int64

In [44]:
new_df = df_dropped[df_dropped["1 Mo"].notna()]

In [45]:
new_df.isnull().sum()

RSSD ID                                   0
Firm Legal Name                           0
Reporting Date                            0
Total Assets                              0
Year                                      0
Quarter                                   0
Normalized Interest Income per Quarter    0
1 Mo                                      0
3 Mo                                      0
6 Mo                                      0
1 Yr                                      0
2 Yr                                      0
3 Yr                                      0
5 Yr                                      0
7 Yr                                      0
10 Yr                                     0
20 Yr                                     0
GDP                                       0
UNRATE                                    0
dtype: int64

In [46]:
new_df.shape

(73456, 19)

In [47]:
df = new_df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73456 entries, 0 to 73837
Data columns (total 19 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   RSSD ID                                 73456 non-null  int64         
 1   Firm Legal Name                         73456 non-null  object        
 2   Reporting Date                          73456 non-null  datetime64[ns]
 3   Total Assets                            73456 non-null  float64       
 4   Year                                    73456 non-null  int64         
 5   Quarter                                 73456 non-null  int64         
 6   Normalized Interest Income per Quarter  73456 non-null  float64       
 7   1 Mo                                    73456 non-null  float64       
 8   3 Mo                                    73456 non-null  float64       
 9   6 Mo                                    73456 non-

### Fitting Linear Regression

In [48]:
X= df[['3 Mo', '1 Yr', '3 Yr', '10 Yr', 'UNRATE', 'GDP']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 7.753346559236614e-06
R² Score: 0.4119284898966027
Coefficients: [ 2.68867903e-04  1.61280694e-03 -1.39722402e-03  9.92374731e-04
  1.81793561e-04 -2.50481896e-07]
Intercept: 0.011269542780515767


In [49]:
X= df[['3 Mo', '1 Yr', '3 Yr', '10 Yr', 'UNRATE', 'GDP']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 7.753346559236686e-06
R² Score: 0.41192848989659714
Coefficients: [ 0.00045313  0.00272069 -0.00205487  0.00107102  0.00034478 -0.00088445]
Intercept: 0.011693442001161751


### Add a bank Characteristic

**1. Lagged Income**

In [53]:
#Making Lagged Variable for Normalized Interest Income

new_dfs = []
for i in df['RSSD ID'].unique():
    new_df_i = df.loc[df['RSSD ID']==i,:]
    for j in df['Year'].unique():
        new_df_i_j = new_df_i.loc[new_df_i['Year']==j,:]
        new_df_i_j = new_df_i_j.sort_values('Quarter')
        new_df_i_j["Lagged Income"] = new_df_i_j['Normalized Interest Income per Quarter'].shift(1)
        new_dfs.append(new_df_i_j)

df_lag = pd.concat(new_dfs, ignore_index=True)
df_lag.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE,Lagged Income
0,1020180,BREMER FINANCIAL CORPORATION,2004-03-31,5721282.0,2004,1,0.011926,0.909677,0.93371,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,
1,1020180,BREMER FINANCIAL CORPORATION,2004-06-30,5870480.0,2004,2,0.011899,0.954516,1.096129,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.011926
2,1020180,BREMER FINANCIAL CORPORATION,2004-09-30,5963700.0,2004,3,0.012476,1.37,1.513906,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,12527.214,5.433333,0.011899
3,1020180,BREMER FINANCIAL CORPORATION,2004-12-31,6141519.0,2004,4,0.012933,1.841774,2.047419,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,12767.286,5.3,0.012476
4,1020180,BREMER FINANCIAL CORPORATION,2005-03-31,6230236.0,2005,1,0.013106,2.360984,2.58918,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,12922.656,5.1,


Re-run the two linear Regression above. This time including the Lagged Values

In [54]:
df = df_lag.copy()

In [55]:
df.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE,Lagged Income
0,1020180,BREMER FINANCIAL CORPORATION,2004-03-31,5721282.0,2004,1,0.011926,0.909677,0.93371,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,
1,1020180,BREMER FINANCIAL CORPORATION,2004-06-30,5870480.0,2004,2,0.011899,0.954516,1.096129,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.011926
2,1020180,BREMER FINANCIAL CORPORATION,2004-09-30,5963700.0,2004,3,0.012476,1.37,1.513906,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,12527.214,5.433333,0.011899
3,1020180,BREMER FINANCIAL CORPORATION,2004-12-31,6141519.0,2004,4,0.012933,1.841774,2.047419,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,12767.286,5.3,0.012476
4,1020180,BREMER FINANCIAL CORPORATION,2005-03-31,6230236.0,2005,1,0.013106,2.360984,2.58918,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,12922.656,5.1,


In [56]:
dfNew = df.dropna()

In [57]:
df = dfNew.copy()

In [58]:
X= df[['3 Mo', '1 Yr', '3 Yr', '10 Yr', 'Lagged Income', 'UNRATE', 'GDP']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 1.3120715028557276e-06
R² Score: 0.9065472577896033
Coefficients: [ 1.21660096e-04  1.80623740e-04  4.12829742e-05  1.21140171e-04
  7.48522057e-01  3.82643967e-05 -5.08965478e-08]
Intercept: 0.0026298568308427504


So we observe here the Lagged Income has significantly improved the R^2 and having a coefficient $0.7$ suggests that autoregressive model could perform well. We also observe that all the other coefficents are getting to zero.

In [59]:
X= df[['3 Mo', '1 Yr', '3 Yr', '10 Yr', 'Lagged Income', 'UNRATE', 'GDP']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 1.312071502856152e-06
R² Score: 0.9065472577895731
Coefficients: [ 2.08809768e-04  3.11291540e-04  6.22677297e-05  1.33905379e-04
  2.83854662e-03  7.04779945e-05 -1.81614741e-04]
Intercept: 0.011759259316135265


So we observe here that the scaling of the predictor did not make a difference.

**2. Large Bank**

In [132]:
df_lag.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,1 Yr,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE,Lagged Income
0,1020180,BREMER FINANCIAL CORPORATION,2004-03-31,5721282.0,2004,1,0.011926,0.909677,0.93371,1.003548,1.221452,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,
1,1020180,BREMER FINANCIAL CORPORATION,2004-06-30,5870480.0,2004,2,0.011899,0.954516,1.096129,1.359677,1.775484,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.011926
2,1020180,BREMER FINANCIAL CORPORATION,2004-09-30,5963700.0,2004,3,0.012476,1.37,1.513906,1.788437,2.075,2.556563,2.919531,3.505625,3.922188,4.301406,5.068281,12527.214,5.433333,0.011899
3,1020180,BREMER FINANCIAL CORPORATION,2004-12-31,6141519.0,2004,4,0.012933,1.841774,2.047419,2.297581,2.472742,2.822419,3.058871,3.49371,3.855,4.175806,4.874839,12767.286,5.3,0.012476
4,1020180,BREMER FINANCIAL CORPORATION,2005-03-31,6230236.0,2005,1,0.013106,2.360984,2.58918,2.876885,3.072459,3.454918,3.623934,3.89082,4.098852,4.303607,4.764098,12922.656,5.1,


In [133]:
#Making a large bank variable
new_dfs = []

for i in df_lag['Quarter'].unique():
    new_df_i = df_lag.loc[df_lag['Quarter']==i,:]
    for j in df_lag['Year'].unique():
        new_df_i_j = new_df_i.loc[new_df_i['Year']==j,:]
        new_df_i_j['Large Bank'] = (new_df_i_j['Total Assets'] > new_df_i_j['Total Assets'].mean())
        new_dfs.append(new_df_i_j)

df_final = pd.concat(new_dfs, ignore_index=True)
df_final.head()

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,...,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE,Lagged Income,Large Bank
0,1020180,BREMER FINANCIAL CORPORATION,2004-03-31,5721282.0,2004,1,0.011926,0.909677,0.93371,1.003548,...,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,,False
1,1020201,HSBC USA INC.,2004-03-31,102501934.0,2004,1,0.008615,0.909677,0.93371,1.003548,...,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,,True
2,1020340,"HARRIS BANKCORP, INC.",2004-03-31,32344528.0,2004,1,0.008549,0.909677,0.93371,1.003548,...,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,,True
3,1020395,SOUTHERN NATIONAL CORPORATION,2004-03-31,217529.0,2004,1,0.013699,0.909677,0.93371,1.003548,...,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,,False
4,1020582,"WCN BANCORP, INC.",2004-03-31,240470.0,2004,1,0.011195,0.909677,0.93371,1.003548,...,1.686613,2.163065,2.980968,3.501452,4.01,4.877903,12112.815,5.6,,False


In [137]:
#Chek number of NA values
df_final.isna().sum()

#Drop NA values
df_final.dropna(inplace = True)

In [140]:
df_final.head(50)

Unnamed: 0,RSSD ID,Firm Legal Name,Reporting Date,Total Assets,Year,Quarter,Normalized Interest Income per Quarter,1 Mo,3 Mo,6 Mo,...,2 Yr,3 Yr,5 Yr,7 Yr,10 Yr,20 Yr,GDP,UNRATE,Lagged Income,Large Bank
18630,1020180,BREMER FINANCIAL CORPORATION,2004-06-30,5870480.0,2004,2,0.011899,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.011926,False
18631,1020201,HSBC USA INC.,2004-06-30,112790755.0,2004,2,0.008371,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.008615,True
18632,1020340,"HARRIS BANKCORP, INC.",2004-06-30,32604234.0,2004,2,0.008764,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.008549,True
18633,1020395,SOUTHERN NATIONAL CORPORATION,2004-06-30,217775.0,2004,2,0.013661,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.013699,False
18634,1020582,"WCN BANCORP, INC.",2004-06-30,244040.0,2004,2,0.011047,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.011195,False
18635,1020676,AMALGAMATED INVESTMENTS COMPANY,2004-06-30,645409.0,2004,2,0.010187,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.010457,False
18636,1020902,"FIRST NATIONAL OF NEBRASKA, INC.",2004-06-30,12172721.0,2004,2,0.013712,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.013461,True
18637,1020920,CENTRAL TRUST COMPANY,2004-06-30,172435.0,2004,2,0.012944,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.012615,False
18638,1021570,"VIKING CORPORATION, THE",2004-06-30,155150.0,2004,2,0.014289,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.013524,False
18639,1021628,BANAMEX USA BANCORP,2004-06-30,1902828.0,2004,2,0.008395,0.954516,1.096129,1.359677,...,2.454677,2.971935,3.720968,4.183871,4.597097,5.355645,12305.307,5.433333,0.008022,False


In [139]:
df_final.shape

(54399, 21)

Would want to re-run the regression model including the Large Bank Variable. But because of what was talked about in the chat, Will hold on 

### Fitting Lasso Model

In [50]:
df.columns

Index(['RSSD ID', 'Firm Legal Name', 'Reporting Date', 'Total Assets', 'Year',
       'Quarter', 'Normalized Interest Income per Quarter', '1 Mo', '3 Mo',
       '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', 'GDP',
       'UNRATE'],
      dtype='object')

In [51]:
X= df[['3 Mo', '1 Yr', '3 Yr', '10 Yr', 'UNRATE', 'GDP']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Lasso model
alpha = 0.1  # regularization strength
model = Lasso(alpha = alpha)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 1.0981567683188884e-05
R² Score: 0.16707617266738084
Coefficients: [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -4.17210939e-07]
Intercept: 0.018498612011919047


In [52]:
X= df[['1 Mo', '3 Mo', '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', 'GDP', 'UNRATE']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Lasso model
alpha = 0.1  # regularization strength
model = Lasso(alpha = alpha)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 1.0981567683188884e-05
R² Score: 0.16707617266738084
Coefficients: [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -4.17210939e-07 -0.00000000e+00]
Intercept: 0.018498612011919047


In [149]:
X= df[['1 Mo', '3 Mo', '6 Mo', '1 Yr', '2 Yr', '3 Yr', '5 Yr', '7 Yr', '10 Yr', '20 Yr', 'GDP', 'UNRATE', 'Lagged Income']]  # variables

y= df['Normalized Interest Income per Quarter'] #Independent variable PPNR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Lasso model
alpha = 0.01  # regularization strength
model = Lasso(alpha = alpha)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Mean Squared Error: 1.1909465552064904e-05
R² Score: 0.15174423674446602
Coefficients: [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -4.24818496e-07 -0.00000000e+00
  0.00000000e+00]
Intercept: 0.01872697996204714


## Modeling

In [None]:
sel_df = df.copy()
sel_df = sel_df.loc[:, ["RSSD ID", "Normalized Interest Income per Quarter", "1 Mo", "3 Mo", "6 Mo", "1 Yr", "2 Yr", "3 Yr", "5 Yr", "7 Yr", "10 Yr", "20 Yr", "GDP", "UNRATE" ]]
sel_df

In [None]:
sel_df.isnull().sum()

In [None]:
# Extract the 'Normalized Interest Income per Quarter' column
normalized_income = sel_df['Normalized Interest Income per Quarter']

# Fill missing values with the mean of the column
normalized_income_filled = normalized_income.fillna(normalized_income.mean())

# Update the original DataFrame
sel_df['Normalized Interest Income per Quarter'] = normalized_income_filled

In [None]:
sel_df.dropna(inplace=True)
sel_df.isnull().sum()

In [None]:
sel_df.shape

In [None]:
sel_df.head()

In [None]:
columns_to_include = ["1 Mo", "6 Mo", "3 Yr", "10 Yr", "GDP", "UNRATE"] # columns of rates, GDP, and unerate

X= sel_df[columns_to_include]

y= sel_df['Normalized Interest Income per Quarter']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')
print(f'Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')