Base models (without a grid search) are fitted first to get a sense of which work best with the imbalanced dataset. Each model is fitted on several variations of the dataset. The models are then down-selected and tuned further through grid searches before settling on a final model.

In [2]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Data Prep

## Load data

In [3]:
# SBA 7(a) FOIA loan extracts (files stamped "asof-220331"), one CSV per
# fiscal-year range: FY2020-present, FY2010-FY2019 and FY2000-FY2009.
# Every file is read with the same options, so the reads share one expression.
data1, data2, data3 = (
    pd.read_csv(csv_url, encoding="ISO-8859-1", low_memory=False)
    for csv_url in (
        'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/c71ba6cf-b4e0-4e60-98f0-48aeaf4c6460/download/foia-7afy2020-present-asof-220331.csv',
        'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/02e2e83a-2af1-4ce8-91db-85e20ffadbf7/download/foia-7afy2010-fy2019-asof-220331.csv',
        'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/95d6972c-38df-4ecf-8f0a-eed224e8b02c/download/foia-7afy2000-fy2009-asof-220331.csv',
    )
)

# Stack the three extracts; ignore_index=True already yields a fresh 0..n-1 index.
data = pd.concat([data1, data2, data3], ignore_index=True)

In [4]:
# Keep only loans with a final outcome: PIF (paid in full) or CHGOFF (charged off).
# Loans that are still being paid back are not of interest here.
df = (
    data[data['LoanStatus'].isin(['PIF', 'CHGOFF'])]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Encode the target as an integer: PIF -> 0, CHGOFF -> 1
df['LoanStatus'] = df['LoanStatus'].map({'PIF': 0, 'CHGOFF': 1}).astype(int)

## Create core dataset

In [9]:
core = df.copy()

# Flag loans where the bank and the borrower are in the same state (1) or not (0)
core['BorrBankSameState'] = (core['BankState'] == core['BorrState']).astype('int64')

# NOTE(review): the per-bank rates below are computed from LoanStatus over every
# row of df, so each loan's own outcome is part of the rate attached to it.
# Consider deriving them from the training split only to avoid target leakage.

# Per bank: total number of loans, total number of defaults, and share of defaults
df_BankDefaults = (
    df.groupby('BankName')['LoanStatus']
    .agg(TotLoanCounts='count', TotDefaultCounts='sum')
    .reset_index()
)
base_cols = ['LoanStatus', 'BankName', 'RevolverStatus', 'TermInMonths', 'InitialInterestRate',
             'BorrBankSameState', 'ApprovalFiscalYear', 'NaicsCode', 'subpgmdesc']
core = core[base_cols].merge(df_BankDefaults, how='inner', on='BankName')
core['TotPctDefault'] = core['TotDefaultCounts'] / core['TotLoanCounts']

# Per bank and approval fiscal year: number of loans, number of defaults, and share of defaults
bank_year = ['BankName', 'ApprovalFiscalYear']
df_BankDefaultsYearly = (
    df.groupby(bank_year)['LoanStatus']
    .agg(YearlyLoanCounts='count', YearlyDefaultCounts='sum')
    .reset_index()
)
ordered_cols = ['LoanStatus', 'BankName', 'ApprovalFiscalYear', 'RevolverStatus', 'TermInMonths',
                'InitialInterestRate', 'BorrBankSameState', 'TotPctDefault', 'TotDefaultCounts',
                'TotLoanCounts', 'NaicsCode', 'subpgmdesc']
core = core[ordered_cols].merge(df_BankDefaultsYearly, how='inner', on=bank_year)
core['YearlyPctDefault'] = core['YearlyDefaultCounts'] / core['YearlyLoanCounts']



In [10]:
core.head()

Unnamed: 0,LoanStatus,BankName,ApprovalFiscalYear,RevolverStatus,TermInMonths,InitialInterestRate,BorrBankSameState,TotPctDefault,TotDefaultCounts,TotLoanCounts,NaicsCode,subpgmdesc,YearlyLoanCounts,YearlyDefaultCounts,YearlyPctDefault
0,0,"JPMorgan Chase Bank, National Association",2020,0,84,9.75,0,0.220983,13554,61335,445110.0,FA$TRK (Small Loan Express),106,6,0.056604
1,0,"JPMorgan Chase Bank, National Association",2020,1,120,11.4,0,0.220983,13554,61335,621399.0,FA$TRK (Small Loan Express),106,6,0.056604
2,0,"JPMorgan Chase Bank, National Association",2020,0,84,7.0,0,0.220983,13554,61335,236118.0,FA$TRK (Small Loan Express),106,6,0.056604
3,0,"JPMorgan Chase Bank, National Association",2020,1,120,7.4,0,0.220983,13554,61335,541512.0,FA$TRK (Small Loan Express),106,6,0.056604
4,0,"JPMorgan Chase Bank, National Association",2020,0,126,7.11,0,0.220983,13554,61335,722511.0,Guaranty,106,6,0.056604


## Create necessary features

In [12]:
# Two encodings of the loan sub-program. Both are derived from `core` itself:
# `core` comes out of column-keyed merges, so its index is a fresh 0..n-1 range
# that may not line up row-for-row with `df`'s index.

# Option 1 - core_small: a single 0/1 indicator for the Community Express sub-program.
# NOTE(review): assumes the sub-program label contains "Community Express" —
# confirm against core['subpgmdesc'].unique().
core_small = core.drop('subpgmdesc', axis=1)
core_small['subpgmdesc_community_express'] = (
    core['subpgmdesc'].str.contains('Community Express', case=False, na=False).astype(int)
)

# Option 2 - core_large: one dummy column per sub-program value.
core_large = pd.merge(core.drop('subpgmdesc', axis=1), pd.get_dummies(core['subpgmdesc'], prefix = 'subpgmdesc_'), how='inner', left_index = True, right_index = True)

In [None]:
# Target plus one dummy column per sub-program value.
# Two options: 1) keep all sub-program dummies, 2) keep only the Community Express one.
df_subpgmdesc = pd.merge(df[['LoanStatus']], pd.get_dummies(df['subpgmdesc'], prefix = 'subpgmdesc_'), how='inner', left_index = True, right_index = True)

In [None]:
# Target plus one dummy column per NAICS code.
# Built from `df` (as the sibling dummy cells are) so the cell runs on a fresh kernel.
# NOTE(review): if a pre-processed NAICS frame was intended as the input, define it
# explicitly in an earlier cell and use it here instead.
df_NaicsCode = pd.merge(df[['LoanStatus']], pd.get_dummies(df['NaicsCode'], prefix = 'NaicsCode_'), how='inner', left_index = True, right_index = True)

In [None]:
# Target plus one dummy column per approval fiscal year.
# Options: keep only the 2006 and 2007 dummies, or keep all years.
df_Year = pd.merge(df[['LoanStatus']], pd.get_dummies(df['ApprovalFiscalYear'], prefix = 'ApprovalFiscalYear_'), how='inner', left_index = True, right_index = True)

In [None]:
# Column names noted here (presumably other features to carry over; not used in this cell):
# 'RevolverStatus', 'TermInMonths', 'InitialInterestRate', 'BorrBankSameState'

# Per bank and approval fiscal year: loan count and default count
yearly_keys = ['BankName', 'ApprovalFiscalYear']
yearly_stats = (
    df.groupby(yearly_keys)['LoanStatus']
    .agg(LoanCounts='count', DefaultCounts='sum')
    .reset_index()
)

# Attach the yearly stats to every loan and derive the yearly share of defaults
df_BankDefaultsYearly = df[['LoanStatus', 'BankName', 'ApprovalFiscalYear']].merge(
    yearly_stats, how='inner', on=yearly_keys
)
df_BankDefaultsYearly['PctDefault'] = (
    df_BankDefaultsYearly['DefaultCounts'] / df_BankDefaultsYearly['LoanCounts']
)

In [None]:
Random Forest

In [None]:
BalancedRandomForestClassifier is another ensemble method in which each tree of the forest will be provided a balanced bootstrap sample

In [None]:
RUSBoostClassifier randomly under-samples the dataset before performing a boosting iteration

In [None]:
XGBOOST

In [None]:
LASSO

In [None]:
SVM, 

In [None]:
Regularized models

In [None]:
LOGISTIC REG