Base models are first fitted without a grid search to get a sense of which ones work best with the imbalanced dataset. Each model is fitted on several variants of the dataset. The candidate models are then down-selected and tuned more thoroughly through grid searches before settling on a final model.

In [2]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Data Prep

## Load data

In [3]:
# CSV extracts hosted on data.sba.gov; judging by the file names they cover
# FY2020-present, FY2010-FY2019 and FY2000-FY2009 (all "as of 220331").
sba_csv_urls = [
    'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/c71ba6cf-b4e0-4e60-98f0-48aeaf4c6460/download/foia-7afy2020-present-asof-220331.csv',
    'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/02e2e83a-2af1-4ce8-91db-85e20ffadbf7/download/foia-7afy2010-fy2019-asof-220331.csv',
    'https://data.sba.gov/dataset/0ff8e8e9-b967-4f4e-987c-6ac78c575087/resource/95d6972c-38df-4ecf-8f0a-eed224e8b02c/download/foia-7afy2000-fy2009-asof-220331.csv',
]

# Every file is decoded as ISO-8859-1 and parsed with low_memory=False.
data1, data2, data3 = [
    pd.read_csv(csv_url, encoding="ISO-8859-1", low_memory=False)
    for csv_url in sba_csv_urls
]

# Stack the three extracts; ignore_index=True already gives a fresh 0..n-1 index.
data = pd.concat([data1, data2, data3], ignore_index=True)

In [4]:
# Keep only loans with a final outcome: PIF (paid in full) or CHGOFF (charged off).
# Loans that are still being paid back are not of interest here.
has_final_outcome = data['LoanStatus'].isin(['PIF', 'CHGOFF'])
df = data.loc[has_final_outcome].drop_duplicates().reset_index(drop=True)

# Encode the outcome as an integer target: PIF -> 0, CHGOFF -> 1
df['LoanStatus'] = df['LoanStatus'].map({'PIF': 0, 'CHGOFF': 1}).astype(int)

In [26]:
# Summarise the filtered loan frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916423 entries, 0 to 916422
Data columns (total 36 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   AsOfDate               916423 non-null  int64  
 1   Program                916423 non-null  object 
 2   BorrName               916407 non-null  object 
 3   BorrStreet             916403 non-null  object 
 4   BorrCity               916413 non-null  object 
 5   BorrState              916413 non-null  object 
 6   BorrZip                916423 non-null  int64  
 7   BankName               916423 non-null  object 
 8   BankStreet             916387 non-null  object 
 9   BankCity               916387 non-null  object 
 10  BankState              916381 non-null  object 
 11  BankZip                916387 non-null  object 
 12  GrossApproval          916413 non-null  float64
 13  SBAGuaranteedApproval  916413 non-null  float64
 14  ApprovalDate           916423 non-nu

In [28]:
# ind = 1 where InitialInterestRate is recorded, 0 where it is missing
temp = df.assign(ind=df['InitialInterestRate'].notna().astype('int64'))

# Non-null JobsSupported counts per approval fiscal year, split by the indicator,
# to see in which years the interest rate is available
temp.groupby(['ApprovalFiscalYear', 'ind'])['JobsSupported'].count()

ApprovalFiscalYear  ind
2000                0      37279
2001                0      36990
2002                0      44756
2003                0      58556
2004                0      70459
2005                0      83283
2006                0      85258
2007                0      86781
2008                0      59826
2009                0       3857
                    1      31402
2010                1      38134
2011                1      42842
2012                1      35173
2013                1      34487
2014                1      37559
2015                1      38787
2016                1      34596
2017                1      25235
2018                1      17465
2019                1       8630
2020                1       3907
2021                1       1123
2022                1         38
Name: JobsSupported, dtype: int64

## Create core dataset (interest rate included)

In [24]:
# Target plus candidate features; rows missing any of these fields are dropped
core_fields = ['LoanStatus', 'BankName', 'RevolverStatus', 'TermInMonths', 'InitialInterestRate',
               'ApprovalFiscalYear', 'NaicsCode', 'subpgmdesc', 'BankState', 'BorrState']
core = df[core_fields].copy().dropna()

# 1 when the bank and the borrower are located in the same state, otherwise 0
core['BorrBankSameState'] = (core['BankState'] == core['BorrState']).astype('int64')

# Per-bank totals over all rows of df: number of loans and number of charge-offs
# (LoanStatus is 0/1 at this point, so its sum is the charge-off count).
# NOTE(review): these figures are derived from LoanStatus itself, so if LoanStatus is
# the modelling target each row's own outcome feeds into its bank-level rate -
# confirm this is intended before splitting into train/test.
df_BankDefaults = (
    df.groupby('BankName')['LoanStatus']
    .agg(TotLoanCounts='count', TotDefaultCounts='sum')
    .reset_index()
)
bank_level_fields = ['LoanStatus', 'BankName', 'RevolverStatus', 'TermInMonths', 'InitialInterestRate',
                     'BorrBankSameState', 'ApprovalFiscalYear', 'NaicsCode', 'subpgmdesc']
core = core[bank_level_fields].merge(df_BankDefaults, how='inner', on='BankName')
core['TotPctDefault'] = core['TotDefaultCounts'] / core['TotLoanCounts']

# The same two counts, broken down per bank and approval fiscal year
bank_year_keys = ['BankName', 'ApprovalFiscalYear']
df_BankDefaultsYearly = (
    df.groupby(bank_year_keys)['LoanStatus']
    .agg(YearlyLoanCounts='count', YearlyDefaultCounts='sum')
    .reset_index()
)
bank_year_level_fields = ['LoanStatus', 'BankName', 'ApprovalFiscalYear', 'RevolverStatus', 'TermInMonths',
                          'InitialInterestRate', 'BorrBankSameState', 'TotPctDefault', 'TotDefaultCounts',
                          'TotLoanCounts', 'NaicsCode', 'subpgmdesc']
core = core[bank_year_level_fields].merge(df_BankDefaultsYearly, how='inner', on=bank_year_keys)
core['YearlyPctDefault'] = core['YearlyDefaultCounts'] / core['YearlyLoanCounts']


## Create features to test

In [29]:
# Small feature set: a single 0/1 flag for the 'Community Express' sub-program
core_small = core.copy()
core_small['subpgmdesc_community_express'] = (core_small['subpgmdesc'] == 'Community Express').astype('int64')

# Large feature set: one indicator column per sub-program description (first level dropped).
# NOTE: get_dummies adds its own '_' separator after the prefix, so the generated
# columns are named 'subpgmdesc__<value>' (double underscore).
subpgm_dummies = pd.get_dummies(core['subpgmdesc'], prefix='subpgmdesc_', drop_first=True)
core_large = core.drop(columns='subpgmdesc').join(subpgm_dummies, how='inner')

In [30]:
# core_small: NAICS information is left out entirely
core_small = core_small.drop(columns='NaicsCode')

# core_large: reduce every NAICS code to the integer formed by its first two characters
# (presumably the 2-digit NAICS sector), then one-hot encode it with the first level dropped
naics_two_digit = core_large['NaicsCode'].astype(str).str.slice(stop=2).astype(int)
naics_dummies = pd.get_dummies(naics_two_digit, prefix='NaicsCode_', drop_first=True)
core_large = core_large.drop(columns='NaicsCode').join(naics_dummies, how='inner')

In [31]:
# core_small: only approval fiscal years 2006 and 2007 get their own 0/1 indicator
for flagged_year in (2006, 2007):
    core_small[f'Year{flagged_year}'] = (core_small['ApprovalFiscalYear'] == flagged_year).astype('int64')

In [32]:
# core_large: one indicator column per approval fiscal year (first year dropped)
year_dummies = pd.get_dummies(core_large['ApprovalFiscalYear'], prefix='ApprovalFiscalYear_', drop_first=True)
core_large = core_large.drop(columns='ApprovalFiscalYear').join(year_dummies, how='inner')

## Create core dataset (without interest rate)

In [36]:
# Same selection as the interest-rate dataset but without InitialInterestRate, so rows
# are only dropped when one of the remaining fields is missing
core_no_int_fields = ['LoanStatus', 'BankName', 'RevolverStatus', 'TermInMonths',
                      'ApprovalFiscalYear', 'NaicsCode', 'subpgmdesc', 'BankState', 'BorrState']
core_no_int = df[core_no_int_fields].copy().dropna()

# 1 when the bank and the borrower are located in the same state, otherwise 0
core_no_int['BorrBankSameState'] = (core_no_int['BankState'] == core_no_int['BorrState']).astype('int64')

# Per-bank totals over all rows of df: number of loans and number of charge-offs
# (LoanStatus is 0/1 at this point, so its sum is the charge-off count).
# NOTE(review): these figures are derived from LoanStatus itself, so if LoanStatus is
# the modelling target each row's own outcome feeds into its bank-level rate -
# confirm this is intended before splitting into train/test.
df_BankDefaults = (
    df.groupby('BankName')['LoanStatus']
    .agg(TotLoanCounts='count', TotDefaultCounts='sum')
    .reset_index()
)
no_int_bank_level_fields = ['LoanStatus', 'BankName', 'RevolverStatus', 'TermInMonths',
                            'BorrBankSameState', 'ApprovalFiscalYear', 'NaicsCode', 'subpgmdesc']
core_no_int = core_no_int[no_int_bank_level_fields].merge(df_BankDefaults, how='inner', on='BankName')
core_no_int['TotPctDefault'] = core_no_int['TotDefaultCounts'] / core_no_int['TotLoanCounts']

# The same two counts, broken down per bank and approval fiscal year
no_int_bank_year_keys = ['BankName', 'ApprovalFiscalYear']
df_BankDefaultsYearly = (
    df.groupby(no_int_bank_year_keys)['LoanStatus']
    .agg(YearlyLoanCounts='count', YearlyDefaultCounts='sum')
    .reset_index()
)
no_int_bank_year_level_fields = ['LoanStatus', 'BankName', 'ApprovalFiscalYear', 'RevolverStatus',
                                 'TermInMonths', 'BorrBankSameState', 'TotPctDefault', 'TotDefaultCounts',
                                 'TotLoanCounts', 'NaicsCode', 'subpgmdesc']
core_no_int = core_no_int[no_int_bank_year_level_fields].merge(
    df_BankDefaultsYearly, how='inner', on=no_int_bank_year_keys
)
core_no_int['YearlyPctDefault'] = core_no_int['YearlyDefaultCounts'] / core_no_int['YearlyLoanCounts']

## Create features to test

In [42]:
# Small feature set (no interest rate): a single 0/1 flag for the 'Community Express' sub-program
core_no_int_small = core_no_int.copy()
core_no_int_small['subpgmdesc_community_express'] = (
    core_no_int_small['subpgmdesc'] == 'Community Express'
).astype('int64')

# Large feature set (no interest rate): one indicator column per sub-program description
# (first level dropped).
# NOTE: get_dummies adds its own '_' separator after the prefix, so the generated
# columns are named 'subpgmdesc__<value>' (double underscore).
no_int_subpgm_dummies = pd.get_dummies(core_no_int['subpgmdesc'], prefix='subpgmdesc_', drop_first=True)
core_no_int_large = core_no_int.drop(columns='subpgmdesc').join(no_int_subpgm_dummies, how='inner')

In [43]:
# core_no_int_small: NAICS information is left out entirely
core_no_int_small = core_no_int_small.drop(columns='NaicsCode')

# core_no_int_large: reduce every NAICS code to the integer formed by its first two characters
# (presumably the 2-digit NAICS sector), then one-hot encode it with the first level dropped
no_int_naics_two_digit = core_no_int_large['NaicsCode'].astype(str).str.slice(stop=2).astype(int)
no_int_naics_dummies = pd.get_dummies(no_int_naics_two_digit, prefix='NaicsCode_', drop_first=True)
core_no_int_large = core_no_int_large.drop(columns='NaicsCode').join(no_int_naics_dummies, how='inner')

In [44]:
# core_no_int_small: only approval fiscal years 2006 and 2007 get their own 0/1 indicator
for no_int_flagged_year in (2006, 2007):
    core_no_int_small[f'Year{no_int_flagged_year}'] = (
        core_no_int_small['ApprovalFiscalYear'] == no_int_flagged_year
    ).astype('int64')

In [45]:
# core_no_int_large: one indicator column per approval fiscal year (first year dropped)
no_int_year_dummies = pd.get_dummies(
    core_no_int_large['ApprovalFiscalYear'], prefix='ApprovalFiscalYear_', drop_first=True
)
core_no_int_large = core_no_int_large.drop(columns='ApprovalFiscalYear').join(no_int_year_dummies, how='inner')

# Models

In [None]:
Random Forest

In [None]:
BalancedRandomForestClassifier is another ensemble method in which each tree of the forest is given a balanced bootstrap sample.

In [None]:
RUSBoostClassifier randomly under-samples the dataset before performing each boosting iteration.

In [None]:
XGBoost

In [None]:
LASSO

In [None]:
SVM

In [None]:
Regularization models

In [None]:
Logistic regression