In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.1240,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.1240,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,high_risk,n,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N
12176,354944,354944,15000.0,0.1774,540.34,RENT,50000.0,Verified,high_risk,n,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N
12177,354973,354973,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N
12178,355002,355002,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,high_risk,n,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N


In [4]:
test_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


## Categorical Columns 
home_ownership,
verification_status,
loan_status,
pymnt_plan,initial_list_status,application_type,hardship_flag,debt_settlement_flag

## Preprocessing: Convert categorical data to numeric

Create a training set from the 2019 loans using `pd.get_dummies()` to convert the categorical data to numeric columns. Similarly, create a testing set from the 2020 loans, also using `pd.get_dummies()`. Note! There are categories in the 2019 loans that do not exist in the testing set. If you fit a model to the training set and try to score it on the testing set as is, you will get an error. You need to use code to fill in the missing categories in the testing set. 

In [5]:
train_df = train_df.drop(['Unnamed: 0'],axis='columns')
train_df

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,20000.0,0.1240,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,3000.0,0.1240,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,high_risk,n,28.42,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N
12176,354944,15000.0,0.1774,540.34,RENT,50000.0,Verified,high_risk,n,23.43,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N
12177,354973,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,high_risk,n,28.80,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N
12178,355002,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,high_risk,n,11.44,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N


In [6]:
test_df = test_df.drop(['Unnamed: 0'],axis='columns')
test_df

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,11.52,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,15.74,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,26.81,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,38.51,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,16.36,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [7]:
#train_df.pymnt_plan.unique()
train_df['pymnt_plan']

0        n
1        n
2        n
3        n
4        n
        ..
12175    n
12176    n
12177    n
12178    n
12179    n
Name: pymnt_plan, Length: 12180, dtype: object

In [8]:
test_df.pymnt_plan.unique()

array(['n'], dtype=object)

In [9]:
YesNo_flag = {'Y':1,'N':0}
train_df2 = train_df.replace({'hardship_flag':YesNo_flag, 'debt_settlement_flag':YesNo_flag})

Home_ownership_val = {'ANY':0,'RENT':1,'MORTGAGE':2,'OWN':3}
train_df3 = train_df2.replace({'home_ownership':Home_ownership_val})

verification_val = {'Not Verified':0,'Source Verified':1,'Verified':1}
train_df4 = train_df3.replace({'verification_status':verification_val})

Loan_status_val = {'low_risk':1,'high_risk':0}
train_df5 = train_df4.replace({'loan_status':Loan_status_val})

Initial_list_status_val = {'w':0,'f':1}
train_df6 = train_df5.replace({'initial_list_status':Initial_list_status_val})

Application_Type_val = {'Individual':1,'Joint App':0}
train_df7 = train_df6.replace({'application_type':Application_Type_val})



In [10]:
train_df7

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,2,223000.0,0,1,n,29.99,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,141451,21000.0,0.1308,478.68,2,123000.0,1,1,n,11.26,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,321143,20000.0,0.1240,448.95,2,197000.0,1,1,n,11.28,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,11778,3000.0,0.1240,100.22,1,45000.0,0,1,n,18.08,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,169382,30000.0,0.1612,1056.49,2,133000.0,1,1,n,27.77,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,19975.0,0.2565,801.09,1,28000.0,0,0,n,28.42,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,0,0
12176,354944,15000.0,0.1774,540.34,1,50000.0,1,0,n,23.43,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,0,0
12177,354973,3600.0,0.1862,131.28,1,60000.0,0,0,n,28.80,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,0,0
12178,355002,15000.0,0.0881,475.68,2,62000.0,1,0,n,11.44,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,0,0


In [11]:
train_df8 = train_df7.drop(['pymnt_plan'],axis=1)

#df.drop(['A'], axis = 1)
train_df8

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,2,223000.0,0,1,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,141451,21000.0,0.1308,478.68,2,123000.0,1,1,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,321143,20000.0,0.1240,448.95,2,197000.0,1,1,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,11778,3000.0,0.1240,100.22,1,45000.0,0,1,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,169382,30000.0,0.1612,1056.49,2,133000.0,1,1,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,19975.0,0.2565,801.09,1,28000.0,0,0,28.42,0.0,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,0,0
12176,354944,15000.0,0.1774,540.34,1,50000.0,1,0,23.43,4.0,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,0,0
12177,354973,3600.0,0.1862,131.28,1,60000.0,0,0,28.80,0.0,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,0,0
12178,355002,15000.0,0.0881,475.68,2,62000.0,1,0,11.44,0.0,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,0,0


In [12]:
#train_df8 = train_df7.drop(['index','pymnt_plan'],axis='columns')

# file_path = Path('Resources/cleaned_2019_loan_data.csv')
train_df8.to_csv('Resources/cleaned_2019_loan_data.csv', index=False)

In [13]:
# Convert categorical data to numeric and separate target feature for training data
# One-hot encoding the entire dataframe
X_traindf_dummies = pd.get_dummies(train_df8)
print(X_traindf_dummies.columns)
X_traindf_dummies

Index(['index', 'loan_amnt', 'int_rate', 'installment', 'home_ownership',
       'annual_inc', 'verification_status', 'loan_status', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,2,223000.0,0,1,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,141451,21000.0,0.1308,478.68,2,123000.0,1,1,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,321143,20000.0,0.1240,448.95,2,197000.0,1,1,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,11778,3000.0,0.1240,100.22,1,45000.0,0,1,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,169382,30000.0,0.1612,1056.49,2,133000.0,1,1,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,19975.0,0.2565,801.09,1,28000.0,0,0,28.42,0.0,...,100.0,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,0,0
12176,354944,15000.0,0.1774,540.34,1,50000.0,1,0,23.43,4.0,...,90.5,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,0,0
12177,354973,3600.0,0.1862,131.28,1,60000.0,0,0,28.80,0.0,...,100.0,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,0,0
12178,355002,15000.0,0.0881,475.68,2,62000.0,1,0,11.44,0.0,...,100.0,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,0,0


## Categorical Columns
home_ownership, verification_status, loan_status, pymnt_plan,initial_list_status,application_type,hardship_flag,debt_settlement_flag

In [14]:
# Convert categorical data to numeric and separate target feature for testing data

YesNo_flag = {'Y':1,'N':0}
test_df2 = test_df.replace({'hardship_flag':YesNo_flag, 'debt_settlement_flag':YesNo_flag})

Home_ownership_val = {'ANY':0,'RENT':1,'MORTGAGE':2,'OWN':3}
test_df3 = test_df2.replace({'home_ownership':Home_ownership_val})

verification_val = {'Not Verified':0,'Source Verified':1,'Verified':1}
test_df4 = test_df3.replace({'verification_status':verification_val})

Loan_status_val = {'low_risk':1,'high_risk':0}
test_df5 = test_df4.replace({'loan_status':Loan_status_val})

Initial_list_status_val = {'w':0,'f':1}
test_df6 = test_df5.replace({'initial_list_status':Initial_list_status_val})

Application_Type_val = {'Individual':1,'Joint App':0}
test_df7 = test_df6.replace({'application_type':Application_Type_val})

In [15]:
test_df8 = test_df7.drop(['pymnt_plan'],axis=1)

#df.drop(['A'], axis = 1)
test_df8

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,40000.0,0.0819,814.70,2,140000.0,0,1,19.75,0.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,25429,6000.0,0.1524,208.70,1,55000.0,0,1,11.52,2.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,38496,3600.0,0.1695,128.27,1,42000.0,0,1,6.74,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,19667,20000.0,0.1524,478.33,1,100000.0,0,1,12.13,0.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,37505,3600.0,0.1240,120.27,1,50000.0,0,1,16.08,0.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,1,140480.0,1,0,15.74,0.0,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,0,0
4698,77291,24000.0,0.0756,747.22,1,50000.0,0,0,26.81,0.0,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,0,0
4699,77292,10000.0,0.2305,387.36,1,33000.0,1,0,38.51,0.0,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,0,0
4700,77297,8000.0,0.1862,205.86,1,38000.0,1,0,16.36,0.0,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,0,0


In [16]:
#train_df8 = train_df7.drop(['index','pymnt_plan'],axis='columns')

# file_path = Path('Resources/cleaned_2020_loan_data.csv')
test_df8.to_csv('Resources/cleaned_2020_loan_data.csv', index=False)

In [17]:
# Convert categorical data to numeric and separate target feature for testing data
# One-hot encoding the entire dataframe
X_testdf_dummies = pd.get_dummies(test_df)
print(X_testdf_dummies.columns)
X_testdf_dummies

Index(['index', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,1,0,1,1,0,1,0,1
1,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,1,0,1,1,0,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,1,0,1,1,0,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,1,0,1,1,0,1,0,1
4,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,...,1,0,1,0,1,1,0,1,0,1
4699,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,...,1,0,1,1,0,1,0,1,0,1
4700,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,...,1,0,1,0,1,1,0,1,0,1


In [18]:
# add missing dummy variables to testing set

X_train = train_df8.drop('loan_status', axis=1)
y_train = train_df8['loan_status'].values

X_test = test_df8.drop('loan_status', axis=1)
y_test = test_df8['loan_status'].values



In [19]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier


LogisticRegression()

In [20]:
clflog=classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print(f'Training Score for unscaled with Logistic Regression: {clflog.score(X_train, y_train)}')
print(f'Testing Score for unscaled with Logistic Regression: {clflog.score(X_test, y_test)}')

Training Score for unscaled with Logistic Regression: 0.6558292282430214
Testing Score for unscaled with Logistic Regression: 0.518290089323692


In [22]:
# Train a Random Forest Classifier model and print the model score

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score for unscaled with Random Forest Classifier: {clf.score(X_train, y_train)}')
print(f'Testing Score for unscaled with Random Forest Classifier: {clf.score(X_test, y_test)}')

Training Score for unscaled with Random Forest Classifier: 1.0
Testing Score for unscaled with Random Forest Classifier: 0.6631220757124627


In [23]:
# Scale the data

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Train the Logistic Regression model on the scaled data and print the model score

clflog=classifier.fit(X_train_scaled, y_train)

print(f'Training Score for scaled with Logistic Regression: {clflog.score(X_train_scaled, y_train)}')
print(f'Testing Score for scaled with Logistic Regression: {clflog.score(X_test_scaled, y_test)}')

Training Score for scaled with Logistic Regression: 0.7121510673234811
Testing Score for scaled with Logistic Regression: 0.7213951509995746


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Train a Random Forest Classifier model on the scaled data and print the model score

clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score for scaled with Random Forest Classifier: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score for scaled with Random Forest Classifier: {clf.score(X_test_scaled, y_test)}')

Training Score for scaled with Random Forest Classifier: 1.0
Testing Score for scaled with Random Forest Classifier: 0.6648234793704807
