In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [4]:
train_df.isnull().sum()>0

Unnamed: 0                    False
index                         False
loan_amnt                     False
int_rate                      False
installment                   False
                              ...  
total_bal_ex_mort             False
total_bc_limit                False
total_il_high_credit_limit    False
hardship_flag                 False
debt_settlement_flag          False
Length: 86, dtype: bool

In [5]:
train_df['loan_status'].value_counts()
test_df['debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [6]:
test_df

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [7]:
test_df['loan_status'].value_counts()

high_risk    2351
low_risk     2351
Name: loan_status, dtype: int64

In [8]:
# Convert categorical data to numeric and separate target feature for training data
train_num_df = train_df.drop(columns=['Unnamed: 0', 'index', 'loan_status'], axis=1)
train_num_df = pd.get_dummies(train_num_df, drop_first=True)
#train_num_df.info()

In [9]:
# Convert categorical data to numeric and separate target feature for testing data
test_num_df = test_df.drop(columns=['Unnamed: 0', 'index', 'loan_status'], axis=1)
test_num_df = pd.get_dummies(test_num_df, drop_first=True)
#test_num_df.info()

In [10]:
# add missing dummy variables to testing set
test_num_df['debt_settlement_flag_Y'] = 0
#test_num_df.info()
#test_num_df['debt_settlement_flag_Y']

In [11]:
X_train = train_num_df
y_train = train_df['loan_status']
X_test = test_num_df
y_test = test_df['loan_status']

In [12]:
print(X_train['debt_settlement_flag_Y'].sum())
print(X_test['debt_settlement_flag_Y'].sum())

5
0


In [13]:
print(len(y_train))
print(len(y_test))

12180
4702


I would expect Random Forest Classifier to perform better with the unscaled data. This is due to the way that the decision-making is done using Random Forest Classifier and making the best decision at each juncture.

In [14]:
# Train the Logistic Regression model on the unscaled data and print the model score

classifier = LogisticRegression(max_iter=13000)
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6981937602627257
Testing Data Score: 0.5725223309230115


In [15]:
# Train a Random Forest Classifier model and print the model score

clf = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6354742662696725


As expected the Random Forest Classifier outperformed Logistic Regression for the unscaled Data.

In [16]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

I expect Logistic Regression to outperform Random Forest Classifier with the scaled data. Logistic Regression tends to perform better when the data is scaled compared to Random Forest Classifier.

In [17]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier_s = LogisticRegression(max_iter=13000)
classifier_s.fit(X_train_scaled, y_train)
print(f"Scaled Training Data Score: {classifier_s.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {classifier_s.score(X_test_scaled, y_test)}")
#train_score = classifier.score(X_train_scaled, y_train)
#test_score = classifier.score(X_test_scaled, y_test)
#print(f"Train Score: {train_score:.3f}")
#print(f"Test Score: {test_score:.3f}")

Scaled Training Data Score: 0.7078817733990148
Scaled Testing Data Score: 0.7679710761378137


In [18]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf_s = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Scaled Training Score: {clf_s.score(X_train_scaled, y_train)}')
print(f'Scaled Testing Score: {clf_s.score(X_test_scaled, y_test)}')

Scaled Training Score: 1.0
Scaled Testing Score: 0.6354742662696725


As expected the Logistic Regression outperformed Random Forest Classifier with the scaled data.