In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Load the 2019 loans (used for training) and the 2020 Q1 loans (used as the later,
# out-of-sample test set). Paths are relative to the notebook's working directory.
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

### checking columns for the 2 dfs

In [3]:
# Capture the column labels and dtypes of both raw frames so they can be compared
# side by side in the next cell.
train_cols, train_types = train_df.columns, train_df.dtypes
test_cols, test_types = test_df.columns, test_df.dtypes

In [4]:
# Pair each training column (name, dtype) with the test column at the same position.
# zip() matches purely by position, so this is a visual check, not a join on column name.
df_compare = [*zip(train_cols, train_types, test_cols, test_types)]
df_compare

[('loan_amnt', dtype('float64'), 'loan_amnt', dtype('float64')),
 ('int_rate', dtype('float64'), 'int_rate', dtype('float64')),
 ('installment', dtype('float64'), 'installment', dtype('float64')),
 ('home_ownership', dtype('O'), 'home_ownership', dtype('O')),
 ('annual_inc', dtype('float64'), 'annual_inc', dtype('float64')),
 ('verification_status', dtype('O'), 'verification_status', dtype('O')),
 ('pymnt_plan', dtype('O'), 'pymnt_plan', dtype('O')),
 ('dti', dtype('float64'), 'dti', dtype('float64')),
 ('delinq_2yrs', dtype('float64'), 'delinq_2yrs', dtype('float64')),
 ('inq_last_6mths', dtype('float64'), 'inq_last_6mths', dtype('float64')),
 ('open_acc', dtype('float64'), 'open_acc', dtype('float64')),
 ('pub_rec', dtype('float64'), 'pub_rec', dtype('float64')),
 ('revol_bal', dtype('float64'), 'revol_bal', dtype('float64')),
 ('total_acc', dtype('float64'), 'total_acc', dtype('float64')),
 ('initial_list_status', dtype('O'), 'initial_list_status', dtype('O')),
 ('out_prncp', dtype(

### Convert categorical data to numeric and separate target feature for training data

In [5]:
# One-hot encode the categorical (object-dtype) columns of the 2019 data. The target is
# encoded as well, which is where `target_high_risk` / `target_low_risk` come from.
train_convert_df = pd.get_dummies(train_df)
train_convert_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,target_high_risk,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,1,0,1,0,1,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,1,0,1,0,1,0,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,1,1,0,1,0,1,0,0,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,1,0,1,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,1,0,1,0,1,0,0,1


In [6]:
# Keep the encoded training column labels for comparison with the encoded test columns.
train_con_cols = train_convert_df.columns

### Convert categorical data to numeric and separate target feature for testing data

In [7]:
# One-hot encode the 2020 Q1 data the same way. Dummy columns are only created for
# categories that occur in this frame, so it can lack columns the training frame has
# (the displayed output has no `debt_settlement_flag_Y`).
test_convert_df = pd.get_dummies(test_df)
test_convert_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,1,0,0,1,1,0,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,1,0,1,1,0,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,1,0,1,1,0,0,1,1,0,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,0,1,1,0,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,1,0,1,0,1,0,1


In [8]:
from itertools import zip_longest

# Pair the encoded training and test column labels by position to spot where they diverge.
# zip_longest (rather than zip) keeps the unmatched trailing label, paired with None, so a
# difference in column count shows up in the output instead of being silently truncated.
test_con_cols = test_convert_df.columns
converted_compare = list(zip_longest(train_con_cols, test_con_cols))
converted_compare

[('loan_amnt', 'loan_amnt'),
 ('int_rate', 'int_rate'),
 ('installment', 'installment'),
 ('annual_inc', 'annual_inc'),
 ('dti', 'dti'),
 ('delinq_2yrs', 'delinq_2yrs'),
 ('inq_last_6mths', 'inq_last_6mths'),
 ('open_acc', 'open_acc'),
 ('pub_rec', 'pub_rec'),
 ('revol_bal', 'revol_bal'),
 ('total_acc', 'total_acc'),
 ('out_prncp', 'out_prncp'),
 ('out_prncp_inv', 'out_prncp_inv'),
 ('total_pymnt', 'total_pymnt'),
 ('total_pymnt_inv', 'total_pymnt_inv'),
 ('total_rec_prncp', 'total_rec_prncp'),
 ('total_rec_int', 'total_rec_int'),
 ('total_rec_late_fee', 'total_rec_late_fee'),
 ('recoveries', 'recoveries'),
 ('collection_recovery_fee', 'collection_recovery_fee'),
 ('last_pymnt_amnt', 'last_pymnt_amnt'),
 ('collections_12_mths_ex_med', 'collections_12_mths_ex_med'),
 ('policy_code', 'policy_code'),
 ('acc_now_delinq', 'acc_now_delinq'),
 ('tot_coll_amt', 'tot_coll_amt'),
 ('tot_cur_bal', 'tot_cur_bal'),
 ('open_acc_6m', 'open_acc_6m'),
 ('open_act_il', 'open_act_il'),
 ('open_il_12m', '

### add missing dummy variables to testing set

In [9]:
# Add every dummy column that exists in the encoded training frame but not in the encoded
# test frame (for this data: `debt_settlement_flag_Y`). Such a column stands for a category
# that never occurs in the test data, so it is 0 for every row.
# Deriving the list from the training columns avoids hard-coding column names, and a
# constant 0 avoids inferring "Y" from "not N", which would mislabel any row whose N dummy
# is not 1 (e.g. a missing value).
missing_dummy_cols = [col for col in train_convert_df.columns if col not in test_convert_df.columns]
# assign() appends the new columns at the end and returns a new frame, so re-running this
# cell is a no-op once the columns exist.
test_convert_df = test_convert_df.assign(**{col: 0 for col in missing_dummy_cols})
test_convert_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk,debt_settlement_flag_Y
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,1,0,0,1,1,0,1,0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,1,0,1,0,1,0,1,0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,1,0,0,1,1,0,1,0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,0,1,1,0,1,0,1,0,1,0


## Consider the models...

### Which model will perform better: logistic regression or random forests classifier?

* I predict the Random Forest will outperform the Logistic Regression. I'm grounding this in 2 scholarly articles I
found online that said Random Forest models increasingly outperform Logistic Regression.

* https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2264-5
* https://scholar.smu.edu/cgi/viewcontent.cgi?article=1041&context=datasciencereview

## Train the Logistic Regression model on the unscaled data and print the model score

In [10]:
# Re-display the encoded training frame before splitting it into features and target.
train_convert_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y,target_high_risk,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,1,0,1,0,1,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,1,0,1,0,1,0,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,1,1,0,1,0,1,0,0,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,1,0,1,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,1,0,1,0,1,0,0,1


In [11]:
# Build the training feature matrix and target vector.
# X: every encoded column except the two one-hot target columns (2-D DataFrame).
# y: the `target_high_risk` indicator column (1-D Series).

y = train_convert_df['target_high_risk']   # outcome variable from training data
X = train_convert_df.drop(columns=['target_high_risk', 'target_low_risk'])

print("Shape: ", X.shape, y.shape)

Shape:  (12180, 92) (12180,)


In [12]:
# 2020 Q1 data that we are trying to predict
# Reindex the features to the training columns so X_20 has exactly the columns the models
# are fitted on, in the same order: a dummy column absent from 2020 Q1 is filled with 0 and
# a column that never appeared in training is dropped. Without this, alignment depends on
# the two encoded frames happening to share the same column order.
X_20 = (
    test_convert_df
    .drop(['target_high_risk', 'target_low_risk'], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)
y_20 = test_convert_df['target_high_risk']   # outcome variable from the 2020 Q1 (testing) data

print("Shape: ", X_20.shape, y_20.shape)

Shape:  (4702, 92) (4702,)


In [13]:
from sklearn.model_selection import train_test_split
# Hold out part of the 2019 data (default split proportions) as an in-sample test set;
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [14]:
# With the default iteration limit the solver kept reporting that max iterations should be
# increased on these unscaled features, so max_iter was raised to 20k.
from sklearn.linear_model import LogisticRegression
classifier= LogisticRegression(max_iter=20000)
classifier

LogisticRegression(max_iter=20000)

In [15]:
# Fit the logistic regression on the unscaled 2019 training split.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=20000)

In [16]:
# Report the unscaled logistic regression's score on the training split, the held-out
# 2019 split, and the 2020 Q1 data (one line each).
print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score (from training set): {classifier.score(X_test, y_test)}",
    f'First Quarter 2020 fit: {classifier.score(X_20, y_20)}',
    sep="\n",
)

Training Data Score: 0.7094690749863164
Testing Data Score (from training set): 0.69688013136289
First Quarter 2020 fit: 0.5546575925138238


In [17]:
# Store the unscaled logistic-regression scores for the summary table built later.
train_logreg, test_logreg, Q1_2020_logreg = (
    classifier.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test), (X_20, y_20))
)

## Train a Random Forest Classifier model on the unscaled data and print the model score

In [18]:
from sklearn.ensemble import RandomForestClassifier

* not entirely sure what hyperparameters I would change

In [19]:
# Fit a 500-tree random forest (seeded so the result is reproducible) on the unscaled
# training split, then report its score on all three data sets.
clf = RandomForestClassifier(n_estimators=500, random_state=7)
clf.fit(X_train, y_train)
print(
    f"Training Data Score: {clf.score(X_train, y_train)}",
    f"Testing Data Score (from training set): {clf.score(X_test, y_test)}",
    f'First Quarter 2020 fit: {clf.score(X_20, y_20)}',
    sep="\n",
)

Training Data Score: 1.0
Testing Data Score (from training set): 0.7878489326765189
First Quarter 2020 fit: 0.6461080391322841


In [20]:
# Store the unscaled random-forest scores for the summary table built later.
train_clf, test_clf, Q1_2020_clf = (
    clf.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test), (X_20, y_20))
)

## Results
* As predicted, the random forest model outperformed the logistic regression model. 
* Logistic regression had Training Score: 0.71, Testing Score (from training set): 0.70, and, more importantly, First Quarter 2020 fit: 0.55
* Random forest had Training Score: 1.0, Testing Score (from training set): 0.79, and First Quarter 2020 fit: 0.65

## Revisit Preprocessing: Scale the data
* I predict that scaling the data will improve both models.

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Fit the scaler on the training split only, then apply that same transformation to the
# training split, the held-out 2019 split, and the 2020 Q1 data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled, X_20_scaled = (
    scaler.transform(features) for features in (X_train, X_test, X_20)
)

## Train the Logistic Regression model on the scaled data and print the model score

In [23]:
# Refit the existing `classifier` object on the scaled training split. This replaces the
# unscaled fit in place, so from here on `classifier` must be scored on scaled inputs.
classifier.fit(X_train_scaled, y_train)
print(
    f"Training Data Score: {classifier.score(X_train_scaled, y_train)}",
    f"Testing Data Score (from training set): {classifier.score(X_test_scaled, y_test)}",
    f'First Quarter 2020 fit: {classifier.score(X_20_scaled, y_20)}',
    sep="\n",
)

Training Data Score: 0.7139573070607553
Testing Data Score (from training set): 0.7004926108374384
First Quarter 2020 fit: 0.753721820501914


In [24]:
# Store the scaled logistic-regression scores for the summary table built later.
train_logreg_scaled, test_logreg_scaled, Q1_2020_logreg_scaled = (
    classifier.score(features, labels)
    for features, labels in (
        (X_train_scaled, y_train),
        (X_test_scaled, y_test),
        (X_20_scaled, y_20),
    )
)

## Train a Random Forest Classifier model on the scaled data and print the model score

In [25]:
# Train a fresh 500-tree random forest (same seed as before) on the scaled training split.
# `clf` is rebound here, so it now refers to the scaled-data model.
clf = RandomForestClassifier(n_estimators=500, random_state=7)
clf.fit(X_train_scaled, y_train)
print(
    f"Training Data Score: {clf.score(X_train_scaled, y_train)}",
    f"Testing Data Score (from training set): {clf.score(X_test_scaled, y_test)}",
    f'First Quarter 2020 fit: {clf.score(X_20_scaled, y_20)}',
    sep="\n",
)

Training Data Score: 1.0
Testing Data Score (from training set): 0.7878489326765189
First Quarter 2020 fit: 0.6452573373032752


In [26]:
# Store the scaled random-forest scores for the summary table built in the next cell.
train_clf_scaled, test_clf_scaled, Q1_2020_clf_scaled = (
    clf.score(features, labels)
    for features, labels in (
        (X_train_scaled, y_train),
        (X_test_scaled, y_test),
        (X_20_scaled, y_20),
    )
)

In [27]:
# Collect every stored score into one table: one row per data set, one column per
# model / preprocessing combination.
results = {
    'data_set': ['2019_training', '2019_test', '2020_Q1'],
    'LogReg_Unscaled': [train_logreg, test_logreg, Q1_2020_logreg],
    'RandomForest_Unscaled': [train_clf, test_clf, Q1_2020_clf],
    'LogReg_Scaled': [train_logreg_scaled, test_logreg_scaled, Q1_2020_logreg_scaled],
    'RandomForest_Scaled': [train_clf_scaled, test_clf_scaled, Q1_2020_clf_scaled],
}
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,data_set,LogReg_Unscaled,RandomForest_Unscaled,LogReg_Scaled,RandomForest_Scaled
0,2019_training,0.709469,1.0,0.713957,1.0
1,2019_test,0.69688,0.787849,0.700493,0.787849
2,2020_Q1,0.554658,0.646108,0.753722,0.645257


## Wrap-up
* Scaling the data did not meaningfully improve model fit for the training or 2019 test data. It did, however, significantly improve the logistic regression fit on the 2020_Q1 data, which jumped from 0.55 to 0.75 (higher than the model scored on the 2019 data it was trained on). The random forest scores were so similar with and without scaling that I checked multiple times to make sure I had modified the code to show the correct values; this is expected, because tree-based models split on per-feature thresholds and are therefore largely insensitive to feature scaling.
