In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
## Retrieve the data

In [3]:
# Read CSV
train_df = pd.read_csv(Path('data/2019loans.csv'))
test_df = pd.read_csv(Path('data/2020Q1loans.csv'))

train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk


In [4]:
# Convert categorical data to numeric and separate target feature for training data
y_train = train_df['target']

from sklearn.preprocessing import LabelEncoder
y_train_label = LabelEncoder().fit_transform(train_df['target'])

x_train = train_df.drop(columns = ['target'])

In [5]:
# Convert categorical data to numeric and separate target feature for testing data
y_test = test_df['target']
y_test_label = LabelEncoder().fit_transform(test_df['target'])
x_test = test_df.drop(columns = ['target'])

In [6]:
## Preprocessing: Convert categorical data to numeric

In [7]:
# Add missing dummy variables to testing set
x_train_dummies = pd.get_dummies(x_train)
x_test_dummies = pd.get_dummies(x_test)

In [8]:
# Fill in missing categories in the testing set
for column in x_train_dummies.columns:
    if column not in x_test_dummies.columns:
        x_test_dummies[column]=0

In [9]:
## Prediction - Logistic Regression vs. Random Forest (unscaled)

# The dataset we are using is highly categorical and non-linear. 
# Linear relationships are best used with logistic regression.
# Datasets with non-linear relationships operate best with random forest for a balance between precision and overfitting.
# The prediction would be that in this case, the random forest model will be more accurate compared to a logistic regression model.

In [10]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression()
classifier_lr.fit(x_train_dummies, y_train_label)

print(f"Training Data Score: {classifier_lr.score(x_train_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier_lr.score(x_test_dummies, y_test_label)}")

Training Data Score: 0.6528735632183909
Testing Data Score: 0.5095703955763505


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(random_state=1, n_estimators=500).fit(x_train_dummies, y_train_label)

print(f'Training Score: {classifier_rf.score(x_train_dummies, y_train_label)}')
print(f'Testing Score: {classifier_rf.score(x_test_dummies, y_test_label)}')

Training Score: 1.0
Testing Score: 0.646958740961293


In [12]:
## Results - Logistic Regression vs. Random Forest (unscaled)

# The random forest model yielded a score of 0.64 compared to the logistic regression model score of 0.51.
# The result was in line with the initial prediction of a random forest being more accurate in this particular dataset for the non-linear relationships.

In [13]:
# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train_dummies)
x_train_dummies_scaled = scaler.transform(x_train_dummies)
x_test_dummies_scaled = scaler.transform(x_test_dummies)

In [15]:
## Prediction - Logistic Regression vs. Random Forest (scaled)

# Random forest models do not require the data to be scaled and so will not see much of a change from the data being scaled.
# Logistic regression models will see an improved score from the scaling of the data.

In [16]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier_lr = LogisticRegression()
classifier_lr.fit(x_train_dummies_scaled, y_train_label)

print(f"Training Data Score: {classifier_lr.score(x_train_dummies_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier_lr.score(x_test_dummies_scaled, y_test_label)}")

Training Data Score: 0.710919540229885
Testing Data Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Train a Random Forest Classifier model on the scaled data and print the model score
classifier_rf = RandomForestClassifier(random_state=1, n_estimators=500).fit(x_train_dummies_scaled, y_train_label)

print(f'Training Score: {classifier_rf.score(x_train_dummies_scaled, y_train_label)}')
print(f'Testing Score: {classifier_rf.score(x_test_dummies_scaled, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6480221182475542


In [18]:
## Results - Logistic Regression vs. Random Forest (scaled)

# The random forest model yielded the same test score result as with the unscaled dataset.
# The logistic regression model shows a test score improvement of 0.25.
# The results are in line with the initial prediction that scaling has a significant impact on logistic regression models, with little to no impact on random forest models.
# With an unscaled dataset, the random forest was resulting in a higher test score compared to logistic regression.
# However, after scaling, logistic regression yielded a higher result than random forest, showing that a properly scaled data on a simpler model could be more effective than complex decision tree models like random forests.