In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
X_test = test_df.drop('target', axis=1)
X_train = train_df.drop('target', axis=1)

In [4]:
encoder = LabelEncoder().fit(train_df["target"])
y_train = encoder.transform(train_df["target"])
y_test = encoder.transform(test_df['target'])

In [5]:
# add missing dummy variables to testing set
X_train_dummies = pd.get_dummies(X_train)
X_test_dummies = pd.get_dummies(X_test)

In [6]:
# Finding the missing column
for col in X_train_dummies.columns:
    if col not in X_test_dummies.columns:
        print(col)

debt_settlement_flag_Y


In [7]:
#Adding the missing column
X_test_dummies['debt_settlement_flag_Y'] = 0
X_test_dummies = X_test_dummies[X_train_dummies.columns]

# Prediction (Unscaled)
For this specific case, I believe that there will be several contributing factors to determine if our target will be high-risk or low-risk. And because of that, I think that the Logistic Regression model will perform better and will either be overfit slightly or not at all. this is because it will be better at taking all of the variables to create a more accurate test model. I think the Random Forest Classifier will also perform well, but not as well as the Logistic Regression model.

In [8]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_dummies, y_train)
print(f"Training Data Score: {lr.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {lr.score(X_test_dummies, y_test)}")

Training Data Score: 0.65311986863711
Testing Data Score: 0.5076563164610803


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=350, max_depth=3)
rf.fit(X_train_dummies, y_train)
print(f"Training Data Score: {rf.score(X_train_dummies, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_dummies, y_test)}")

Training Data Score: 0.7405582922824302
Testing Data Score: 0.620374308804764


# Analysis (Unscaled)
To my surprise, the RFC model actually performed slightly better, and was less overfit than the LR model. I did not expect this result but I believe that it could potentially be due to the data being unscaled. It will be interesting to see how that changes our results. 

In [10]:
# Scale the data
scaler = StandardScaler().fit(X_train_dummies)

# Use the scaler on X_train and X_test
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

# Prediction (Scaled)
It is my thought that after scaling the data, we will see significant improvements in the testing score of both models and they will no longer be overfit. But i still think that the Logistic Regression model will perform better overall. 

In [11]:
# Train the Logistic Regression model on the scaled data and print the model score
lr.fit(X_train_scaled, y_train)
print(f"Training Data Score: {lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr.score(X_test_scaled, y_test)}")

Training Data Score: 0.710919540229885
Testing Data Score: 0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf.fit(X_train_scaled, y_train)
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.7300492610837438
Testing Data Score: 0.6050616758826032


In [13]:
#Finding the impacts of each variable
rf.feature_importances_

array([7.72622824e-03, 9.75894899e-02, 1.71763305e-02, 9.68490538e-04,
       3.62476782e-04, 7.71801620e-05, 2.02532164e-03, 2.50924274e-04,
       2.00362469e-06, 1.32179578e-03, 3.10559253e-04, 2.40083206e-02,
       2.59709321e-02, 1.05927766e-01, 9.92536923e-02, 7.84769279e-02,
       1.25625542e-01, 6.46810234e-02, 0.00000000e+00, 0.00000000e+00,
       2.31356184e-01, 3.32731339e-05, 0.00000000e+00, 0.00000000e+00,
       4.00085355e-05, 9.89266726e-04, 1.36566618e-03, 2.06291264e-05,
       7.33957603e-05, 1.93512282e-04, 4.33067929e-04, 3.76614309e-04,
       3.42738479e-03, 2.28007028e-03, 4.91863136e-03, 3.68565063e-03,
       2.76950143e-03, 3.86050277e-03, 5.78444269e-04, 2.83647427e-04,
       3.34822291e-03, 1.11154087e-02, 1.64476418e-03, 6.11780939e-03,
       9.81017103e-04, 0.00000000e+00, 0.00000000e+00, 1.66201995e-03,
       5.24179237e-03, 1.71907875e-03, 3.60666306e-03, 1.57450698e-03,
       2.02779649e-03, 6.90166357e-03, 9.02998910e-04, 1.93952813e-04,
      

# Analysis (Scaled)
After scaling, our Logistic Regression model was no longer overfitting, and we also were able to get a higher testing score. However,  the Random Forest Classifier testing score actually got worse and remained overfit, which was the opposite of my predition. My predictions for Logistic Regression were right, and it has proven to be the better model after I scaled the data.