In [65]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

In [41]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [42]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df)

In [43]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df)

In [44]:
len(train_df.columns)

96

In [45]:
len(test_df.columns)

95

In [46]:
# add missing dummy variables to testing set

# Get missing columns in the training test
missing_cols = set( train_df.columns ) - set( test_df.columns )

# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    test_df[c] = 0
    
# Ensure the order of column in the test set is in the same order than in train set
test_df = test_df[train_df.columns]

In [47]:
len(train_df.columns)

96

In [48]:
len(test_df.columns)

96

In [49]:
train_df.head()
train_df.shape
test_df.head()
test_df.shape

(4702, 96)

In [50]:
# Train the Logistic Regression model on the unscaled data and print the model score
# 2019

X = train_df.drop('loan_status_low_risk', 1)
y = train_df['loan_status_low_risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score 2019: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score 2019: {classifier.score(X_test, y_test)}")

Training Data Score 2019: 0.6524356869184456
Testing Data Score 2019: 0.6466338259441707


  X = train_df.drop('loan_status_low_risk', 1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
# Train the Logistic Regression model on the unscaled data and print the model score
# 2020

A = test_df.drop('loan_status_low_risk', 1)
b = test_df['loan_status_low_risk']
A_train, A_test, b_train, b_test = train_test_split(A, b, random_state=1)
classifier.fit(A_train, b_train)
print(f"Training Data Score 2020: {classifier.score(A_train, b_train)}")
print(f"Testing Data Score 2020: {classifier.score(A_test, b_test)}")

Training Data Score 2020: 0.8326715825297788
Testing Data Score 2020: 0.8324829931972789


  A = test_df.drop('loan_status_low_risk', 1)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# Train a Random Forest Classifier model and print the model score
# 2019

clf19 = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Random Forest Training Score 2019: {clf19.score(X_train, y_train)}')
print(f'Random Forest Testing Score 2019: {clf19.score(X_test, y_test)}')

Random Forest Training Score 2019: 1.0
Random Forest Testing Score 2019: 0.999671592775041


In [56]:
# Train a Random Forest Classifier model and print the model score
# 2020

clf20 = RandomForestClassifier(random_state=1, n_estimators=500).fit(A_train, b_train)
print(f'Random Forest Training Score 2020: {clf20.score(A_train, b_train)}')
print(f'Random Forest Testing Score 2020: {clf20.score(A_test, b_test)}')

Random Forest Training Score 2020: 1.0
Random Forest Testing Score 2020: 1.0


In [59]:
# Scale the data
# 2019

scaler19 = StandardScaler().fit(X_train)
X_train_scaled = scaler19.transform(X_train)
X_test_scaled = scaler19.transform(X_test)

In [61]:
# Scale the data
# 2020

scaler20 = StandardScaler().fit(A_train)
A_train_scaled = scaler20.transform(A_train)
A_test_scaled = scaler20.transform(A_test)

In [67]:
# Train the Logistic Regression model on the unscaled data and print the model score
# 2019 (Scaled)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler19 = StandardScaler().fit(X_train)
X_train_scaled = scaler19.transform(X_train)
X_test_scaled = scaler19.transform(X_test)

sel = SelectFromModel(clf19)
sel.fit(X_train_scaled, y_train)

X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)

scaler19 = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler19.transform(X_selected_train)
X_selected_test_scaled = scaler19.transform(X_selected_test)

clf = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')



# X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X, y, random_state=1)
# classifier = LogisticRegression()
# classifier.fit(X_train_scaled, y_train)

# print(f"Training Data Score 2019: {classifier.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score 2019: {classifier.score(X_test_scaled, y_test)}")

Training Score: 1.0
Testing Score: 1.0


In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score