In [30]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [8]:
# Read the 2019 loans file into the training frame and the 2020 Q1 loans file
# into the testing frame. Paths are relative to the notebook's working directory.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [10]:
# Training data: pull out the target column first, then one-hot encode the
# remaining features so categorical columns become indicator (dummy) columns.
y_train = train_df['loan_status']
X_train = train_df.drop(columns='loan_status')
X_train_dummies = pd.get_dummies(X_train)

In [19]:
# Testing data: same treatment as the training data — separate the target
# column, then one-hot encode the remaining features.
# NOTE: encoding the two frames independently can yield different column sets
# when a category value appears in only one of them.
y_test = test_df['loan_status']
X_test = test_df.drop(columns='loan_status')
X_test_dummies = pd.get_dummies(X_test)

In [34]:
# Add missing dummy variables to the testing set by aligning it to the
# training columns. Reindexing:
#   * adds every indicator column that exists only in the training set,
#     filled with 0 ("category not present in this row");
#   * drops indicator columns the model was never trained on;
#   * puts the columns in the same order as X_train_dummies.
# This covers 'debt_settlement_flag_Y' without hard-coding it, and a row whose
# flag is missing is not mislabelled as 'Y'. The cell is safe to re-run.
X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

In [32]:
# Logistic Regression on the unscaled features: fit on the training set, then
# print the model score for the training and testing sets.
# max_iter is the total element count of the training frame (rows * columns),
# i.e. a very large cap — presumably chosen so lbfgs is not stopped before it
# converges on unscaled inputs.
classifier = LogisticRegression(solver='lbfgs', max_iter=X_train_dummies.size)
classifier.fit(X_train_dummies, y_train)

lr_unscaled_train_score = classifier.score(X_train_dummies, y_train)
print(f"Training Data Score: {lr_unscaled_train_score}")

lr_unscaled_test_score = classifier.score(X_test_dummies, y_test)
print(f"Testing Data Score: {lr_unscaled_test_score}")

Training Data Score: 0.694991789819376
Testing Data Score: 0.5650786899191833


In [29]:
# Random Forest on the unscaled features: 500 trees with a fixed random_state
# so the fit is repeatable. Print the model score for both sets.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_dummies, y_train)

rf_unscaled_train_score = clf.score(X_train_dummies, y_train)
print(f'Training Score: {rf_unscaled_train_score}')

rf_unscaled_test_score = clf.score(X_test_dummies, y_test)
print(f'Testing Score: {rf_unscaled_test_score}')

Training Score: 1.0
Testing Score: 0.6180348787749894


In [31]:
# Scale the data. The scaler is fitted on the training features only and then
# applied to both sets, so the testing data does not influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [35]:
# Logistic Regression on the scaled features: fit on the scaled training set,
# then print the model score for the scaled training and testing sets.
# max_iter is the total element count of the scaled training array, i.e. a
# very large cap on lbfgs iterations. This rebinds `classifier`.
classifier = LogisticRegression(solver='lbfgs', max_iter=X_train_scaled.size)
classifier.fit(X_train_scaled, y_train)

lr_scaled_train_score = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {lr_scaled_train_score}")

lr_scaled_test_score = classifier.score(X_test_scaled, y_test)
print(f"Testing Data Score: {lr_scaled_test_score}")


Training Data Score: 0.712807881773399
Testing Data Score: 0.7205444491705657


In [36]:
# Random Forest on the scaled features: same settings as the unscaled run
# (500 trees, random_state=1) so the two scores are comparable.
# This rebinds `clf`.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

rf_scaled_train_score = clf.score(X_train_scaled, y_train)
print(f'Training Score: {rf_scaled_train_score}')

rf_scaled_test_score = clf.score(X_test_scaled, y_test)
print(f'Testing Score: {rf_scaled_test_score}')

Training Score: 1.0
Testing Score: 0.6193109315185028
