In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [3]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [19]:
# Split X and y
y_train = train_df['target']
y_test = test_df['target']

X_train = train_df.drop(columns=["target"])
X_test = test_df.drop(columns=["target"])

# Convert categorical data to numeric and separate target feature for training data
X_train_dummies = pd.get_dummies(X_train)
X_test_dummies = pd.get_dummies(X_test)

# Check for missing columns
train_col_list = X_train_dummies.columns.tolist()
test_col_list = X_test_dummies.columns.tolist()
s = set(test_col_list)
temp3 = [x for x in train_col_list if x not in s]
print(temp3)

['debt_settlement_flag_Y']


In [21]:
X_train_dummies['debt_settlement_flag_Y'].head()

0    0
1    0
2    0
3    0
4    0
Name: debt_settlement_flag_Y, dtype: uint8

In [22]:
# add missing dummy variables to testing set

X_test_dummies['debt_settlement_flag_Y'] = 0
X_test_dummies['debt_settlement_flag_Y'].head()

0    0
1    0
2    0
3    0
4    0
Name: debt_settlement_flag_Y, dtype: int64

I think that logistic regression will work better because the data is not scaled. Signed Jedediah Madsen on this day, Wednesday, April 17, 2024.

In [27]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', max_iter=100)
model.fit(X_train_dummies,y_train)
print(f"Train: {model.score(X_train_dummies, y_train)}")
print(f"Test: {model.score(X_test_dummies, y_test)}")

Train: 0.6529556650246305
Test: 0.5082943428328371


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Didn't do a grid search because I got a zillion warnings when I tried I couldn't figure out how to get them to go away without scaling, I think. 

In [26]:
# Train a K-Nearest Neighbors model and print the model score
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_dummies, y_train)
    train_score = knn.score(X_train_dummies, y_train)
    test_score = knn.score(X_test_dummies, y_test)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 1, Train/Test Score: 1.000/0.502
k: 3, Train/Test Score: 0.764/0.503
k: 5, Train/Test Score: 0.711/0.507
k: 7, Train/Test Score: 0.677/0.512
k: 9, Train/Test Score: 0.655/0.514
k: 11, Train/Test Score: 0.639/0.503
k: 13, Train/Test Score: 0.633/0.514
k: 15, Train/Test Score: 0.632/0.516
k: 17, Train/Test Score: 0.621/0.511
k: 19, Train/Test Score: 0.616/0.509


Logistic regression works about the same as KNN here but maybe only because I went with the defaults and didn't do a grid search.

In [29]:
# Scale the data
from sklearn.preprocessing import StandardScaler
# Scaling the X train data by using StandardScaler()
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
# Scaling the X train data by using StandardScaler()
scaler = StandardScaler().fit(X_test_dummies)
X_test_scaled = scaler.transform(X_test_dummies)

In [33]:
# Train the Logistic Regression model on the scaled data and print the model score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

model = LogisticRegression(C=0.01, tol=0.0001)
model.fit(X_train_scaled,y_train)
print(f"Train: {model.score(X_train_scaled, y_train)}")
print(f"Test: {model.score(X_test_scaled, y_test)}")

predictions = model.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))

print(classification_report(y_test, predictions))

Train: 0.6935960591133005
Test: 0.6518502764780945
[[1568  783]
 [ 854 1497]]
              precision    recall  f1-score   support

   high_risk       0.65      0.67      0.66      2351
    low_risk       0.66      0.64      0.65      2351

    accuracy                           0.65      4702
   macro avg       0.65      0.65      0.65      4702
weighted avg       0.65      0.65      0.65      4702



Still got a bunch of warnings on my grid search but was able to do a partial grid search and optimize a little more. 

In [30]:
# Train a K-Nearest Neighbors model on the scaled data and print the model score

for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 1, Train/Test Score: 1.000/0.540
k: 3, Train/Test Score: 0.787/0.541
k: 5, Train/Test Score: 0.737/0.557
k: 7, Train/Test Score: 0.711/0.560
k: 9, Train/Test Score: 0.705/0.571
k: 11, Train/Test Score: 0.696/0.572
k: 13, Train/Test Score: 0.689/0.570
k: 15, Train/Test Score: 0.679/0.572
k: 17, Train/Test Score: 0.678/0.577
k: 19, Train/Test Score: 0.671/0.575


In [34]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
predictions = knn.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))

print(classification_report(y_test, predictions))

[[1324 1027]
 [ 989 1362]]
              precision    recall  f1-score   support

   high_risk       0.57      0.56      0.57      2351
    low_risk       0.57      0.58      0.57      2351

    accuracy                           0.57      4702
   macro avg       0.57      0.57      0.57      4702
weighted avg       0.57      0.57      0.57      4702



Logistic regression outperformed KNN even with scaling, possibly because I was actually able to optimize with a grid search. 