In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from matplotlib import pyplot as plt

In [2]:
# Import the data
file_path = Path("Resources/lending_data.csv")
df_lending = pd.read_csv(file_path)
df_lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# Finding null values if it exists
for column in df_lending.columns:
    print(f"{column} has {df_lending[column].isnull().sum()} null values")

loan_size has 0 null values
interest_rate has 0 null values
borrower_income has 0 null values
debt_to_income has 0 null values
num_of_accounts has 0 null values
derogatory_marks has 0 null values
total_debt has 0 null values
loan_status has 0 null values


In [7]:
target = df_lending["loan_status"]
target_names = ["negative", "positive"]

In [8]:
data = df_lending.drop("loan_status", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


# My Prediction:
### Based on my knowledge, I believe that the logisitic regression model will perform better than the random forest classifier because the logistic regression predicts a binary outcome (0 and 1) based on the target loan status.

In [10]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
63243,8900.0,6.893,45500,0.340659,3,0,15500
36991,9000.0,6.932,45800,0.344978,3,0,15800
39985,8900.0,6.896,45500,0.340659,3,0,15500
63047,9000.0,6.961,46100,0.349241,3,0,16100
49913,8400.0,6.712,43800,0.315068,3,0,13800


# Logistic Regression Model

In [11]:
# Create the Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model

LogisticRegression(max_iter=1000)

In [13]:
# Fit the model with data and print results
model.fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9919177328380795
Testing Data Score: 0.9924680148576145


In [14]:
# y_pred for labeling 
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
# Import metrics class
from sklearn import metrics
from sklearn.metrics import confusion_matrix

metrics.confusion_matrix(y_test, y_pred)

array([[18699,    93],
       [   53,   539]])

In [16]:
# Calculate accuracy
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")

Accuracy: 0.9924680148576145


In [17]:
# Print the results
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     18792
    positive       0.85      0.91      0.88       592

    accuracy                           0.99     19384
   macro avg       0.93      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



# Random Forest Classifier

In [18]:
# Create the classifier
classifier = RandomForestClassifier(n_estimators=100)

In [20]:
# Train the model using the training sets and print results
classifier.fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9971798046498831
Testing Data Score: 0.9920037144036319


In [21]:
# Predicting the test set
y_pred = classifier.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[18695,    97],
       [   58,   534]])

In [24]:
# Calculate the accuracy
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")

Accuracy: 0.9920037144036319


In [25]:
# Print the results
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      0.99      1.00     18792
    positive       0.85      0.90      0.87       592

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.93     19384
weighted avg       0.99      0.99      0.99     19384



# Results
### For the logistic regression model, I got a training data score of 0.9919177328380795 and a testing data score of 0.9924680148576145. 
### For the random forest classifier model, I got a training data score of 0.9971798046498831 and a testing data score of 0.9920037144036319.
### The results were very similar, but the random forest classifier got better results. This did not coincide with my prediction, but the results were not that far off of each other. 