In [2]:
# Import necessary libraries and modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
import warnings

# Silence every warning category so the printed reports stay uncluttered.
# NOTE(review): this also hides any warnings the solver emits during fit().
warnings.filterwarnings('ignore')


def _print_evaluation(title, y_true, y_hat):
    """Print a titled report: balanced accuracy, confusion matrix, per-class metrics.

    Args:
        title: Heading line printed before the metrics.
        y_true: Ground-truth labels.
        y_hat: Labels predicted by the model under evaluation.
    """
    print(title)
    print("Balanced Accuracy Score:", balanced_accuracy_score(y_true, y_hat))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_hat))
    print("Classification Report:\n", classification_report(y_true, y_hat))


# Read the lending records from disk.
df = pd.read_csv("Resources/lending_data.csv")

# 'loan_status' is the target; every remaining column is used as a feature.
X = df.drop(columns='loan_status')
y = df['loan_status']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified on y -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# --- Baseline: logistic regression fitted on the training split as-is ---
logistic_regression_model = LogisticRegression(random_state=1).fit(X_train, y_train)
y_pred = logistic_regression_model.predict(X_test)
_print_evaluation("Original Data Model Evaluation:", y_test, y_pred)

# --- Variant: same estimator fitted on a RandomOverSampler-resampled training split ---
# Only the training split is resampled; the test split is scored unchanged.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

model_resampled = LogisticRegression(random_state=1).fit(X_resampled, y_resampled)
y_pred_resampled = model_resampled.predict(X_test)
_print_evaluation("Resampled Data Model Evaluation:", y_test, y_pred_resampled)


Original Data Model Evaluation:
Balanced Accuracy Score: 0.9521352751368186
Confusion Matrix:
 [[14926    75]
 [   46   461]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508

Resampled Data Model Evaluation:
Balanced Accuracy Score: 0.9941749445500477
Confusion Matrix:
 [[14915    86]
 [    3   504]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     15001
           1       0.85      0.99      0.92       507

    accuracy                           0.99     15508
   macro avg       0.93      0.99      0.96     15508
weighted avg       1.00      0.99      0.99     15508

