In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Read the CSV file into a Pandas DataFrame.
# Path (imported in the first cell) builds the location from its parts
# instead of relying on a hard-coded separator.
file_path = Path("Resources") / "lending_data.csv"
data = pd.read_csv(file_path)

# This cell only loads the data. The labels/features extraction (Step 2) and
# the train/test split (Step 4) each run in their own cell further down, so
# they are deliberately not repeated here.


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
import pandas as pd

# Step 2: Create labels (y) and features (X).
# `data` was already loaded from Resources/lending_data.csv by the Step 1
# cell, so the CSV is not read from disk a second time here.
y = data["loan_status"]  # Labels are taken from the 'loan_status' column
X = data.drop(columns="loan_status")  # Features are every column except 'loan_status'


In [4]:
# Preview the labels Series that was taken from the 'loan_status' column.
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64


NameError: name 'x' is not defined

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Tally how many rows carry each label value to check how balanced the classes are.
label_counts = y.value_counts()
print(label_counts)


0    75036
1     2500
Name: loan_status, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [8]:
from sklearn.model_selection import train_test_split

# Partition the features and labels into training and testing subsets.
# test_size=0.2 holds out 20% of the rows for testing; random_state=42 makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [9]:
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with random_state=1
# (all other hyperparameters are left at the library defaults)
model = LogisticRegression(random_state=1)

# Fit the model on the training split produced by train_test_split
# (the original, non-resampled data)
model.fit(X_train, y_train)



### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [11]:
# Make predictions for the held-out testing features with the fitted model;
# y_pred is compared against y_test in the evaluation cells that follow
y_pred = model.predict(X_test)


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, balanced_accuracy_score

# Accuracy of the predictions on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Text report with precision, recall, f1-score and support per label
class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")

# Balanced accuracy of the same predictions
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy:", balanced_acc)


Accuracy: 0.9928424039205571
Confusion Matrix:
[[14947    64]
 [   47   450]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15011
           1       0.88      0.91      0.89       497

    accuracy                           0.99     15508
   macro avg       0.94      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508

Balanced Accuracy: 0.9505845277514129


In [13]:
from sklearn.metrics import confusion_matrix

# Show the confusion matrix for the original-data model on its own.
# (This recomputes the same matrix the combined evaluation cell already printed.)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[14947    64]
 [   47   450]]


In [14]:
from sklearn.metrics import classification_report

# Show the classification report for the original-data model on its own.
# (This recomputes the same report the combined evaluation cell already printed.)
class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15011
           1       0.88      0.91      0.89       497

    accuracy                           0.99     15508
   macro avg       0.94      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

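**Answer:** The model predicts healthy loans (`0`) almost perfectly (precision and recall both round to 1.00) and predicts high-risk loans (`1`) well but less reliably (precision 0.88, recall 0.91). On the test set it missed 47 of the 497 high-risk loans and flagged 64 healthy loans as high-risk. Because only about 3% of the loans are high-risk, the 0.99 overall accuracy mostly reflects the majority class; the balanced accuracy of 0.95 is the more informative summary.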

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [16]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model with random_state=1
oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the oversampler model.
# Only the training split is resampled; X_test / y_test are not passed in,
# so the evaluation data keeps its original class distribution.
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)


In [17]:
# Count the distinct values of the resampled labels data to confirm that
# both labels now have the same number of rows
resampled_label_counts = y_train_resampled.value_counts()
print(resampled_label_counts)


0    60025
1    60025
Name: loan_status, dtype: int64


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [18]:
from sklearn.linear_model import LogisticRegression

# Build a second Logistic Regression model (random_state=1) and train it on the
# oversampled training data; fit() returns the estimator, so the fitted model
# is bound in a single statement.
model_resampled = LogisticRegression(random_state=1).fit(
    X_train_resampled,
    y_train_resampled,
)

# Predict labels for the untouched testing features
y_pred_resampled = model_resampled.predict(X_test)


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, balanced_accuracy_score

# Accuracy of the resampled-data model on the test split
accuracy_resampled = accuracy_score(y_test, y_pred_resampled)
print("Accuracy (Resampled):", accuracy_resampled)

# Confusion matrix of true labels against the resampled model's predictions
conf_matrix_resampled = confusion_matrix(y_test, y_pred_resampled)
print("Confusion Matrix (Resampled):", conf_matrix_resampled, sep="\n")

# Text report with precision, recall, f1-score and support per label
class_report_resampled = classification_report(y_test, y_pred_resampled)
print("Classification Report (Resampled):", class_report_resampled, sep="\n")

# Balanced accuracy of the same predictions
balanced_acc_resampled = balanced_accuracy_score(y_test, y_pred_resampled)
print("Balanced Accuracy (Resampled):", balanced_acc_resampled)


Accuracy (Resampled): 0.9952282692803714
Confusion Matrix (Resampled):
[[14939    72]
 [    2   495]]
Classification Report (Resampled):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15011
           1       0.87      1.00      0.93       497

    accuracy                           1.00     15508
   macro avg       0.94      1.00      0.96     15508
weighted avg       1.00      1.00      1.00     15508

Balanced Accuracy (Resampled): 0.9955896862756715


In [20]:
from sklearn.metrics import confusion_matrix

# Show the confusion matrix for the resampled-data model on its own.
# (This recomputes the same matrix the combined evaluation cell already printed.)
conf_matrix_resampled = confusion_matrix(y_test, y_pred_resampled)
print("Confusion Matrix (Resampled):", conf_matrix_resampled, sep="\n")


Confusion Matrix (Resampled):
[[14939    72]
 [    2   495]]


In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the resampled-data model, computed once more.
# (Repeats the computation of the preceding cell; the result is unchanged.)
conf_matrix_resampled = confusion_matrix(y_test, y_pred_resampled)
print("Confusion Matrix (Resampled):", conf_matrix_resampled, sep="\n")


Confusion Matrix (Resampled):
[[14939    72]
 [    2   495]]


### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

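**Answer:** The logistic regression model fit with oversampled data predicts both labels very well. Healthy loans (`0`) are still predicted almost perfectly (precision and recall both round to 1.00). For high-risk loans (`1`), recall rises from 0.91 to 1.00 (only 2 of 497 missed, versus 47 with the original data), while precision dips slightly from 0.88 to 0.87 (72 healthy loans flagged as high-risk, versus 64). Balanced accuracy improves from about 0.95 to about 1.00 (0.9956), so the oversampled model is the better choice when catching high-risk loans matters most.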