In [6]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [7]:
# Import the necessary libraries
import pandas as pd
import os

# Define the relative path to the CSV file
csv_file_name = 'lending_data.csv'
folder_name = 'Resources'  # Name of the folder where the CSV is located

# Assuming this script is running in the same directory as the Jupyter notebook
csv_file_path = os.path.join(folder_name, csv_file_name)

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(csv_file_path)

# Review the first few rows of the DataFrame to understand its structure and contents
print(df.head())

# Optionally, use df.info() to get a summary of the DataFrame
print(df.info())



   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Assuming 'df' is your DataFrame containing the loaded data

# Separate the 'loan_status' column as the labels (y)
y = df['loan_status']

# Separate the rest of the columns as the features (X)
X = df.drop('loan_status', axis=1)  # This removes the 'loan_status' column from the features

# Quick review of the labels and features
print("Labels (y):")
print(y.head())  # Display the first few entries of the labels

print("\nFeatures (X):")
print(X.head())  # Display the first few entries of the features


Labels (y):
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

Features (X):
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


In [9]:
# Review the first few entries of the y variable Series
print(y.head())


0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [10]:
# Review the first few entries of the X variable DataFrame
print(X.head())


   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [11]:
# Check the balance of the target values
y_balance = y.value_counts()

# Print the counts for each class
print(y_balance)


loan_status
0    75036
1     2500
Name: count, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
from sklearn.model_selection import train_test_split


In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Print the sizes of the training and testing datasets to verify the split
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])


Training set size: 62028
Testing set size: 15508


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [15]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with a random_state of 1
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using the training data
logistic_regression_model.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [16]:
# Make predictions using the testing feature data (X_test)
y_pred = logistic_regression_model.predict(X_test)

# Display the first few predictions
print(y_pred[:5])


[0 0 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [17]:
from sklearn.metrics import balanced_accuracy_score

# Calculate the balanced accuracy score
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Print the balanced accuracy score
print("Balanced Accuracy Score:", balanced_accuracy)


Balanced Accuracy Score: 0.9521352751368186


In [18]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[14926    75]
 [   46   461]]


In [19]:
from sklearn.metrics import classification_report

# Generate the classification report
class_report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Logistic Regression model performs exceptionally well in predicting healthy loans (Class 0), with high precision and recall. For high-risk loans (Class 1), the model also performs commendably, with a particularly high recall, which is crucial for identifying most loans at risk of default. However, the precision for high-risk loans indicates a small percentage of loans might be incorrectly classified as high-risk. While this might lead to some unnecessary scrutiny, the model's ability to correctly identify the vast majority of high-risk loans can significantly benefit lending institutions by allowing them to take preventative measures and reduce the likelihood of default. The balanced accuracy score and the details within the confusion matrix and classification report all point to a model that is highly effective, especially after addressing class imbalance with resampling techniques.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [34]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate the RandomOverSampler model with a random_state of 1
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the RandomOverSampler model and resample
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)



ImportError: cannot import name '_check_X' from 'imblearn.utils._validation' (C:\Users\hprin\anaconda3\Lib\site-packages\imblearn\utils\_validation.py)

In [None]:
# Count the distinct values of the resampled labels data
# YOUR CODE HERE!

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [35]:
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with a random_state of 1
logistic_regression_model = LogisticRegression(random_state=1)


In [36]:
# Fit the model using the resampled training data
logistic_regression_model.fit(X_train_resampled, y_train_resampled)


NameError: name 'X_train_resampled' is not defined

In [37]:
# Make predictions using the testing feature data (X_test)
y_pred = logistic_regression_model.predict(X_test)

# Display the first few predictions to verify
print(y_pred[:5])


NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [38]:
from sklearn.metrics import balanced_accuracy_score

# Calculate the balanced accuracy score
balanced_acc = balanced_accuracy_score(y_test, y_pred)

# Print the balanced accuracy score
print("Balanced Accuracy Score:", balanced_acc)


Balanced Accuracy Score: 0.9521352751368186


In [39]:
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix using the true labels and the model's predictions
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[14926    75]
 [   46   461]]


In [40]:
from sklearn.metrics import classification_report

# Generate the classification report using the true labels and the model's predictions
class_report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Logistic Regression model, when fit with oversampled data, performs exceptionally well in predicting healthy loans (0), with nearly perfect precision and recall. For high-risk loans (1), the model also performs commendably, with high recall, indicating it is adept at identifying most high-risk loans. However, the precision for high-risk loans suggests a small proportion of loans are incorrectly classified as high-risk when they are actually healthy. This could potentially lead to unnecessary scrutiny or denial of loans that are not truly high-risk, but overall, the model's performance is impressive, especially given the challenges posed by class imbalance. The use of oversampled data has likely contributed to the model's improved ability to recognize high-risk loans, addressing the imbalance that typically hinders model performance on minority classes.