In [4]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [7]:
# Read the lending_data.csv data from the Resources folder into a Pandas DataFrame.
# The path is relative so the notebook is not tied to one machine's user directory.
# NOTE(review): assumes the notebook is run from the folder that contains
# `Resources/` - confirm against the repository layout.
lending_data_path = Path("Resources/lending_data.csv")
data = pd.read_csv(lending_data_path)

# Create the labels set (y) from the "loan_status" column
y = data["loan_status"]

# Create the features (X) DataFrame from the remaining columns
X = data.drop("loan_status", axis=1)

# Split the data into training and testing datasets by using train_test_split.
# random_state=1 matches the seed the notebook specifies for the split in Step 4,
# so both splits produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Print the shape of the training and testing datasets
print("Shape of training features:", X_train.shape)
print("Shape of testing features:", X_test.shape)
print("Shape of training labels:", y_train.shape)
print("Shape of testing labels:", y_test.shape)


Shape of training features: (62028, 7)
Shape of testing features: (15508, 7)
Shape of training labels: (62028,)
Shape of testing labels: (15508,)


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Read the lending_data.csv data from the Resources folder into a Pandas DataFrame.
# `pd` and `Path` come from the import cell at the top of the notebook.
# NOTE(review): the relative path assumes the notebook is run from the folder
# that contains `Resources/` - confirm against the repository layout.
lending_data_path = Path("Resources/lending_data.csv")
data = pd.read_csv(lending_data_path)

# Separate the y variable, the labels
y = data["loan_status"]

# Separate the X variable, the features (every column except the label)
X = data.drop("loan_status", axis=1)

# Print the shapes of the labels and features datasets
print("Shape of labels (y):", y.shape)
print("Shape of features (X):", X.shape)


Shape of labels (y): (77536,)
Shape of features (X): (77536, 7)


In [9]:
# Review the y variable Series.
# A bare last expression lets the notebook display the value itself;
# pandas truncates long Series automatically.
y


0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64


In [11]:
# Review the X variable DataFrame.
# A bare last expression uses the notebook's rich table display instead of the
# wrapped plain-text output that print() produces for wide frames.
X

       loan_size  interest_rate  borrower_income  debt_to_income  \
0        10700.0          7.672            52800        0.431818   
1         8400.0          6.692            43600        0.311927   
2         9000.0          6.963            46100        0.349241   
3        10700.0          7.664            52700        0.430740   
4        10800.0          7.698            53000        0.433962   
...          ...            ...              ...             ...   
77531    19100.0         11.261            86600        0.653580   
77532    17700.0         10.662            80900        0.629172   
77533    17600.0         10.595            80300        0.626401   
77534    16300.0         10.068            75300        0.601594   
77535    15600.0          9.742            72300        0.585062   

       num_of_accounts  derogatory_marks  total_debt  
0                    5                 1       22800  
1                    3                 0       13600  
2                 

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [12]:
# Check the balance of our target values: number of rows per loan_status label.
# Left as the cell's last expression so the notebook displays it directly.
y.value_counts()


loan_status
0    75036
1     2500
Name: count, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
# Split the data using train_test_split (imported in the notebook's first cell).
# Assign a random_state of 1 to the function so the split is reproducible;
# test_size=0.2 holds out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Sanity checks: every row lands in exactly one partition and features stay
# aligned with their labels.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [14]:
# Instantiate the Logistic Regression model (LogisticRegression is imported in
# the notebook's first cell together with the other scikit-learn names).
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data.
# fit() is the cell's last expression, so the notebook also displays the
# fitted estimator.
logistic_regression_model.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [15]:
# Make a prediction using the testing data.
# The fitted model predicts one label per row of X_test; y_pred is what the
# evaluation cells compare against y_test.
y_pred = logistic_regression_model.predict(X_test)


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [16]:
# Calculate the balanced accuracy score from the true test labels and the
# model's predictions (balanced_accuracy_score is imported in the notebook's
# first cell).
balanced_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print the balanced accuracy score of the model
print("Balanced Accuracy Score:", balanced_acc_score)


Balanced Accuracy Score: 0.9521352751368186


In [17]:
# Generate a confusion matrix for the model (confusion_matrix is imported in
# the notebook's first cell). Called as (y_true, y_pred): rows are the actual
# labels, columns are the predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[14926    75]
 [   46   461]]


In [18]:
# Print the classification report for the model: per-label precision, recall,
# F1-score and support (classification_report is imported in the notebook's
# first cell).
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Balanced Accuracy Score: The balanced accuracy score is approximately 0.952, which indicates that the logistic regression model performs well in predicting both the "0" (healthy loan) and "1" (high-risk loan) labels. A balanced accuracy score close to 1 indicates good performance.

Confusion Matrix: The confusion matrix shows that out of 15001 healthy loans (label "0"), 14926 were correctly classified as healthy, and 75 were incorrectly classified as high-risk. Similarly, out of 507 high-risk loans (label "1"), 461 were correctly classified as high-risk, and 46 were incorrectly classified as healthy.

Classification Report: The precision, recall, and F1-score for both classes ("0" and "1") are relatively high, indicating good performance of the model in predicting both classes. Specifically, for label "1" (high-risk loan), the precision is 0.86, recall is 0.91, and F1-score is 0.88, suggesting that the model performs well in identifying high-risk loans.

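Overall Assessment: The logistic regression model predicts healthy loans almost perfectly (precision and recall both round to 1.00 in the classification report) and predicts high-risk loans well, though less reliably (precision 0.86, recall 0.91, F1-score 0.88). Because only 507 of the 15,508 test loans are high-risk, the 0.99 overall accuracy is dominated by the healthy class; the balanced accuracy score of about 0.952, which averages the recall of the two classes, is the more informative summary and further supports the model's overall effectiveness in classification.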

In conclusion, the logistic regression model demonstrates strong predictive capabilities for both healthy and high-risk loans based on the provided evaluation metrics.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [13]:
# Import the RandomOverSampler module from imbalanced-learn.
# NOTE: this needs the `imbalanced-learn` package installed in the kernel's
# environment; without it the import raises ModuleNotFoundError.
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
random_oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model.
# Only the training split is resampled; the test split is left untouched so
# the evaluation still reflects the original label distribution.
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Count the distinct values of the resampled labels data
# YOUR CODE HERE!

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!

# Fit the model using the resampled training data
# YOUR CODE HERE!

# Make a prediction using the testing data
# YOUR CODE HERE!

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Print the balanced_accuracy score of the model 
# YOUR CODE HERE!

In [None]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!

In [None]:
# Print the classification report for the model
# YOUR CODE HERE!

### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** YOUR ANSWER HERE!