In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

import warnings
# Suppress every Python warning for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# matter (e.g. any emitted while fitting the model) — consider narrowing it.
warnings.filterwarnings('ignore')

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# The path is relative, so it resolves against the current working directory.
lending_data_df = pd.read_csv(Path('Resources/lending_data.csv'))

# Review the DataFrame: preview ten randomly chosen rows.
# A fixed random_state makes the preview identical on every
# "Restart Kernel -> Run All" instead of changing from run to run.
lending_data_df.sample(10, random_state=1)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
15560,10900.0,7.772,53700,0.441341,5,1,23700,0
25678,9600.0,7.207,48400,0.380165,4,0,18400,0
25496,6500.0,5.894,36100,0.168975,1,0,6100,0
71577,10900.0,7.769,53700,0.441341,5,1,23700,0
56280,9000.0,6.952,46000,0.347826,3,0,16000,0
13353,9900.0,7.327,49500,0.393939,4,0,19500,0
13089,10100.0,7.431,50500,0.405941,4,1,20500,0
43751,9600.0,7.193,48300,0.378882,4,0,18300,0
66623,9800.0,7.296,49300,0.391481,4,0,19300,0
48662,8500.0,6.735,44000,0.318182,3,0,14000,0


### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`x`) DataFrame from the remaining columns.

In [4]:
# Partition the loan data into the prediction target and its predictors
target_column = 'loan_status'

# Features (x): every column except the target
x = lending_data_df.drop(columns=[target_column])

# Labels (y): the target column on its own, as a Series
y = lending_data_df[target_column]

In [5]:
# Review the y variable Series: preview ten randomly chosen labels.
# A fixed random_state keeps the preview reproducible across re-runs.
y.sample(10, random_state=1)

29917    0
29658    0
6366     0
46481    0
16869    0
33106    0
54126    0
30343    0
7724     0
64778    0
Name: loan_status, dtype: int64

In [6]:
# Review the x variable DataFrame: preview ten randomly chosen feature rows.
# A fixed random_state keeps the preview reproducible across re-runs.
x.sample(10, random_state=1)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
41340,9900.0,7.335,49600,0.395161,4,0,19600
10073,8200.0,6.614,42800,0.299065,2,0,12800
73914,10100.0,7.397,50200,0.40239,4,1,20200
10751,8600.0,6.76,44200,0.321267,3,0,14200
13485,7700.0,6.392,40800,0.264706,2,0,10800
648,9900.0,7.33,49600,0.395161,4,0,19600
53707,9200.0,7.043,46900,0.360341,3,0,16900
34219,12300.0,8.373,59400,0.494949,6,1,29400
69551,11500.0,8.019,56100,0.465241,5,1,26100
72454,9700.0,7.246,48800,0.385246,4,0,18800


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Bring in the train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# Divide the features and labels into training and testing subsets.
# random_state=1 fixes the shuffle so the same split is produced on every run;
# the split proportions are left at the function's defaults.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (`x_train` and `y_train`).

In [8]:
# Bring in the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the classifier; random_state=1 keeps the result repeatable
LR_model = LogisticRegression(random_state=1)

# Train the classifier on the training features and labels.
# Left as the cell's last expression so the fitted estimator is displayed.
LR_model.fit(X=x_train, y=y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`x_test`) and the fitted model.

In [9]:
# Use the fitted model to predict a label for every row of the testing features
LR_predictions = LR_model.predict(X=x_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [11]:
# Generate a confusion matrix for the model.
# labels=[0, 1] pins the row/column order (0 = healthy loan, 1 = high-risk
# loan) so it always lines up with the hard-coded names below, and keeps the
# matrix 2x2 even if one class is absent from y_test or the predictions.
cm_imbalanced = confusion_matrix(y_test, LR_predictions, labels=[0, 1])

# Wrap the raw counts in a labelled DataFrame: rows are the actual classes,
# columns are the predicted classes.
cm_imbalanced_df = pd.DataFrame(
    cm_imbalanced,
    index=[
        'Actual Healthy Loans (low-risk)',
        'Actual Non-Healthy Loans (high-risk)',
    ],
    columns=[
        'Predicted Healthy Loans (low-risk)',
        'Predicted Non-Healthy Loans (high-risk)',
    ],
)
cm_imbalanced_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),18663,102
Actual Non-Healthy Loans (high-risk),56,563


In [12]:
# Build the imbalanced-learn classification report for the test predictions,
# then print it so the text table keeps its column alignment
imbalanced_report = classification_report_imbalanced(y_test, LR_predictions)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

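**Answer:** At first glance the model performs very well, with a balanced accuracy score of about 95%. However, the overall averages are flattered by the class imbalance: the test set contains 18,765 healthy loans but only 619 high-risk loans, so the 0.99 average precision and recall in the report mostly reflect the healthy class. For healthy loans (`0`) the model is nearly perfect, with a precision of 1.00 and a recall of 0.99. For high-risk loans (`1`) it is noticeably weaker: precision is 0.85, meaning about 15% of the loans it flags as high-risk are actually healthy (102 false positives), and recall is 0.91, meaning it misses about 9% of the truly high-risk loans (56 false negatives). In short, the model is better at recognizing healthy loans than at identifying high-risk ones.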

---