In [33]:
#
# Import the modules
#

import pandas                                                 as pd
import shutil                                                 as shu
from   colorama                import Fore                    as fr
from   colorama                import Back                    as bk
from   colorama                import Style                   as st
from   pathlib                 import Path                    as pt
from   sklearn.metrics         import balanced_accuracy_score as bas
from   sklearn.metrics         import confusion_matrix        as cmx
from   sklearn.metrics         import classification_report   as csr
from   sklearn.model_selection import train_test_split        as tts
from   sklearn.linear_model    import LogisticRegression      as lre
from   imblearn.over_sampling  import RandomOverSampler       as ros
from   collections             import Counter                 as ctr

In [44]:
#
# Auxiliary functions
#
def printSeparator():
    w, h = shu.get_terminal_size()
    print(fr.GREEN + '-' * w + fr.WHITE)
    
def printStep(stepA, stepB):
    printSeparator()
    print(fr.BLUE   + stepA)
    print(fr.YELLOW + stepB)
    printSeparator()
    
def printDFinfo(name,dfName):
    printSeparator()
    print('Name: ',name)
    printSeparator()
    print(dfName.info())    
    printSeparator()
    print('Row Count :' + fr.RED)
    print(dfName.count(),fr.WHITE)
    printSeparator()
    print(dfName.head())
    printSeparator()
    
def printReport(reportName):
    printSeparator()
    print(fr.RED,'Classification Report',fr.WHITE)
    print(reportName)
    printSeparator()
    
def printBAS(basName):
    printSeparator()
    print(fr.WHITE + 'Balanced Accuracy Score : '+ fr.RED + str(basName))
    printSeparator()

Step 1: Split the Data into Training and Testing Sets



Step 1.1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
printStep('1 - Preparation','1.1 - Read CSV file, create DF and show head'                                          )

[32m--------------------------------------------------------------------------------[37m
[34m1 - Preparation
[33m1.1 - Read CSV file, create DF and show head
[32m--------------------------------------------------------------------------------[37m


In [4]:
#
# Read the CSV file from the Resources folder into a Pandas DataFrame
#

df_lending = pd.read_csv(pt('../Resources/lending_data.csv'))

#
# Review the DataFrame
#
printDFinfo('df_lending',df_lending)


[32m--------------------------------------------------------------------------------[37m
Name:  df_lending
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
loan_size           77536
interest_rate       77536
borrowe

Step 1.2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [5]:
printStep('1 - Preparation','1.2 - Create labels X and y')

[32m--------------------------------------------------------------------------------[37m
[34m1 - Preparation
[33m1.2 - Create labels X and y
[32m--------------------------------------------------------------------------------[37m


In [6]:
#
# Separate the data into labels and features
#
# Separate the y variable, the labels
#

y = df_lending['loan_status'];

#
# Separate the X variable, the features
#

X = df_lending.drop(columns=['loan_status']);

In [7]:
#
# Review the y variable Series
#

printDFinfo('y',y)

[32m--------------------------------------------------------------------------------[37m
Name:  y
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.series.Series'>
RangeIndex: 77536 entries, 0 to 77535
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
77536 non-null  int64
dtypes: int64(1)
memory usage: 605.9 KB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
77536 [37m
[32m--------------------------------------------------------------------------------[37m
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64
[32m--------------------------------------------------------------------------------[37m


In [8]:
#
# Review the X variable DataFrame
#
printDFinfo('X',X)

[32m--------------------------------------------------------------------------------[37m
Name:  X
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 4.1 MB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
loan_size           77536
interest_rate       77536
borrower_income     77536
debt_to_income      77536
num_of_acco

Step 1.3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [9]:
printStep('1 - Preparation','1.3 - Check Balance of y')

[32m--------------------------------------------------------------------------------[37m
[34m1 - Preparation
[33m1.3 - Check Balance of y
[32m--------------------------------------------------------------------------------[37m


In [10]:
#
# Check the balance of our target values
#
printSeparator()
print('total count:',fr.RED,y.count(),fr.WHITE)
print('Count by  0:',fr.RED,y.value_counts()[0],fr.WHITE)
print('Count by  1:',fr.RED,y.value_counts()[1],fr.WHITE)
print('Check      :',fr.RED,y.value_counts()[0]+y.value_counts()[1])
printSeparator()

[32m--------------------------------------------------------------------------------[37m
total count: [31m 77536 [37m
Count by  0: [31m 75036 [37m
Count by  1: [31m 2500 [37m
Check      : [31m 77536
[32m--------------------------------------------------------------------------------[37m


Step 1.4: Split the data into training and testing datasets by using `train_test_split`.

In [11]:
printStep('1 - Preparation ','1.4 - Train Test Split')

[32m--------------------------------------------------------------------------------[37m
[34m1 - Preparation 
[33m1.4 - Train Test Split
[32m--------------------------------------------------------------------------------[37m


In [13]:
#
# Split the data using train_test_split
# Assign a random_state of 1 to the function
#

X_train, X_test, y_train, y_test = tts(X, y, random_state=1)
printSeparator()
print('X_train Count          :',fr.RED,X_train.count()[0],fr.WHITE)
print('y_train Count          :',fr.RED,y_train.count(),fr.WHITE)
print('X_test  Count          :',fr.RED,X_test.count()[0],fr.WHITE)
print('y_test  Count          :',fr.RED,y_test.count(),fr.WHITE)
print('X_train + X_test Count :',fr.RED,X_train.count()[0]+X_test.count()[0],fr.WHITE)
print('y_train + y_test Count :',fr.RED,y_train.count()+y_test.count(),fr.WHITE)
printSeparator()

[32m--------------------------------------------------------------------------------[37m
X_train Count          : [31m 58152 [37m
y_train Count          : [31m 58152 [37m
X_test  Count          : [31m 19384 [37m
y_test  Count          : [31m 19384 [37m
X_train + X_test Count : [31m 77536 [37m
y_train + y_test Count : [31m 77536 [37m
[32m--------------------------------------------------------------------------------[37m


Step 2. Create a Logistic Regression Model with the Original Data

In [14]:
printStep('2 - Logistic Regression','2.1 - Create the Logistic Regression Model')

[32m--------------------------------------------------------------------------------[37m
[34m2 - Logistic Regression
[33m2.1 - Create the Logistic Regression Model
[32m--------------------------------------------------------------------------------[37m


Step 2.1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).


In [15]:
# 
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
#

logistic_regression_model = lre(solver='lbfgs', random_state=1)

#
# Fit the model using training data
#

lr_model                  = logistic_regression_model.fit(X_train, y_train)

Step 2.2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [16]:
printStep('2 - Logistic Regression','2.2 - Make Predictions using the Testing Data')

[32m--------------------------------------------------------------------------------[37m
[34m2 - Logistic Regression
[33m2.2 - Make Predictions using the Testing Data
[32m--------------------------------------------------------------------------------[37m


In [17]:
# 
# Make a prediction using the testing data
#

test_predictions    = logistic_regression_model.predict(X_test)
df_test_predictions = pd.DataFrame({'Predictions': test_predictions, 'Actual': y_test})
printDFinfo('df_test_predictions',df_test_predictions)

[32m--------------------------------------------------------------------------------[37m
Name:  df_test_predictions
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.frame.DataFrame'>
Index: 19384 entries, 60914 to 2793
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Predictions  19384 non-null  int64
 1   Actual       19384 non-null  int64
dtypes: int64(2)
memory usage: 454.3 KB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
Predictions    19384
Actual         19384
dtype: int64 [37m
[32m--------------------------------------------------------------------------------[37m
       Predictions  Actual
60914            0       0
36843            0       0
1966             0       0
70137            0       0
27237            0       0
[32m-----------------------------------------------------

Step 2.3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [18]:
printStep('2 - Logistic Regression','2.3 Calculate, Generate, Print metrics for the model')

[32m--------------------------------------------------------------------------------[37m
[34m2 - Logistic Regression
[33m2.3 Calculate, Generate, Print metrics for the model
[32m--------------------------------------------------------------------------------[37m


In [45]:
#
# Print the balanced_accuracy score of the model
#
printBAS(bas(y_test, test_predictions))


[32m--------------------------------------------------------------------------------[37m
[37mBalanced Accuracy Score : [31m0.9520479254722232
[32m--------------------------------------------------------------------------------[37m


In [42]:
# 
# Generate a confusion matrix for the model
#

cf_test_matrix = cmx(y_test, test_predictions)
printSeparator()
print('cf test matrix :',fr.RED)
print(cf_test_matrix)
printSeparator()                      

[32m--------------------------------------------------------------------------------[37m
cf test matrix : [31m
[[18663   102]
 [   56   563]]
[32m--------------------------------------------------------------------------------[37m


In [21]:
#
# Print the classification report for the model
#

testing_report = csr(y_test, test_predictions);
printReport(testing_report)


[32m--------------------------------------------------------------------------------[37m
[31m Classification Report [37m
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384

[32m--------------------------------------------------------------------------------[37m


Step 2.4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** `The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels`

In [22]:
printStep('2 - Logistic Regression','2.4 - Qualify the Model')
print('The logistic regression model was'+ fr.RED +' 95% ' + fr.WHITE +'accurate at predicting the healthy vs high-risk loan labels')
printSeparator()

[32m--------------------------------------------------------------------------------[37m
[34m2 - Logistic Regression
[33m2.4 - Qualify the Model
[32m--------------------------------------------------------------------------------[37m
The logistic regression model was[31m 95% [37maccurate at predicting the healthy vs high-risk loan labels
[32m--------------------------------------------------------------------------------[37m


Step 3. Predict a Logistic Regression Model with Resampled Training Data


Step 3.1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [23]:
printStep('3 - Logistic Regression Model with Resampled Training Data','3.1 - Resample the training data with the RandomOversampler')

[32m--------------------------------------------------------------------------------[37m
[34m3 - Logistic Regression Model with Resampled Training Data
[33m3.1 - Resample the training data with the RandomOversampler
[32m--------------------------------------------------------------------------------[37m


In [24]:
#
# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
#

ros = ros(random_state=1);

#
# Fit the original training data to the random_oversampler model
#

X_ros_model, y_ros_model = ros.fit_resample(X,y)

printDFinfo('X_ros_model',X_ros_model)
printDFinfo('y_ros_model',y_ros_model)

[32m--------------------------------------------------------------------------------[37m
Name:  X_ros_model
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150072 entries, 0 to 150071
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   loan_size         150072 non-null  float64
 1   interest_rate     150072 non-null  float64
 2   borrower_income   150072 non-null  int64  
 3   debt_to_income    150072 non-null  float64
 4   num_of_accounts   150072 non-null  int64  
 5   derogatory_marks  150072 non-null  int64  
 6   total_debt        150072 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 8.0 MB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
loan_size           150072
interest_rate       150072
borrower_income     150072
debt_to_incom

In [25]:
# 
# Count the distinct values of the resampled labels data
#

printSeparator()
print('X_ros_model ',ctr(X_ros_model))
print('y_ros_model ',ctr(y_ros_model))
printSeparator()

[32m--------------------------------------------------------------------------------[37m
X_ros_model  Counter({'loan_size': 1, 'interest_rate': 1, 'borrower_income': 1, 'debt_to_income': 1, 'num_of_accounts': 1, 'derogatory_marks': 1, 'total_debt': 1})
y_ros_model  Counter({0: 75036, 1: 75036})
[32m--------------------------------------------------------------------------------[37m


Step 3.2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [26]:
printStep('3 - Logistic Regression Model with Resampled Training Data','3.2 - Train a Logistic Regression Model using the resampled data')

[32m--------------------------------------------------------------------------------[37m
[34m3 - Logistic Regression Model with Resampled Training Data
[33m3.2 - Train a Logistic Regression Model using the resampled data
[32m--------------------------------------------------------------------------------[37m


In [27]:
# 
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
#

classifier = lre(solver='lbfgs', random_state=1)

#
# Fit the model using the resampled training data
#

classifier.fit(X_ros_model, y_ros_model)

#
# Make a prediction using the testing data
#

predictions    = classifier.predict(X_ros_model);
df_predictions = pd.DataFrame({'Predictions': predictions, 'Actual': y_ros_model});
printDFinfo('df_predictions',df_predictions)

[32m--------------------------------------------------------------------------------[37m
Name:  df_predictions
[32m--------------------------------------------------------------------------------[37m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150072 entries, 0 to 150071
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Predictions  150072 non-null  int64
 1   Actual       150072 non-null  int64
dtypes: int64(2)
memory usage: 2.3 MB
None
[32m--------------------------------------------------------------------------------[37m
Row Count :[31m
Predictions    150072
Actual         150072
dtype: int64 [37m
[32m--------------------------------------------------------------------------------[37m
   Predictions  Actual
0            0       0
1            0       0
2            0       0
3            0       0
4            0       0
[32m--------------------------------------------------------------------------

Step 3.3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [28]:
printStep('3 - Logistic Regression Model with Resampled Training Data','3.3 - Calculate, Generate, Print metrics for the model')

[32m--------------------------------------------------------------------------------[37m
[34m3 - Logistic Regression Model with Resampled Training Data
[33m3.3 - Calculate, Generate, Print metrics for the model
[32m--------------------------------------------------------------------------------[37m


In [46]:
# 
# Print the balanced_accuracy score of the model
#

printBAS(bas(y_ros_model, predictions))

[32m--------------------------------------------------------------------------------[37m
[37mBalanced Accuracy Score : [31m0.9945026387334079
[32m--------------------------------------------------------------------------------[37m


In [30]:
# Generate a confusion matrix for the model
cf_matrix = cmx(y_ros_model, predictions)
printSeparator()
print('CF Matrix :',fr.RED)
print(cf_matrix)
printSeparator()

[32m--------------------------------------------------------------------------------[37m
CF Matrix : [31m
[[74614   422]
 [  403 74633]]
[32m--------------------------------------------------------------------------------[37m


In [31]:
# Print the classification report for the model
report = csr(y_ros_model, predictions)
printReport(report)


[32m--------------------------------------------------------------------------------[37m
[31m Classification Report [37m
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     75036
           1       0.99      0.99      0.99     75036

    accuracy                           0.99    150072
   macro avg       0.99      0.99      0.99    150072
weighted avg       0.99      0.99      0.99    150072

[32m--------------------------------------------------------------------------------[37m


Step 3.4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** `The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)`

In [32]:
printStep('3 - Logistic Regression Model with Resampled Training Data','3.4 - Qualify the Model ')
print('The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)')
printSeparator()

[32m--------------------------------------------------------------------------------[37m
[34m3 - Logistic Regression Model with Resampled Training Data
[33m3.4 - Qualify the Model 
[32m--------------------------------------------------------------------------------[37m
The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)
[32m--------------------------------------------------------------------------------[37m
