In [None]:
#
# Import the modules
#

import pandas                                                 as pd
import shutil                                                 as shu
from   colorama                import Fore                    as fr
from   colorama                import Back                    as bk
from   colorama                import Style                   as st
from   pathlib                 import Path                    as pt
from   sklearn.metrics         import balanced_accuracy_score as bas
from   sklearn.metrics         import confusion_matrix        as cmx
from   sklearn.metrics         import classification_report   as csr
from   sklearn.model_selection import train_test_split        as tts
from   sklearn.linear_model    import LogisticRegression      as lre
from   imblearn.over_sampling  import RandomOverSampler       as ros
from   collections             import Counter                 as ctr

In [None]:
def printSeparator():
    """Print a green, terminal-wide rule of underscores, then switch the text colour to white."""
    columns = shu.get_terminal_size().columns
    # The colour code is printed on a line of its own; the rule follows on the next line.
    print(fr.GREEN)
    print('_' * columns, fr.WHITE)

In [45]:
def printStep(stepA, stepB):
    """Print the two step labels, one per line, framed above and below by a separator rule."""
    printSeparator()
    for label in (stepA, stepB):
        print(label)
    printSeparator()

Step 1: Split the Data into Training and Testing Sets



Step 1.1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [48]:
# Banner for step 1 (the empty second label prints as a blank line)
printStep('1','')

[32m
________________________________________________________________________________ [37m
1

[32m
________________________________________________________________________________ [37m


In [49]:
# Banner for step 1.1
printStep('1','1.1')

[32m
________________________________________________________________________________ [37m
1
1.1
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Read the CSV file from the Resources folder into a Pandas DataFrame
#

df_lending = pd.read_csv(pt('../Resources/lending_data.csv'))

#
# Review the DataFrame
#
# len() reports the number of rows directly. The previous count()[0] only
# counted the non-null values of the first column and relied on positional
# integer indexing of a label-indexed Series, which pandas has deprecated.
#
printSeparator()
print('Row Count :',fr.RED,len(df_lending),fr.WHITE)
print('')
print(df_lending.head())
printSeparator()

Step 1.2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [47]:
# Banner for step 1.2
printStep('1','1.2')

[32m
________________________________________________________________________________ [37m
1
1.2
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Split the DataFrame into predictors and target
#
# X : every column except the label column (the features)
# y : the 'loan_status' column (the labels)
#

X = df_lending.drop(columns='loan_status')
y = df_lending['loan_status']

In [None]:
#
# Review the y variable Series (first rows only)
#
printSeparator()
print('Values for Y :')
print(fr.WHITE)
print(y.head())
printSeparator()

In [None]:
#
# Preview the first rows of the features DataFrame (X)
#
printSeparator()
# One print call with a newline separator: heading, colour switch, then the preview
print('Values of X :', fr.WHITE, X.head(), sep='\n')
printSeparator()

Step 1.3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [53]:
# Banner for step 1.3
printStep('1','1.3')

[32m
________________________________________________________________________________ [37m
1
1.3
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Check the balance of our target values
#
# value_counts() is computed once and reused for every printed line.
# NOTE(review): the [0] / [1] lookups assume loan_status is encoded as the
# integers 0 and 1 - confirm against the CSV.
#
label_counts = y.value_counts()

printSeparator()
print('total count:',fr.RED,y.count(),fr.WHITE)
print('Count by  0:',fr.RED,label_counts[0],fr.WHITE)
print('Count by  1:',fr.RED,label_counts[1],fr.WHITE)
# The colour is reset here as well, matching the lines above
print('Check      :',fr.RED,label_counts[0]+label_counts[1],fr.WHITE)
printSeparator()

Step 1.4: Split the data into training and testing datasets by using `train_test_split`.

In [54]:
# Banner for step 1.4
printStep('1','1.4')

[32m
________________________________________________________________________________ [37m
1
1.4
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Split the data using train_test_split
# Assign a random_state of 1 to the function
#

X_train, X_test, y_train, y_test = tts(X, y, random_state=1)

#
# Report the size of each split
#
# colorama's Fore is imported under the alias `fr` in this notebook, so the
# bare name `Fore` is undefined on a fresh kernel. len() gives the row count
# without depending on positional indexing of the Series returned by count().
#
printSeparator()
print('X_train Count          :',fr.RED,len(X_train),fr.WHITE)
print('y_train Count          :',fr.RED,y_train.count(),fr.WHITE)
print('X_test  Count          :',fr.RED,len(X_test),fr.WHITE)
print('y_test  Count          :',fr.RED,y_test.count(),fr.WHITE)
print('X_train + X_test Count :',fr.RED,len(X_train)+len(X_test),fr.WHITE)
printSeparator()

Step 2. Create a Logistic Regression Model with the Original Data

In [50]:
# Banner for step 2 (the empty second label prints as a blank line)
printStep('2','')

[32m
________________________________________________________________________________ [37m
2

[32m
________________________________________________________________________________ [37m


In [51]:
# Banner for step 2.1
printStep('2','2.1')

[32m
________________________________________________________________________________ [37m
2
2.1
[32m
________________________________________________________________________________ [37m


Step 2.1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).


In [None]:
#
# Build the baseline Logistic Regression classifier
# (random_state fixed at 1) and train it on the training split
#

logistic_regression_model = lre(random_state=1, solver='lbfgs')

# fit() hands back the estimator it was called on, so lr_model and
# logistic_regression_model refer to the same fitted object
lr_model = logistic_regression_model.fit(X_train, y_train)

Step 2.2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [55]:
# Banner for step 2.2
printStep('2','2.2')

[32m
________________________________________________________________________________ [37m
2
2.2
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Predict the labels of the held-out features with the fitted model
#

test_predictions    = logistic_regression_model.predict(X_test)

# Side-by-side table of predicted and actual labels
df_test_predictions = pd.DataFrame(
    {
        'Predictions': test_predictions,
        'Actual': y_test,
    }
)
printSeparator()
print(df_test_predictions)
printSeparator()

Step 2.3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [56]:
# Banner for step 2.3
printStep('2','2.3')

[32m
________________________________________________________________________________ [37m
2
2.3
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Print the balanced_accuracy score of the model
# (actual test labels y_test versus test_predictions made on X_test)
#
printSeparator()
print(f"The balanced accuracy score of the model is: {bas(y_test, test_predictions)}")
printSeparator()

In [None]:
#
# Generate a confusion matrix for the model
#
# colorama's Fore is imported under the alias `fr`; the bare name `Fore`
# is undefined on a fresh kernel.
#

cf_test_matrix = cmx(y_test, test_predictions)
printSeparator()
print('cf test matrix :',fr.RED)
print(cf_test_matrix)
printSeparator()

In [None]:
#
# Print the classification report for the model
#
# colorama's Fore is imported under the alias `fr`; the bare name `Fore`
# is undefined on a fresh kernel.
#

testing_report = csr(y_test, test_predictions)
printSeparator()
print(fr.RED,'Classification Report',fr.WHITE)
print(testing_report)
printSeparator()

Step 2.4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** `The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels`

In [59]:
# Banner for step 2.4, followed by the written answer and a closing rule
printStep('2','2.4')
print('The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels')
printSeparator()

[32m
________________________________________________________________________________ [37m
2
2.4
[32m
________________________________________________________________________________ [37m
The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels
[32m
________________________________________________________________________________ [37m


Step 3. Fit a Logistic Regression Model with Resampled Training Data and Make Predictions


Step 3.1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [62]:
# Banner for step 3, then the banner for sub-step 3.1
printStep('3','')
printStep('3','3.1')

[32m
________________________________________________________________________________ [37m
3

[32m
________________________________________________________________________________ [37m
[32m
________________________________________________________________________________ [37m
3
3.1
[32m
________________________________________________________________________________ [37m


In [None]:
#
# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
#
# The instance gets its own name: rebinding `ros` would shadow the imported
# RandomOverSampler class and make this cell fail when it is run a second time.
#

random_oversampler = ros(random_state=1)

#
# Fit the original training data to the random_oversampler model
#
# Only the training split is resampled, so rows from the held-out test split
# are never duplicated into the data the model learns from.
#

X_ros_model, y_ros_model = random_oversampler.fit_resample(X_train, y_train)

In [None]:
#
# Count the distinct values of the resampled labels data
#
# Counter over a DataFrame iterates its column labels (each counted once),
# which says nothing about the rows, so the features are summarised by their
# shape instead. The label counts are also checked for equality, as the step
# asks for confirmation that both classes have the same number of data points.
#

resampled_label_counts = ctr(y_ros_model)
assert len(set(resampled_label_counts.values())) == 1, 'resampled labels are not balanced'

printSeparator()
print('X_ros_model ',X_ros_model.shape)
print('y_ros_model ',resampled_label_counts)
printSeparator()

Step 3.2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [63]:
# Banner for step 3.2
printStep('3','3.2')

[32m
________________________________________________________________________________ [37m
3
3.2
[32m
________________________________________________________________________________ [37m


In [None]:
# 
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
#

classifier = lre(solver='lbfgs', random_state=1)

#
# Fit the model using the resampled training data
#

classifier.fit(X_ros_model, y_ros_model)

#
# Make a prediction on the resampled data the model was just fitted on
#
# NOTE(review): these predictions are for X_ros_model, not for the held-out
# X_test, so the scores computed from `predictions` measure fit on the data
# the model was trained on. The metric cells that follow compare `predictions`
# with y_ros_model; switching to X_test / y_test has to be done in this cell
# and in those cells together.
#

predictions    = classifier.predict(X_ros_model);
df_predictions = pd.DataFrame({'Predictions': predictions, 'Actual': y_ros_model});
printSeparator()
print(df_predictions)
printSeparator()


Step 3.3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [64]:
# Banner for step 3.3
printStep('3','3.3')

[32m
________________________________________________________________________________ [37m
3
3.3
[32m
________________________________________________________________________________ [37m


In [None]:
# 
# Print the balanced_accuracy score of the model
# (resampled labels y_ros_model versus the predictions made on X_ros_model)
#
printSeparator()
print(f"The balanced accuracy score of the model is: {bas(y_ros_model, predictions)}")
printSeparator()

In [None]:
# Generate a confusion matrix for the model
# (colorama's Fore is imported under the alias `fr`; the bare name `Fore`
# is undefined on a fresh kernel)
cf_matrix = cmx(y_ros_model, predictions)
printSeparator()
print('CF Matrix :',fr.RED)
print(cf_matrix)
printSeparator()

In [None]:
# Print the classification report for the model
# (colorama's Fore is imported under the alias `fr`; the bare name `Fore`
# is undefined on a fresh kernel)
report = csr(y_ros_model, predictions)
printSeparator()
print(fr.RED,'Classification Report',fr.WHITE)
print(report)
printSeparator()

Step 3.4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** `The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)`

In [65]:
# Banner for step 3.4, followed by the written answer and a closing rule
printStep('3','3.4')
print('The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)')
printSeparator()

[32m
________________________________________________________________________________ [37m
3
3.4
[32m
________________________________________________________________________________ [37m
The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)
[32m
________________________________________________________________________________ [37m
