In [43]:
# Import the modules
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `loan_approval_dataset.csv` data from the `Resources` folder into a Pandas DataFrame.

In [44]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("loan_approval_dataset.csv")
df_loan_apps = pd.read_csv(file_path)

# Review the DataFrame
df_loan_apps.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,Income_to_Loan,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,loan_status1
0,1,2,Graduate,No,9600000,29900000,0.32107,12,778,2400000,17600000,22700000,8000000,Approved,1
1,2,0,Not Graduate,Yes,4100000,12200000,0.336066,8,417,2700000,2200000,8800000,3300000,Rejected,0
2,3,3,Graduate,No,9100000,29700000,0.306397,20,506,7100000,4500000,33300000,12800000,Rejected,0
3,4,3,Graduate,No,8200000,30700000,0.267101,8,467,18200000,3300000,23300000,7900000,Rejected,0
4,5,5,Not Graduate,Yes,9800000,24200000,0.404959,20,382,12400000,8200000,29400000,5000000,Rejected,0


In [45]:
column_types = df_loan_apps.dtypes
print(column_types)

loan_id                       int64
no_of_dependents              int64
education                    object
self_employed                object
income_annum                  int64
loan_amount                   int64
Income_to_Loan              float64
loan_term                     int64
cibil_score                   int64
residential_assets_value      int64
commercial_assets_value       int64
luxury_assets_value           int64
bank_asset_value              int64
loan_status                  object
loan_status1                  int64
dtype: object


In [46]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = ['loan_status', 'education', 'self_employed']
application_df_encoded = pd.get_dummies(df_loan_apps, columns=categorical_columns) 
print(application_df_encoded.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         12          778                   2400000                 17600000   
1          8          417                   2700000                  2200000   
2         20          506                   7100000                  4500000   
3          8          467                  18200000                  3300000   
4         20          382                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  loan_status1  loan_status_

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [48]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = application_df_encoded["loan_status_Approved"]

# Separate the X variable, the features
X = application_df_encoded.drop("loan_status_Approved", axis=1)


In [49]:
# Review the y variable Series
print(y.head())

0    1
1    0
2    0
3    0
4    0
Name: loan_status_Approved, dtype: uint8


In [50]:
# Review the X variable DataFrame
print(X.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         12          778                   2400000                 17600000   
1          8          417                   2700000                  2200000   
2         20          506                   7100000                  4500000   
3          8          467                  18200000                  3300000   
4         20          382                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  loan_status1  loan_status_

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [51]:
# Check the balance of our target values
label_counts = y.value_counts()
print(label_counts)

1    2656
0    1613
Name: loan_status_Approved, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [52]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3201, 17)
X_test shape: (1068, 17)
y_train shape: (3201,)
y_test shape: (1068,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [53]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

# Fit the model using training data
classifier.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [54]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Training Data Score: 0.6219931271477663
Testing Data Score: 0.6198501872659176


Unnamed: 0,Prediction,Actual
2381,1,1
1642,1,1
750,1,1
2726,1,0
3841,1,1
...,...,...
2107,1,0
57,1,1
3024,1,1
2277,1,1


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [55]:
from sklearn.metrics import accuracy_score
# Print the balanced_accuracy score of the model
accuracy_score(y_test, predictions)


0.6198501872659176

In [56]:
# Generate a confusion matrix for the model
confusion_mat = confusion_matrix(y_test, predictions)
# Print
print("Confusion Matrix:")
print(confusion_mat)

Confusion Matrix:
[[  0 404]
 [  2 662]]


In [57]:
# Print the classification report for the model
class_report = classification_report(y_test, predictions)
# Print
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       404
           1       0.62      1.00      0.77       664

    accuracy                           0.62      1068
   macro avg       0.31      0.50      0.38      1068
weighted avg       0.39      0.62      0.48      1068



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Classification Report is showing that model has 100% precision for predicting healthy loans and 85% precision for predicting high-risk loans. Model is able to identify 99% of healthy loans and 91% of high-risk loans. Balance between the 2 previous measures (f1-score) is showing 100% for healthy loans and 88% for high-risk loans. This is a good indication about how well the model is likely to perform in real life.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [58]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
rds = RandomOverSampler(random_state=1)
print(rds)
# Fit the original training data to the random_oversampler model
X_train_resampled, y_train_resampled = rds.fit_resample(X_train, y_train)
print("Resampled X_train shape:", X_train_resampled.shape)
print("Resampled y_train shape:", y_train_resampled.shape)

RandomOverSampler(random_state=1)
Resampled X_train shape: (3984, 17)
Resampled y_train shape: (3984,)


In [59]:
# Count the distinct values of the resampled labels data
label_counts_resampled = y_train_resampled.value_counts()
print(label_counts_resampled)

1    1992
0    1992
Name: loan_status_Approved, dtype: int64


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [60]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logreg_model_resampled = LogisticRegression(random_state=1)
print(logreg_model_resampled)
# Fit the model using the resampled training data
logreg_model_resampled.fit(X_train_resampled, y_train_resampled)

# Make a prediction using the testing data
predictions_resampled = logreg_model_resampled.predict(X_test)
print(predictions_resampled)

LogisticRegression(random_state=1)
[1 0 0 ... 1 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [61]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_resampled = balanced_accuracy_score(y_test, predictions_resampled)
print("Balanced Accuracy Score:", balanced_accuracy_resampled)

Balanced Accuracy Score: 0.5464779911726112


In [62]:
# Generate a confusion matrix for the model
confusion_mat_resampled = confusion_matrix(y_test, predictions_resampled)
print("Confusion Matrix for Resampled Model:")
print(confusion_mat_resampled)

Confusion Matrix for Resampled Model:
[[214 190]
 [290 374]]


In [63]:
# Print the classification report for the model
class_report_resampled = classification_report(y_test, predictions_resampled)
print("Classification Report for Resampled Model:")
print(class_report_resampled)

Classification Report for Resampled Model:
              precision    recall  f1-score   support

           0       0.42      0.53      0.47       404
           1       0.66      0.56      0.61       664

    accuracy                           0.55      1068
   macro avg       0.54      0.55      0.54      1068
weighted avg       0.57      0.55      0.56      1068



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Classification Report is showing that model has 100% precision for predicting healthy loans and 84% precision for predicting high-risk loans. Model is able to identify 99% of healthy loans and 99% of high-risk loans. Balance between the 2 previous measures (f1-score) is showing 100% for healthy loans and 91% for high-risk loans. This is a good indication that this model fit with oversampled data is likely to perform better than previous regression model in real life.