In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_df = pd.read_csv(Path('Resources/lending_data.csv'))

# Review the DataFrame
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
target_labels=['Healthy Loan', 'High-Risk Loan']
y = lending_df['loan_status']

# Separate the X variable, the features
X = lending_df.drop(columns='loan_status')

In [4]:
# Review the y variable Series
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [5]:
# Print count of each loan status
lending_df['loan_status'].value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

In [6]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [8]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Import the StandardScaler module
from sklearn.preprocessing import StandardScaler
# Create the StandardScaler instance
scaler = StandardScaler()

In [12]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [13]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=300)

# Create the decision tree classifier instance
decision_tree_model = DecisionTreeClassifier()

# Create the random forest classifier instance
random_forest_model = RandomForestClassifier(n_estimators=500, random_state=78)

# Create the KNeighbors classifier instance
kneighbors_model = KNeighborsClassifier()

# Create the GaussianNB classifier instance
gaussian_model = GaussianNB()

# Create the LinearSVC classifier instance
linear_svc_model = LinearSVC()

# Create the SVC classifier instance
non_linear_svc_model = SVC(kernel='rbf')

In [14]:
# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

# Fit the model
dt_model = decision_tree_model.fit(X_train_scaled, y_train)

# Fit the model
rf_model = random_forest_model.fit(X_train_scaled, y_train)

# Fit the model
knn_model = kneighbors_model.fit(X_train_scaled, y_train)

# Fit the model
gnb_model = gaussian_model.fit(X_train_scaled, y_train)

# Fit the model
lsvc_model = linear_svc_model.fit(X_train_scaled, y_train)

# Fit the model
nlsvc_model = non_linear_svc_model.fit(X_train_scaled, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [15]:
# Make a prediction using the testing data
lr_predictions = lr_model.predict(X_test_scaled)

# Make a prediction using the testing data
dt_predictions = dt_model.predict(X_test_scaled)

# Make a prediction using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

# Make a prediction using the testing data
knn_predictions = knn_model.predict(X_test_scaled)

# Make a prediction using the testing data
gnb_predictions = gnb_model.predict(X_test_scaled)

# Make a prediction using the testing data
lsvc_predictions = lsvc_model.predict(X_test_scaled)

# Make a prediction using the testing data
nlsvc_predictions = nlsvc_model.predict(X_test_scaled)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [16]:
# Function to generate a confusion matrix
def generate_confusion_matrix(y_true, y_pred, model_name="Model"):
    # Generate the confusion matrix
    matrix = confusion_matrix(y_true, y_pred)
    
    # Create a DataFrame with labeled rows and columns
    matrix_df = pd.DataFrame(
        matrix, 
        index=['Actual Negative', 'Actual Positive'], 
        columns=['Predicted Negative', 'Predicted Positive']
    )
    
    # Print the confusion matrix
    print(f"{model_name} Confusion Matrix:")
    return matrix_df  # Return the DataFrame if further processing is needed


In [17]:
# Function to generate a classification report
def generate_classification_report(y_true, y_pred, target_names=None, model_name="Model"):
    # Generate the classification report as a dictionary
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    
    # Convert the dictionary to a DataFrame and format numbers to two decimal places
    report_df = pd.DataFrame(report).transpose()
    report_df = report_df.round(2)  # Round all values to 2 decimal places

    # Print the classification report
    print(f"{model_name} Classification Report:")
    return report_df  # Return the DataFrame for further analysis if needed


In [21]:
# Create the confusion matrix for the Logistic Regression model
generate_confusion_matrix(y_test, lr_predictions, model_name="Logistic Regression")

Logistic Regression Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18652,113
Actual Positive,9,610


In [20]:
# Create the confusion matrix for the Decision Tree model
generate_confusion_matrix(y_test, dt_predictions, model_name="Decision Tree")

Decision Tree Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18668,97
Actual Positive,93,526


In [19]:
# Create the confusion matrix for the Random Forest model
generate_confusion_matrix(y_test, rf_predictions, model_name="Random Forest")

Random Forest Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18666,99
Actual Positive,61,558


In [18]:
# Create the confusion matrix for the KNeighbors model
generate_confusion_matrix(y_test, knn_predictions, model_name="KNeighbors")

KNeighbors Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18651,114
Actual Positive,16,603


In [22]:
# Create the confusion matrix for the GaussianNB model
generate_confusion_matrix(y_test, gnb_predictions, model_name="GaussianNB")

GaussianNB Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18649,116
Actual Positive,4,615


In [23]:
# Create the confusion matrix for the LinearSVC model
generate_confusion_matrix(y_test, lsvc_predictions, model_name="LinearSVC")

LinearSVC Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18652,113
Actual Positive,7,612


In [24]:
# Create the confusion matrix for the SVC model
generate_confusion_matrix(y_test, nlsvc_predictions, model_name="SVC")

SVC Confusion Matrix:


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,18649,116
Actual Positive,4,615


In [25]:
# Create the classification report for the Logistic Regression model
generate_classification_report(y_test, lr_predictions, target_names=target_labels, model_name="Logistic Regression")

Logistic Regression Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.84,0.99,0.91,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.99,0.95,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [26]:
# Create the classification report for the Decision Tree model
generate_classification_report(y_test, dt_predictions, target_names=target_labels, model_name="Decision Tree")

Decision Tree Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,0.99,18765.0
High-Risk Loan,0.84,0.85,0.85,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.92,0.92,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [27]:
# Create the classification report for the Random Forest model
generate_classification_report(y_test, rf_predictions, target_names=target_labels, model_name="Random Forest")

Random Forest Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.85,0.9,0.87,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.95,0.94,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [28]:
# Create the classification report for the KNeighbors model
generate_classification_report(y_test, knn_predictions, target_names=target_labels, model_name="KNeighbors")

KNeighbors Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.84,0.97,0.9,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.98,0.95,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [29]:
# Create the classification report for the GaussianNB model
generate_classification_report(y_test, gnb_predictions, target_names=target_labels, model_name="GaussianNB")

GaussianNB Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.84,0.99,0.91,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.99,0.95,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [30]:
# Create the classification report for the LinearSVC model
generate_classification_report(y_test, lsvc_predictions, target_names=target_labels, model_name="LinearSVC")

LinearSVC Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.84,0.99,0.91,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.99,0.95,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [31]:
# Create the classification report for the SVC model
generate_classification_report(y_test, nlsvc_predictions, target_names=target_labels, model_name="SVC")

SVC Classification Report:


Unnamed: 0,precision,recall,f1-score,support
Healthy Loan,1.0,0.99,1.0,18765.0
High-Risk Loan,0.84,0.99,0.91,619.0
accuracy,0.99,0.99,0.99,0.99
macro avg,0.92,0.99,0.95,19384.0
weighted avg,0.99,0.99,0.99,19384.0


In [36]:
from sklearn.metrics import accuracy_score

print("Gaussian NB is %f percent accurate" % (accuracy_score(gnb_predictions, y_test)*100))
print("Linear SVMs is %f percent accurate" % (accuracy_score(lsvc_predictions, y_test)*100))
print("Non Linear SVMs is %f percent accurate" % (accuracy_score(nlsvc_predictions, y_test)*100))
print("Logistic Regression is %f percent accurate" % (accuracy_score(lr_predictions, y_test)*100))
print("KNeighbors is %f percent accurate" % (accuracy_score(knn_predictions, y_test)*100))
print("Random Forests is %f percent accurate" % (accuracy_score(rf_predictions, y_test)*100))
print("Decision Trees is %f percent accurate" % (accuracy_score(dt_predictions, y_test)*100))


Gaussian NB is 99.380933 percent accurate
Linear SVMs is 99.380933 percent accurate
Non Linear SVMs is 99.380933 percent accurate
Logistic Regression is 99.370615 percent accurate
KNeighbors is 99.329344 percent accurate
Random Forests is 99.174577 percent accurate
Decision Trees is 99.019810 percent accurate


### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Based on the classification report, the logistic regression model performs very well overall, with an accuracy of **0.99**. Here’s a more detailed breakdown for each label:

### Healthy Loan
- **Precision**: 1.00, indicating that all predictions labeled as "Healthy Loan" are highly accurate with virtually no false positives.
- **Recall**: 0.99, meaning the model correctly identifies 99% of actual "Healthy Loan" cases.
- **F1-Score**: 1.00, which combines precision and recall, showing that the model is extremely effective at predicting "Healthy Loan" cases.

**Interpretation**: The model predicts "Healthy Loan" cases almost perfectly.

### High-Risk Loan
- **Precision**: 0.84, so about 84% of "High-Risk Loan" predictions are correct. This leaves room for some false positives (loans predicted as "High-Risk" that aren’t).
- **Recall**: 0.99, meaning the model identifies 99% of actual "High-Risk Loan" cases correctly.
- **F1-Score**: 0.91, showing a strong but slightly less effective performance for "High-Risk Loan" predictions compared to "Healthy Loan."

**Interpretation**: The model performs well in identifying "High-Risk Loan" cases, though slightly less accurately than "Healthy Loan."

### Overall Assessment
- **Accuracy**: 0.99, indicating that the model is highly effective in predicting both "Healthy Loan" and "High-Risk Loan" cases.
- **Macro Average**: Precision of 0.92, recall of 0.99, and F1-score of 0.95 across both classes, indicating strong performance even when accounting for the class imbalance.
  
In conclusion, the model is extremely accurate for "Healthy Loan" cases and performs very well for "High-Risk Loan" cases, though there’s slightly more room for improvement in precision for the high-risk predictions.

---