In [36]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
import joblib
from sklearn.preprocessing import StandardScaler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [37]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path('Resources/lending_data.csv')   
)

# Review the DataFrame
df.tail(30)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77506,18100.0,10.801,82200,0.635036,11,2,52200,1
77507,18700.0,11.087,84900,0.646643,12,2,54900,1
77508,21900.0,12.426,97500,0.692308,15,3,67500,1
77509,19900.0,11.565,89400,0.66443,13,2,59400,1
77510,17000.0,10.356,78100,0.615877,10,2,48100,1
77511,15900.0,9.872,73500,0.591837,9,2,43500,1
77512,19200.0,11.265,86600,0.65358,12,2,56600,1
77513,20600.0,11.881,92400,0.675325,14,3,62400,1
77514,16600.0,10.169,76300,0.606815,10,2,46300,1
77515,18600.0,11.045,84500,0.64497,12,2,54500,1


In [38]:
df_sorted = df.sort_values(by='loan_size', ascending=False)
df_sorted.head(20) 

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
1269,23800.0,13.235,105200,0.714829,16,3,75200,0
76662,23500.0,13.126,104100,0.711816,16,3,74100,1
77067,23400.0,13.058,103500,0.710145,16,3,73500,1
75310,23100.0,12.942,102400,0.707031,16,3,72400,1
77175,23100.0,12.927,102300,0.706745,16,3,72300,1
75738,22900.0,12.858,101600,0.704724,16,3,71600,1
75419,22800.0,12.807,101100,0.703264,16,3,71100,1
77449,22500.0,12.699,100100,0.7003,15,3,70100,1
37407,22500.0,12.708,100200,0.700599,15,3,70200,0
75547,22500.0,12.683,100000,0.7,15,3,70000,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [47]:
from sklearn.preprocessing import MinMaxScaler

# Separate the y variable, the labels
y = df['loan_status'].values.reshape(-1, 1)

# Separate the X variable, the features
X = df.drop(columns=['loan_status'])

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the features (X)
X = scaler.fit_transform(X)

In [48]:
# Review the y variable Series
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [49]:
# Review the X variable DataFrame
X[:5] 

array([[0.30319149, 0.30331872, 0.30319149, 0.60408607, 0.3125    ,
        0.33333333, 0.30319149],
       [0.18085106, 0.1805886 , 0.18085106, 0.43636541, 0.1875    ,
        0.        , 0.18085106],
       [0.21276596, 0.21452724, 0.21409574, 0.48856556, 0.1875    ,
        0.        , 0.21409574],
       [0.30319149, 0.30231684, 0.3018617 , 0.60257782, 0.3125    ,
        0.33333333, 0.3018617 ],
       [0.30851064, 0.30657483, 0.30585106, 0.60708551, 0.3125    ,
        0.33333333, 0.30585106]])

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [50]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [51]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [52]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [53]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[18655,   110],
       [   35,   584]])

In [54]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.94      0.89       619

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [55]:
# Save the trained model using joblib
joblib.dump(classifier, 'logistic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**
The logistic regression model predicts 0 (healthy loans) with high accuracy (precision of 1.00, recall of 0.99), while it predicts 1 (high-risk loans) with good accuracy (precision of 0.84, recall of 0.94). Overall, the model shows strong performance with high precision and recall for both labels.

Evaluating XG BOOST MODEL

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  # Add scaling
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load your data (Assuming df is already loaded)
# df = pd.read_csv('your_data.csv')  # Uncomment if loading from a CSV file

# Define the target column
target_column = 'loan_status'  # Replace with the correct column name

# Split the data into features (X) and target (y)
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1. Logistic Regression Model (with scaled data)
print("Training Logistic Regression...")

# Apply MinMaxScaler for Logistic Regression
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression on scaled data
logistic_model = LogisticRegression()
logistic_model.fit(X_train_scaled, y_train)

# Make predictions and evaluate the Logistic Regression model
y_pred_logistic = logistic_model.predict(X_test_scaled)
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
print(f'Logistic Regression Accuracy: {logistic_accuracy}')

# Save the Logistic Regression model and scaler
joblib.dump(logistic_model, 'logistic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')  # Save the scaler

### 2. XGBoost Model (with unscaled data)
print("Training XGBoost...")

# Train XGBoost on unscaled data
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)  # No need to scale for XGBoost

# Make predictions and evaluate the XGBoost model
y_pred_xgb = xgb_model.predict(X_test)  # Use unscaled data for XGBoost
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f'XGBoost Accuracy: {xgb_accuracy}')

# Save the XGBoost model
joblib.dump(xgb_model, 'xgboost_model.pkl')

# To load the models later, use:
# loaded_logistic_model = joblib.load('logistic_model.pkl')
# loaded_xgb_model = joblib.load('xgboost_model.pkl')
# loaded_scaler = joblib.load('scaler.pkl')

Training Logistic Regression...
Logistic Regression Accuracy: 0.9934227495486201
Training XGBoost...
XGBoost Accuracy: 0.9951637864328089


['xgboost_model.pkl']

In [61]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test set using XGBoost
y_pred_xgb = xgb_model.predict(X_test)  # Use unscaled test data for XGBoost

# Generate the classification report
print("Classification Report for XGBoost:")
print(classification_report(y_test, y_pred_xgb))

# Generate the confusion matrix
print("Confusion Matrix for XGBoost:")
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print(conf_matrix)


Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15011
           1       0.87      0.99      0.93       497

    accuracy                           1.00     15508
   macro avg       0.94      0.99      0.96     15508
weighted avg       1.00      1.00      1.00     15508

Confusion Matrix for XGBoost:
[[14939    72]
 [    3   494]]


---