<a href="https://colab.research.google.com/github/jerryorajekwe/Predicting-Loan-Default-Risk-with-Machine-Learning-Models/blob/main/loandefaultworkings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for preprocessing and imputation
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Libraries for model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# For handling imbalanced data
from imblearn.over_sampling import SMOTE

# Additional utilities
from collections import Counter

In [3]:
# Load the dataset
loan_data = pd.read_csv('/content/drive/MyDrive/loan.csv', low_memory=False)

In [4]:
# 30% random sample
loan_data_sample = loan_data.sample(frac=0.3, random_state=42)

In [5]:
# Save the sampled dataset
loan_data_sample.to_csv('loan_data_sample.csv', index=False)

In [6]:
# Display information about the sampled dataset
print("Sampled Dataset Information:")
loan_data_sample.info()

Sampled Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 678200 entries, 1758049 to 2038627
Columns: 145 entries, id to settlement_term
dtypes: float64(105), int64(4), object(36)
memory usage: 755.4+ MB


In [7]:
print("First 5 Rows of the Dataset:")
loan_data_sample.head()

First 5 Rows of the Dataset:


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
1758049,,,35000,35000,35000.0,36 months,12.12,1164.51,B,B3,...,,,Cash,N,,,,,,
686533,,,30000,30000,30000.0,60 months,10.75,648.54,B,B4,...,,,Cash,N,,,,,,
900721,,,15000,15000,15000.0,36 months,7.49,466.53,A,A4,...,,,Cash,N,,,,,,
1727912,,,24000,24000,24000.0,60 months,21.15,651.31,E,E2,...,,,Cash,N,,,,,,
539691,,,14400,14400,14400.0,36 months,8.59,455.18,A,A5,...,,,Cash,N,,,,,,


In [8]:
print("Table 1: Descriptive Statistics:")
loan_data_sample.describe().round(2)

Table 1: Descriptive Statistics:


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,url,dti,...,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,0.0,678200.0,678200.0,678200.0,678200.0,678200.0,678199.0,0.0,677684.0,...,3120.0,3120.0,3120.0,3120.0,2484.0,3120.0,3120.0,9997.0,9997.0,9997.0
mean,,,15063.79,15058.51,15040.42,13.1,446.3,77946.48,,18.85,...,3.0,151.33,3.0,13.77,444.48,11385.3,194.32,5028.51,47.83,13.09
std,,,9189.32,9187.32,9191.01,4.83,267.19,74649.5,,14.53,...,0.0,125.54,0.0,9.75,366.31,7421.83,201.58,3699.03,7.13,8.07
min,,,500.0,500.0,0.0,5.31,15.69,0.0,,-1.0,...,3.0,1.61,3.0,0.0,10.17,193.98,0.01,107.0,0.45,0.0
25%,,,8000.0,8000.0,8000.0,9.49,251.98,46000.0,,11.91,...,3.0,57.76,3.0,5.0,171.97,5531.34,43.69,2240.75,45.0,6.0
50%,,,13000.0,13000.0,12875.0,12.62,378.59,65000.0,,17.85,...,3.0,116.4,3.0,15.0,344.52,9919.18,128.99,4179.0,45.0,14.0
75%,,,20000.0,20000.0,20000.0,15.99,593.82,93000.0,,24.5,...,3.0,208.74,3.0,23.0,609.23,15698.72,286.32,6831.0,50.0,18.0
max,,,40000.0,40000.0,40000.0,30.99,1717.63,10999200.0,,999.0,...,3.0,893.63,3.0,31.0,2680.89,40149.35,1275.36,30000.0,184.36,50.0


In [9]:
# Get all features
features = loan_data_sample.columns.tolist()
print("All Features:", features)

# Separate numerical and categorical features
numerical_features = loan_data_sample.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = loan_data_sample.select_dtypes(include=['object']).columns.tolist()

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

All Features: ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il

In [10]:
columns_to_drop = [
    # IDs & Metadata
    'id', 'member_id', 'url', 'desc', 'title', 'zip_code', 'addr_state',

    # Loan Application Details
    'policy_code', 'pymnt_plan', 'hardship_flag', 'debt_settlement_flag',

    # Hardship & Settlement Details
    'hardship_type', 'hardship_reason', 'hardship_status',
    'hardship_start_date', 'hardship_end_date', 'hardship_loan_status',
    'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
    'settlement_amount', 'settlement_percentage', 'settlement_term',

    # Payment History & Unnecessary Dates
    'next_pymnt_d', 'last_credit_pull_d', 'issue_d', 'earliest_cr_line',

    # Extra Financial Metrics
    'out_prncp', 'out_prncp_inv', 'total_rec_late_fee', 'collection_recovery_fee',

    # Redundant Features
    'total_pymnt_inv', 'total_rec_int', 'recoveries'
]

# Drop the columns
loan_data_sample = loan_data_sample.drop(columns=columns_to_drop)

# Save the cleaned dataset
loan_data_sample.to_csv('loan_data_cleaned.csv', index=False)

print("✅ Successfully dropped unnecessary columns and saved cleaned dataset!")

✅ Successfully dropped unnecessary columns and saved cleaned dataset!


In [11]:
# Display information about the cleaned dataset
print("Cleaned Dataset Information:")
loan_data_sample.info()

Cleaned Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 678200 entries, 1758049 to 2038627
Columns: 111 entries, loan_amnt to disbursement_method
dtypes: float64(92), int64(3), object(16)
memory usage: 579.5+ MB


In [12]:
# Number of duplicate records
loan_data_sample.duplicated().sum()

np.int64(0)

In [13]:
# Missing values in each column
loan_data_sample.isnull().sum()

Unnamed: 0,0
loan_amnt,0
funded_amnt,0
funded_amnt_inv,0
term,0
int_rate,0
...,...
hardship_dpd,675080
orig_projected_additional_accrued_interest,675716
hardship_payoff_balance_amount,675080
hardship_last_payment_amount,675080


In [14]:
# Drop columns with high missing values
threshold = 0.5
columns_to_drop = loan_data_sample.columns[loan_data_sample.isnull().mean() > threshold]
loan_data_sample = loan_data_sample.drop(columns=columns_to_drop)

print(f"Dropped columns: {list(columns_to_drop)}")

Dropped columns: ['mths_since_last_delinq', 'mths_since_last_record', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'revol_bal_joint', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 'deferral_term', 'hardship_amount', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount', 'hardship_last_payment_amount']


In [15]:
from sklearn.impute import SimpleImputer

# Identify numerical and categorical features
num_features = loan_data_sample.select_dtypes(include=['float64', 'int64']).columns
cat_features = loan_data_sample.select_dtypes(include=['object']).columns

# Apply median imputation for numerical features
num_imputer = SimpleImputer(strategy='median')
loan_data_sample[num_features] = num_imputer.fit_transform(loan_data_sample[num_features])

# Apply mode imputation for categorical features
cat_imputer = SimpleImputer(strategy='most_frequent')
loan_data_sample[cat_features] = cat_imputer.fit_transform(loan_data_sample[cat_features])

print("✅ Successfully handled missing values across all features!")

✅ Successfully handled missing values across all features!


In [16]:
loan_data_sample.isnull().sum().sum()

np.int64(0)

In [17]:
# Compute correlation matrix
corr_matrix = loan_data_sample.select_dtypes(include=['number']).corr().abs()

# Identify highly correlated features (threshold > 0.85)
high_corr_features = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i, j] > 0.85:
            high_corr_features.add(corr_matrix.columns[i])

# Drop highly correlated features
loan_data_sample = loan_data_sample.drop(columns=high_corr_features)

print(f"✅ Removed highly correlated features: {high_corr_features}")

✅ Removed highly correlated features: {'total_il_high_credit_limit', 'num_rev_tl_bal_gt_0', 'num_sats', 'total_rec_prncp', 'funded_amnt', 'tot_hi_cred_lim', 'funded_amnt_inv', 'installment'}


In [18]:
# Define binary loan status mapping
loan_status_mapping = {
    'Fully Paid': 1,
    'Current': 1,
    'Charged Off': 0,
    'Default': 0,
    'Late (31-120 days)': 0,
    'Late (16-30 days)': 0,
    'In Grace Period': 0,
    'Does not meet the credit policy. Status: Fully Paid': 1,
    'Does not meet the credit policy. Status: Charged Off': 0
}

# Apply mapping
loan_data_sample['loan_status_binary'] = loan_data_sample['loan_status'].map(loan_status_mapping)

# Drop original loan_status column
loan_data_sample = loan_data_sample.drop(columns=['loan_status'])

# Print success message
print("✅ Loan status successfully binarized!")

# Show counts of default (0) and non-default (1) values
print("\nDefault and Non-Default Value Counts:")
print(loan_data_sample['loan_status_binary'].value_counts())


✅ Loan status successfully binarized!

Default and Non-Default Value Counts:
loan_status_binary
1.0    588196
0.0     89172
Name: count, dtype: int64


In [19]:
def cap_outliers_and_fillna(df, columns):
    for col in columns:
        lower_cap = df[col].quantile(0.01)  # 1st percentile
        upper_cap = df[col].quantile(0.99)  # 99th percentile
        df[col] = df[col].clip(lower=lower_cap, upper=upper_cap)  # Capping values

    # Fill NaN values with median (recommended for numerical features)
    df[columns] = df[columns].fillna(df[columns].median())

    return df

# Apply outlier capping and handle NaNs
num_features = loan_data_sample.select_dtypes(include=['float64', 'int64']).columns
loan_data_sample = cap_outliers_and_fillna(loan_data_sample, num_features)

print("✅ Extreme values capped, and NaN values replaced with median!")

✅ Extreme values capped, and NaN values replaced with median!


In [20]:
# Separate features (X) and target (y)
X = loan_data_sample.drop(columns=['loan_status_binary'])
y = loan_data_sample['loan_status_binary']

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shape of the splits
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, {y_test.shape}")

Training Data Shape: (542560, 75), (542560,)
Testing Data Shape: (135640, 75), (135640,)


In [21]:
from sklearn.preprocessing import OrdinalEncoder

# Identify categorical columns
cat_features = X_train.select_dtypes(include=['object']).columns

# Initialize Ordinal Encoder (handles unseen categories)
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit encoder on training data and transform
X_train[cat_features] = ordinal_encoder.fit_transform(X_train[cat_features])

# Apply encoding to test set (avoiding KeyError on unseen values)
X_test[cat_features] = ordinal_encoder.transform(X_test[cat_features])

print("✅ Successfully encoded categorical features with OrdinalEncoder!")


✅ Successfully encoded categorical features with OrdinalEncoder!


In [22]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns
num_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform training data
X_train[num_features] = scaler.fit_transform(X_train[num_features])

# Transform test data (keeping consistency)
X_test[num_features] = scaler.transform(X_test[num_features])

print("✅ Successfully scaled numerical features!")


✅ Successfully scaled numerical features!


In [23]:
from sklearn.preprocessing import StandardScaler

# Identify numerical features
num_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler on training data
X_train[num_features] = scaler.fit_transform(X_train[num_features])

# Ensure X_test has the same columns as X_train
X_test = X_test[X_train.columns]

# Apply scaling
X_test[num_features] = scaler.transform(X_test[num_features])

print("✅ Successfully scaled numerical features!")

✅ Successfully scaled numerical features!


In [24]:
from imblearn.over_sampling import SMOTE

# Check class distribution before SMOTE
print("🔹 Class Distribution in y_train BEFORE SMOTE:")
print(y_train.value_counts())

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE only to the training set (avoid data leakage)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("\n✅ Class Distribution in y_train AFTER SMOTE:")
print(y_train_resampled.value_counts())

🔹 Class Distribution in y_train BEFORE SMOTE:
loan_status_binary
1.0    471060
0.0     71500
Name: count, dtype: int64

✅ Class Distribution in y_train AFTER SMOTE:
loan_status_binary
1.0    471060
0.0    471060
Name: count, dtype: int64


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

lr_baseline = LogisticRegression(random_state=42, max_iter=100)
lr_baseline.fit(X_train_resampled, y_train_resampled)
y_pred_lr = lr_baseline.predict(X_test)

print("🔹 LR Accuracy Score:", accuracy_score(y_test, y_pred_lr))
print("\n🔹 LR Classification Report:\n", classification_report(y_test, y_pred_lr))
print("\n🔹 LR Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

🔹 LR Accuracy Score: 0.7884399882040696

🔹 LR Classification Report:
               precision    recall  f1-score   support

         0.0       0.36      0.80      0.50     17672
         1.0       0.96      0.79      0.87    117968

    accuracy                           0.79    135640
   macro avg       0.66      0.79      0.68    135640
weighted avg       0.89      0.79      0.82    135640


🔹 LR Confusion Matrix:
 [[14188  3484]
 [25212 92756]]


In [33]:
from sklearn.ensemble import RandomForestClassifier

rf_baseline = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=5)  # Lower depth
rf_baseline.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf_baseline.predict(X_test)

print("🔹 RF Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print("\n🔹 RF Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\n🔹 RF Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

🔹 RF Accuracy Score: 0.8161530521969921

🔹 RF Classification Report:
               precision    recall  f1-score   support

         0.0       0.39      0.74      0.51     17672
         1.0       0.95      0.83      0.89    117968

    accuracy                           0.82    135640
   macro avg       0.67      0.78      0.70    135640
weighted avg       0.88      0.82      0.84    135640


🔹 RF Confusion Matrix:
 [[13071  4601]
 [20336 97632]]


In [34]:
from xgboost import XGBClassifier

xgb_baseline = XGBClassifier(random_state=42, n_estimators=50, max_depth=3, eval_metric='logloss')  # Lower depth
xgb_baseline.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_baseline.predict(X_test)

print("🔹 XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb))
print("\n🔹 XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("\n🔹 XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

🔹 XGBoost Accuracy Score: 0.9773739309938071

🔹 XGBoost Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.91      0.91     17672
         1.0       0.99      0.99      0.99    117968

    accuracy                           0.98    135640
   macro avg       0.95      0.95      0.95    135640
weighted avg       0.98      0.98      0.98    135640


🔹 XGBoost Confusion Matrix:
 [[ 16090   1582]
 [  1487 116481]]


In [None]:
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Expanded range for regularization strength
    'solver': ['liblinear', 'lbfgs', 'newton-cg'],  # Additional solver option
    'max_iter': [100, 300, 500]  # Slightly increased iterations for better convergence
}

lr_model = LogisticRegression(random_state=42)

random_search_lr = RandomizedSearchCV(lr_model, param_grid_lr, cv=3, n_iter=10, n_jobs=-1, random_state=42)
random_search_lr.fit(X_train_resampled, y_train_resampled)

print("✅ Best Logistic Regression Hyperparameters:", random_search_lr.best_params_)

In [None]:
param_grid_rf = {
    'n_estimators': [50, 100, 300],  # Increase upper range for better exploration
    'max_depth': [3, 5, 10],  # Include deeper trees for better learning
    'min_samples_split': [2, 5, 10],  # Test more thresholds for splitting
    'min_samples_leaf': [1, 2, 4]  # Add more variation for leaf samples
}

rf_model = RandomForestClassifier(random_state=42)

random_search_rf = RandomizedSearchCV(rf_model, param_grid_rf, cv=3, n_iter=10, n_jobs=-1, random_state=42)
random_search_rf.fit(X_train_resampled, y_train_resampled)

print("✅ Best Random Forest Hyperparameters:", random_search_rf.best_params_)

In [None]:
param_grid_xgb = {
    'n_estimators': [50, 100, 200],  # Limited range to avoid excessive tuning
    'learning_rate': [0.01, 0.05, 0.1],  # Small adjustments to fine-tune step size
    'max_depth': [3, 5],  # Keep depth reasonable to prevent overfitting
    'subsample': [0.8, 1.0],  # Balanced data sampling
    'colsample_bytree': [0.8, 1.0]  # Optimized feature selection per tree
}

xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

random_search_xgb = RandomizedSearchCV(xgb_model, param_grid_xgb, cv=3, n_iter=5, n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train_resampled, y_train_resampled)

print("✅ Best XGBoost Hyperparameters:", random_search_xgb.best_params_)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression Evaluation
y_pred_lr_best = random_search_lr.best_estimator_.predict(X_test)
print("\n🔹 LR Accuracy Score:", accuracy_score(y_test, y_pred_lr_best))
print("\n🔹 Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr_best))
print("\n🔹 Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_best))

# Random Forest Evaluation
y_pred_rf_best = random_search_rf.best_estimator_.predict(X_test)
print("\n🔹 RF Accuracy Score:", accuracy_score(y_test, y_pred_rf_best))
print("\n🔹 Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf_best))
print("\n🔹 Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_best))

# XGBoost Evaluation
y_pred_xgb_best = random_search_xgb.best_estimator_.predict(X_test)
print("\n🔹 XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb_best))
print("\n🔹 XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb_best))
print("\n🔹 XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_best))

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize Logistic Regression model
lr_model = LogisticRegression(random_state=42, max_iter=500)

# Train the model
lr_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Evaluate performance
print("🔹 Accuracy Score:", accuracy_score(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))

🔹 Accuracy Score: 0.7884399882040696

🔹 Classification Report:
               precision    recall  f1-score   support

         0.0       0.36      0.80      0.50     17672
         1.0       0.96      0.79      0.87    117968

    accuracy                           0.79    135640
   macro avg       0.66      0.79      0.68    135640
weighted avg       0.89      0.79      0.82    135640



In [27]:
# Add Confusion Matrix
print("\n🔹 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


🔹 Confusion Matrix:
[[14188  3484]
 [25212 92756]]


In [29]:
from sklearn.ensemble import RandomForestClassifier

# Initialize baseline Random Forest model
rf_baseline = RandomForestClassifier(random_state=42)

# Train baseline model
rf_baseline.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_rf = rf_baseline.predict(X_test)

# Evaluate performance
print("🔹 Random Forest Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print("\n🔹 Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\n🔹 Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

🔹 Random Forest Accuracy Score: 0.966020347979947

🔹 Random Forest Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.82      0.86     17672
         1.0       0.97      0.99      0.98    117968

    accuracy                           0.97    135640
   macro avg       0.94      0.90      0.92    135640
weighted avg       0.97      0.97      0.97    135640


🔹 Random Forest Confusion Matrix:
 [[ 14534   3138]
 [  1471 116497]]


In [30]:
from xgboost import XGBClassifier

# Initialize baseline XGBoost model
xgb_baseline = XGBClassifier(random_state=42, eval_metric='logloss')

# Train baseline model
xgb_baseline.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_xgb = xgb_baseline.predict(X_test)

# Evaluate performance
print("🔹 XGBoost Accuracy Score:", accuracy_score(y_test, y_pred_xgb))
print("\n🔹 XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("\n🔹 XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

🔹 XGBoost Accuracy Score: 0.9857269242111472

🔹 XGBoost Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.92      0.94     17672
         1.0       0.99      1.00      0.99    117968

    accuracy                           0.99    135640
   macro avg       0.98      0.96      0.97    135640
weighted avg       0.99      0.99      0.99    135640


🔹 XGBoost Confusion Matrix:
 [[ 16284   1388]
 [   548 117420]]


In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define optimized hyperparameter grids (fewer values to speed up tuning)
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],  # Fewer values for faster search
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 300]
}

param_grid_rf = {
    'n_estimators': [100, 300],  # Reduced number of trees
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grid_xgb = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6],
    'subsample': [0.8, 1.0]
}

# Initialize models
lr_model = LogisticRegression(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

# Perform Randomized Search for each model
random_search_lr = RandomizedSearchCV(lr_model, param_grid_lr, cv=3, scoring='accuracy', n_iter=5, n_jobs=-1, random_state=42)
random_search_rf = RandomizedSearchCV(rf_model, param_grid_rf, cv=3, scoring='accuracy', n_iter=5, n_jobs=-1, random_state=42)
random_search_xgb = RandomizedSearchCV(xgb_model, param_grid_xgb, cv=3, scoring='accuracy', n_iter=5, n_jobs=-1, random_state=42)

# Fit models
random_search_lr.fit(X_train_resampled, y_train_resampled)
random_search_rf.fit(X_train_resampled, y_train_resampled)
random_search_xgb.fit(X_train_resampled, y_train_resampled)

# Print best hyperparameters
print("✅ Best Logistic Regression Hyperparameters:", random_search_lr.best_params_)
print("✅ Best Random Forest Hyperparameters:", random_search_rf.best_params_)
print("✅ Best XGBoost Hyperparameters:", random_search_xgb.best_params_)

# Train final models
best_lr_model = random_search_lr.best_estimator_
best_rf_model = random_search_rf.best_estimator_
best_xgb_model = random_search_xgb.best_estimator_

# Make predictions
y_pred_lr_best = best_lr_model.predict(X_test)
y_pred_rf_best = best_rf_model.predict(X_test)
y_pred_xgb_best = best_xgb_model.predict(X_test)

# Evaluate performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("\n🔹 LR Accuracy:", accuracy_score(y_test, y_pred_lr_best))
print("🔹 RF Accuracy:", accuracy_score(y_test, y_pred_rf_best))
print("🔹 XGB Accuracy:", accuracy_score(y_test, y_pred_xgb_best))

print("\n🔹 Logistic Regression Report:\n", classification_report(y_test, y_pred_lr_best))
print("\n🔹 Random Forest Report:\n", classification_report(y_test, y_pred_rf_best))
print("\n🔹 XGBoost Report:\n", classification_report(y_test, y_pred_xgb_best))

print("\n🔹 Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr_best))
print("\n🔹 Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_best))
print("\n🔹 XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_best))


KeyboardInterrupt: 