<a href="https://colab.research.google.com/github/jcvdm01/machine-and-deep-learning/blob/main/Group%20Project/Final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint,uniform
from xgboost import XGBClassifier

# Load the raw loan data and drop redundant variables
df = pd.read_csv('lending_club_loan_two.csv')
df = df.drop(columns=['grade', 'emp_title', 'title', 'installment', 'issue_d',
                      'earliest_cr_line', 'emp_length'])

# One-hot encode the categorical variables (drop_first=True keeps k-1 dummy
# columns for a variable with k levels)
dummies = ['sub_grade', 'verification_status', 'purpose',
           'initial_list_status', 'application_type', 'home_ownership']
df = pd.get_dummies(df, columns=dummies, drop_first=True)

# The last five characters of the address are taken as the zip code; the zip
# code is one-hot encoded and the free-text address column is then removed.
# The vectorized .str accessor replaces a per-row Python lambda.
df['zip_code'] = df['address'].str[-5:]
df = pd.get_dummies(df, columns=['zip_code'], drop_first=True)
df = df.drop(columns='address')

# Convert term from string to an integer number of months
# (the keys deliberately start with a space; values not listed map to NaN)
term_values = {' 36 months': 36, ' 60 months': 60}
df['term'] = df['term'].map(term_values)

# Binary dependent variable: 0 = 'Fully Paid', 1 = any other loan status
df['loan_status_binary'] = (df['loan_status'] != 'Fully Paid').astype('int64')
df = df.drop(columns='loan_status')


# Fill missing mort_acc values with the rounded mean mort_acc of the loans
# that share the same total_acc.
# Only the mort_acc column is averaged; the other columns are not needed here.
total_acc_avg = df.groupby('total_acc')['mort_acc'].mean()


def fill_mort_acc(total_acc, mort_acc):
    """Return mort_acc, or the rounded group mean for total_acc when it is NaN.

    Scalar helper kept for single-value use; the column itself is filled with
    the vectorized expression below.

    Parameters
    ----------
    total_acc : number
        Key looked up in the module-level ``total_acc_avg`` series.
    mort_acc : number
        Value to keep when it is not NaN.

    Raises
    ------
    KeyError
        If mort_acc is NaN and total_acc is not a key of ``total_acc_avg``.
    """
    if np.isnan(mort_acc):
        return total_acc_avg[total_acc].round()
    return mort_acc


# Vectorized equivalent of applying fill_mort_acc row by row: map every row's
# total_acc to its group mean, round it, and use it only where mort_acc is NaN.
# Rows whose group mean is itself NaN stay NaN.
df['mort_acc'] = df['mort_acc'].fillna(df['total_acc'].map(total_acc_avg).round())

# Discard every row that still contains a missing value
df = df.dropna()

# Separate the features from the target; the float cast turns the boolean
# dummy columns into 1.0 / 0.0
X = df.drop(columns='loan_status_binary').astype(float)
y = df['loan_status_binary']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scale the features: the scaler is fitted on the training split only
# and the same fitted scaler is applied to both splits
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE oversampling of the scaled training split (the test split is untouched)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# PCA with n_components=0.95, fitted on the scaled training split and applied
# to the test split
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# A second PCA fitted on the SMOTE-resampled training data; the test split is
# projected with this same fit, and the resampled labels are reused as-is
pca_smote = PCA(n_components=0.95)
X_train_smote_pca = pca_smote.fit_transform(X_train_smote)
X_test_smote_pca = pca_smote.transform(X_test)
y_train_smote_pca = y_train_smote


In [None]:
# Function to build, tune, and evaluate a Random Forest on a given dataset
def CreateRandomForest(Xtrain, Xtest, ytrain, ytest):
    """Tune a Random Forest with randomized search and print train/test reports.

    Parameters
    ----------
    Xtrain, ytrain : array-like
        Training features and binary labels; the hyperparameter search is
        fitted on these.
    Xtest, ytest : array-like
        Held-out features and labels, used only for the final report.

    Returns
    -------
    None
        The best parameters and both classification reports are printed.

    Notes
    -----
    Labels are expected to be 0 = non-default ('Fully Paid') and 1 = default,
    matching how ``loan_status_binary`` is built in this notebook.
    NOTE(review): when the training data was oversampled before being passed
    in, the cross-validation folds are drawn from the oversampled data, so the
    CV score may be optimistic - confirm whether that is acceptable.
    """
    # Hyperparameter search space (scipy randint: upper bound is exclusive)
    param_grid = {
        'n_estimators': stats.randint(100, 1500),       # Number of trees
        'max_depth': stats.randint(10, 100),            # Maximum depth of each tree
        'min_samples_split': stats.randint(2, 10),      # Minimum samples to split a node
        'min_samples_leaf': stats.randint(1, 10)        # Minimum samples at a leaf node
    }

    # class_weight='balanced' compensates for class imbalance (default vs non-default)
    base_rf = RFC(class_weight='balanced', random_state=42)

    # Randomized search over the space above
    rf_search = RandomizedSearchCV(
        estimator=base_rf,
        param_distributions=param_grid,
        n_iter=60,                   # Number of hyperparameter combinations to try
        cv=3,                        # 3-fold cross-validation
        scoring='roc_auc',           # Optimize AUC (suitable for imbalanced binary classification)
        n_jobs=2,                    # Use 2 CPU cores (limit to avoid memory overload)
        verbose=1,
        random_state=42
    )

    # Fit the search on the training data and keep the best refitted model
    rf_search.fit(Xtrain, ytrain)
    best_rf = rf_search.best_estimator_

    # Predictions on both training and testing data
    y_train_pred = best_rf.predict(Xtrain)
    y_test_pred = best_rf.predict(Xtest)

    print("\nBest Parameters:", rf_search.best_params_)

    # Display names must follow the label order [0, 1]: label 0 is the
    # non-default class and label 1 is the default class. The previous order
    # ("Default", "Non-default") labelled every report row with the wrong class.
    class_labels = [0, 1]
    class_names = ["Non-default", "Default"]

    print("Classification Report (Train Set):")
    print(classification_report(ytrain, y_train_pred,
                                labels=class_labels, target_names=class_names))

    print("Classification Report (Test Set):")
    print(classification_report(ytest, y_test_pred,
                                labels=class_labels, target_names=class_names))


# Tune and evaluate a Random Forest on each dataset variant, in this order;
# every variant is scored against the same y_test labels:
# 1. Original dataset
# 2. PCA-transformed version of the original dataset
# 3. SMOTE-balanced dataset
# 4. SMOTE-balanced + PCA-transformed
_rf_variants = [
    (X_train, X_test, y_train),
    (X_train_pca, X_test_pca, y_train),
    (X_train_smote, X_test, y_train_smote),
    (X_train_smote_pca, X_test_smote_pca, y_train_smote_pca),
]
for _rf_Xtr, _rf_Xte, _rf_ytr in _rf_variants:
    CreateRandomForest(_rf_Xtr, _rf_Xte, _rf_ytr, y_test)


Fitting 3 folds for each of 60 candidates, totalling 180 fits


KeyboardInterrupt: 

In [None]:
# XGBoost with scale_pos_weight (default 4) for imbalanced (original) data
def CreateXGBoostUnbalanced(Xtrain, Xtest, ytrain, ytest, scale_pos_weight=4):
    """Tune a class-weighted XGBoost classifier and print train/test reports.

    Parameters
    ----------
    Xtrain, ytrain : array-like
        Training features and binary labels; the randomized search is fitted
        on these.
    Xtest, ytest : array-like
        Held-out features and labels, used only for the final report.
    scale_pos_weight : float, default 4
        Weight passed to XGBClassifier for the positive class. The default
        keeps the value this notebook has always used; in the logged run the
        training split had 254073 negatives and 62102 positives (about 4.1:1).

    Returns
    -------
    None
        Both classification reports are printed.
    """
    # Hyperparameter search space.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_dist = {
        'n_estimators': randint(200, 800),
        'max_depth': randint(3, 15),
        'learning_rate': uniform(0, 1),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': uniform(0, 0.5)
    }

    # Base model with class imbalance compensation.
    # use_label_encoder is no longer passed: the logged run reported it as an
    # unused parameter. random_state makes the fitted model reproducible.
    xgb_clf = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        scale_pos_weight=scale_pos_weight,
        random_state=42
    )

    # Randomized search over hyperparameters with 3-fold CV; seeded so the
    # sampled candidates are the same on every run (as in CreateRandomForest)
    xgb_cv = RandomizedSearchCV(
        xgb_clf,
        param_distributions=param_dist,
        n_iter=60,
        cv=3,
        scoring='roc_auc',
        verbose=1,
        n_jobs=2,  # Limit CPU usage to avoid Windows OSError
        random_state=42
    )

    # Run the search, then keep the model refitted with the best parameters
    xgb_cv.fit(Xtrain, ytrain)
    best_xgb = xgb_cv.best_estimator_

    # Make predictions
    y_train_pred_xgb = best_xgb.predict(Xtrain)
    y_test_pred_xgb = best_xgb.predict(Xtest)

    # Output evaluation metrics
    print("Classification Report (Train Set):")
    print(classification_report(ytrain, y_train_pred_xgb))

    print("Classification Report (Test Set):")
    print(classification_report(ytest, y_test_pred_xgb))


# XGBoost without scale_pos_weight, for use on balanced data (e.g., SMOTE)
def Create_XGBoost_Balanced(Xtrain, Xtest, ytrain, ytest):
    """Tune an unweighted XGBoost classifier and print train/test reports.

    Intended for training data whose classes are already balanced, so no
    class weighting is applied.

    Parameters
    ----------
    Xtrain, ytrain : array-like
        Training features and binary labels; the randomized search is fitted
        on these.
    Xtest, ytest : array-like
        Held-out features and labels, used only for the final report.

    Returns
    -------
    None
        Both classification reports are printed.
    """
    # Hyperparameter search space.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_dist = {
        'n_estimators': randint(200, 800),
        'max_depth': randint(3, 15),
        'learning_rate': uniform(0, 1),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': uniform(0, 0.5)
    }

    # Classifier without class weighting.
    # use_label_encoder is no longer passed: the logged run reported it as an
    # unused parameter. random_state makes the fitted model reproducible.
    xgb_clf = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42
    )

    # Randomized hyperparameter search with 3-fold CV; seeded so the sampled
    # candidates are the same on every run (as in CreateRandomForest)
    xgb_cv = RandomizedSearchCV(
        xgb_clf,
        param_distributions=param_dist,
        n_iter=60,
        cv=3,
        scoring='roc_auc',
        verbose=1,
        n_jobs=2,
        random_state=42
    )

    # Run the search, then keep the model refitted with the best parameters
    xgb_cv.fit(Xtrain, ytrain)
    best_xgb = xgb_cv.best_estimator_

    # Generate predictions
    y_train_pred_xgb = best_xgb.predict(Xtrain)
    y_test_pred_xgb = best_xgb.predict(Xtest)

    # Output evaluation metrics
    print("Classification Report (Train Set):")
    print(classification_report(ytrain, y_train_pred_xgb))

    print("Classification Report (Test Set):")
    print(classification_report(ytest, y_test_pred_xgb))


# Run the matching XGBoost variant on each version of the dataset, in order:
# weighted model on the original and PCA data, unweighted model on the
# SMOTE-balanced and SMOTE + PCA data. All runs are scored against y_test.
_xgb_runs = (
    (CreateXGBoostUnbalanced, X_train, X_test, y_train),                            # Original data
    (CreateXGBoostUnbalanced, X_train_pca, X_test_pca, y_train),                    # PCA-transformed original
    (Create_XGBoost_Balanced, X_train_smote, X_test, y_train_smote),                # SMOTE-balanced
    (Create_XGBoost_Balanced, X_train_smote_pca, X_test_smote_pca, y_train_smote_pca),  # SMOTE + PCA
)
for _xgb_fn, _xgb_Xtr, _xgb_Xte, _xgb_ytr in _xgb_runs:
    _xgb_fn(_xgb_Xtr, _xgb_Xte, _xgb_ytr, y_test)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report (Train Set):
              precision    recall  f1-score   support

           0       0.95      0.82      0.88    254073
           1       0.53      0.82      0.64     62102

    accuracy                           0.82    316175
   macro avg       0.74      0.82      0.76    316175
weighted avg       0.87      0.82      0.83    316175

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.94      0.81      0.87     63623
           1       0.51      0.80      0.62     15421

    accuracy                           0.81     79044
   macro avg       0.73      0.81      0.75     79044
weighted avg       0.86      0.81      0.82     79044

Fitting 3 folds for each of 60 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report (Train Set):
              precision    recall  f1-score   support

           0       0.95      0.80      0.87    254073
           1       0.50      0.82      0.62     62102

    accuracy                           0.80    316175
   macro avg       0.72      0.81      0.74    316175
weighted avg       0.86      0.80      0.82    316175

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.94      0.79      0.86     63623
           1       0.48      0.80      0.60     15421

    accuracy                           0.79     79044
   macro avg       0.71      0.80      0.73     79044
weighted avg       0.85      0.79      0.81     79044

Fitting 3 folds for each of 60 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report (Train Set):
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    254073
           1       1.00      0.96      0.98    254073

    accuracy                           0.98    508146
   macro avg       0.98      0.98      0.98    508146
weighted avg       0.98      0.98      0.98    508146

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     63623
           1       0.85      0.51      0.64     15421

    accuracy                           0.89     79044
   macro avg       0.87      0.74      0.79     79044
weighted avg       0.88      0.89      0.88     79044

Fitting 3 folds for each of 60 candidates, totalling 180 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classification Report (Train Set):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    254073
           1       0.99      1.00      1.00    254073

    accuracy                           1.00    508146
   macro avg       1.00      1.00      1.00    508146
weighted avg       1.00      1.00      1.00    508146

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.91      0.87      0.89     63623
           1       0.54      0.65      0.59     15421

    accuracy                           0.83     79044
   macro avg       0.73      0.76      0.74     79044
weighted avg       0.84      0.83      0.83     79044



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import Precision

def CreateSequential(Xtrain, Xtest, ytrain, ytest):
    """Train a small feed-forward binary classifier and print its test report.

    The network has three ReLU hidden layers (78, 39 and 19 units), each
    followed by 20% dropout, and a single sigmoid output unit. It is trained
    for 25 epochs with batch size 256 using Adam and binary cross-entropy.

    Parameters
    ----------
    Xtrain, ytrain : array-like
        Training features and binary labels.
    Xtest, ytest : array-like
        Held-out features and labels. They are passed to ``fit`` as
        ``validation_data`` and are also used for the final report.

    Returns
    -------
    None
        The classification report for the test set is printed.
    """
    net = Sequential()

    # Hidden stack: Dense(ReLU) + Dropout for each width, in order
    for width in (78, 39, 19):
        net.add(Dense(width, activation='relu'))
        net.add(Dropout(0.2))

    # Single sigmoid unit for the binary output
    net.add(Dense(units=1, activation='sigmoid'))

    net.compile(
        loss="binary_crossentropy",
        optimizer='adam',
        metrics=['accuracy', Precision()],
    )
    net.fit(
        x=Xtrain,
        y=ytrain,
        epochs=25,
        batch_size=256,
        validation_data=(Xtest, ytest),
    )

    # Threshold the predicted probabilities at 0.5 to get class labels
    probabilities = net.predict(Xtest)
    predictions = (probabilities > 0.5).astype('int64')
    print(classification_report(ytest, predictions))

# Train and evaluate the network on each dataset variant, in this order:
# original, PCA, SMOTE, SMOTE + PCA. Every run is scored against y_test.
_nn_variants = (
    (X_train, X_test, y_train),
    (X_train_pca, X_test_pca, y_train),
    (X_train_smote, X_test, y_train_smote),
    (X_train_smote_pca, X_test_smote_pca, y_train_smote_pca),
)
for _nn_Xtr, _nn_Xte, _nn_ytr in _nn_variants:
    CreateSequential(_nn_Xtr, _nn_Xte, _nn_ytr, y_test)


Epoch 1/25
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8539 - loss: 0.3560 - precision_8: 0.8220 - val_accuracy: 0.8889 - val_loss: 0.2638 - val_precision_8: 0.9930
Epoch 2/25
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8872 - loss: 0.2668 - precision_8: 0.9872 - val_accuracy: 0.8887 - val_loss: 0.2611 - val_precision_8: 0.9945
Epoch 3/25
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8876 - loss: 0.2642 - precision_8: 0.9810 - val_accuracy: 0.8889 - val_loss: 0.2604 - val_precision_8: 0.9986
Epoch 4/25
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8877 - loss: 0.2621 - precision_8: 0.9804 - val_accuracy: 0.8888 - val_loss: 0.2610 - val_precision_8: 0.9970
Epoch 5/25
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8869 - loss: 0.2633 - precision_8: 0.9742 - val_ac

In [None]:
# Quick look at the train and test target series, one after the other
print(y_train, y_test, sep="\n")


117283    0
151256    0
242452    0
273157    0
213735    0
         ..
259721    0
366594    1
132193    1
147164    0
122200    0
Name: loan_status_binary, Length: 316175, dtype: int64
251650    1
161400    0
365051    0
27512     0
263208    0
         ..
32032     0
289601    0
172764    0
102050    1
73147     0
Name: loan_status_binary, Length: 79044, dtype: int64


In [None]:
# Sanity check: (rows, columns) of the full feature matrix X before splitting
print(X.shape)


(395219, 78)
