In [None]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate  # Pretty-print tabular data

# Set visualization style
#sns.set()  # Set Seaborn default style
#plt.style.use('ggplot')  # Set ggplot style for matplotlib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [None]:
df = pd.read_csv("df_cleaned.csv")
df.head()

Unnamed: 0,age,income_level,fico_score,delinquency_status,charge_off_status,number_of_credit_applications,debt_to_income_ratio,payment_methods_high_risk,max_balance,avg_balance_last_12months,...,multiple_applications_short_time_period,unusual_submission_pattern,applications_submitted_during_odd_hours,watchlist_blacklist_flag,public_records_flag,location_encoded,occupation_encoded,days_since_recent_trade,time_between_account_open_and_trade,credit_history_length
0,56,40099,424.0,108,1,10,3.139572,0,18209.383136,4324.930673,...,1,1,0,0,1,0.254848,0.256207,268,119,2004
1,69,2050,483.0,0,0,1,1.017489,0,33673.696504,21726.593916,...,0,1,0,0,0,0.285933,0.26601,307,504,326
2,46,71936,566.0,0,0,1,1.508626,0,101918.174202,63930.102255,...,0,0,0,0,0,0.264368,0.26601,58,1544,160
3,32,15833,491.513037,97,1,5,2.99515,0,9055.101659,3033.403742,...,0,1,1,0,0,0.241167,0.266965,47,62,396
4,60,8574,787.0,0,0,1,1.18238,0,91682.85204,77457.387016,...,1,0,1,0,0,0.241167,0.221757,89,37,664


In [None]:
X=df.drop(columns=["charge_off_status"])
y=df["charge_off_status"]

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [None]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

## Upsampled

In [None]:
#separate majority and minority classes
majority_class = X_train[y_train == 0]
minority_class = X_train[y_train == 1]

print("Size of majority class before upsampling:", majority_class.shape[0])
print("Size of minority class before upsampling:", minority_class.shape[0])

Size of majority class before upsampling: 4133
Size of minority class before upsampling: 1467


In [None]:
#upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,
                              n_samples=len(majority_class),
                              random_state=28)

X_upsampled = np.vstack([majority_class, minority_upsampled])
y_upsampled = np.concatenate([np.zeros(len(majority_class)), np.ones(len(majority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_upsampled))
np.random.shuffle(shuffle_indices)
X_upsampled = X_upsampled[shuffle_indices]
y_upsampled = y_upsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_upsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")

Class counts after upsampling:
Class 0: 4133
Class 1: 4133


### Logistic Regression

In [None]:
lr = LogisticRegression(random_state=32)
result = lr.fit(X_upsampled,y_upsampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  85.43 %
Misclassification rate of this model:  14.57 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.891     0.915     0.903      1040
           1      0.735     0.678     0.705       360

    accuracy                          0.854      1400
   macro avg      0.813     0.797     0.804      1400
weighted avg      0.851     0.854     0.852      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  952 |                   88 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  116 |                  244 |
+-----------------+----------------------+----------------------+


### Decision Trees

In [None]:
dt = DecisionTreeClassifier(random_state=32)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.8946284319814254


In [None]:
dtree = DecisionTreeClassifier(max_depth = None,  min_samples_split = 2, min_samples_leaf = 1, max_features = 'log2', random_state = 32)
dtree.fit(X_upsampled,y_upsampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  76.57 %
Misclassification rate of this model:  23.43 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.836     0.851     0.844      1040
           1      0.547     0.519     0.533       360

    accuracy                          0.766      1400
   macro avg      0.692     0.685     0.688      1400
weighted avg      0.762     0.766     0.764      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  885 |                  155 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  173 |                  187 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(32)

rf = RandomForestClassifier(random_state=32)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 80}
0.9510040372164201


In [None]:
rf = RandomForestClassifier(n_estimators=80, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=3, random_state=32)
rf.fit(X_upsampled,y_upsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  86.29 %
Misclassification rate of this model:  13.71 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.884     0.938     0.910      1040
           1      0.784     0.644     0.707       360

    accuracy                          0.863      1400
   macro avg      0.834     0.791     0.809      1400
weighted avg      0.858     0.863     0.858      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  976 |                   64 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  128 |                  232 |
+-----------------+----------------------+----------------------+


### Ada boosting

In [None]:
np.random.seed(32)

ada_classifier = AdaBoostClassifier(random_state=32)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150}
Best Score: 0.8063147068354704


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=120, learning_rate=0.5, algorithm='SAMME.R', random_state=28)

ada_classifier.fit(X_upsampled, y_upsampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  86.29 %
Misclassification rate of this model:  13.71 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.888     0.933     0.910      1040
           1      0.773     0.661     0.713       360

    accuracy                          0.863      1400
   macro avg      0.831     0.797     0.811      1400
weighted avg      0.859     0.863     0.859      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  970 |                   70 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  122 |                  238 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(32)

gb_classifier = GradientBoostingClassifier(random_state=32)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=150, learning_rate=0.2, max_depth = 5, random_state=32)
gb_classifier.fit(X_upsampled, y_upsampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  79.08 %
Misclassification rate of this model:  20.92 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.837     0.932     0.882       236
           1      0.158     0.065     0.092        46

    accuracy                          0.791       282
   macro avg      0.497     0.499     0.487       282
weighted avg      0.726     0.791     0.753       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  220 |                   16 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


### Multilayer Perceptron

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_upsampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=32)
mlp.fit(X_train_scaled, y_upsampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### k Nearest Neighbours

In [None]:
np.random.seed(32)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_upsampled, y_upsampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  78.07 %
Misclassification rate of this model:  21.93 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.876     0.821     0.848      1040
           1      0.562     0.664     0.609       360

    accuracy                          0.781      1400
   macro avg      0.719     0.743     0.728      1400
weighted avg      0.795     0.781     0.786      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  854 |                  186 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  121 |                  239 |
+-----------------+----------------------+----------------------+


### Support Vector Machines

In [None]:
np.random.seed(32)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_upsampled, y_upsampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.1)
svm_classifier.fit(X_upsampled, y_upsampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### Naive Bias

In [None]:
np.random.seed(32)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_upsampled, y_upsampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_upsampled, y_upsampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

### XGBoost

In [None]:
np.random.seed(32)

xgb_classifier = XGBClassifier(random_state=28)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_upsampled, y_upsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

In [None]:
xgb_classifier = XGBClassifier(n_estimators=150, learning_rate=0.2, max_depth = None, random_state=32)
xgb_classifier.fit(X_upsampled, y_upsampled)
y_pred = xgb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  78.37 %
Misclassification rate of this model:  21.63 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.835     0.924     0.877       236
           1      0.143     0.065     0.090        46

    accuracy                          0.784       282
   macro avg      0.489     0.494     0.483       282
weighted avg      0.722     0.784     0.749       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  218 |                   18 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   43 |                    3 |
+-----------------+----------------------+----------------------+


## Downsampled

In [None]:
#downsample minority class
majority_downsampled = resample(majority_class,
                              replace=True,
                              n_samples=len(minority_class),
                              random_state=28)

X_downsampled = np.vstack([minority_class, majority_downsampled])
y_downsampled = np.concatenate([np.zeros(len(minority_class)), np.ones(len(minority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_downsampled))
np.random.shuffle(shuffle_indices)
X_downsampled = X_downsampled[shuffle_indices]
y_downsampled = y_downsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_downsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")

Class counts after upsampling:
Class 0: 1467
Class 1: 1467


### Logistic regreesion

In [None]:
lr = LogisticRegression(random_state=32)
result = lr.fit(X_downsampled,y_downsampled)
y_pred = lr.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  14.57 %
Misclassification rate of this model:  85.43 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.264     0.084     0.127      1040
           1      0.109     0.325     0.164       360

    accuracy                          0.146      1400
   macro avg      0.186     0.204     0.145      1400
weighted avg      0.224     0.146     0.136      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   87 |                  953 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  243 |                  117 |
+-----------------+----------------------+----------------------+


### Decision tree

In [None]:
dt = DecisionTreeClassifier(random_state=32)
param_grid = {
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2', None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_downsampled, y_downsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.801295416620637


In [None]:
dtree = DecisionTreeClassifier(max_depth = 3,  min_samples_split = 2, min_samples_leaf = 1, max_features = None, random_state = 32)
dtree.fit(X_downsampled,y_downsampled)
y_pred = dtree.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  14.43 %
Misclassification rate of this model:  85.57 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.245     0.073     0.113      1040
           1      0.116     0.350     0.174       360

    accuracy                          0.144      1400
   macro avg      0.180     0.212     0.143      1400
weighted avg      0.212     0.144     0.128      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   76 |                  964 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  234 |                  126 |
+-----------------+----------------------+----------------------+


### Random Forest

In [None]:
np.random.seed(32)

rf = RandomForestClassifier(random_state=32)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_downsampled, y_downsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 80}
0.811518625974615


In [None]:
rf = RandomForestClassifier(n_estimators=80, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=3, random_state=32)
rf.fit(X_downsampled,y_downsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  16.07 %
Misclassification rate of this model:  83.93 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.303     0.100     0.150      1040
           1      0.114     0.336     0.171       360

    accuracy                          0.161      1400
   macro avg      0.209     0.218     0.161      1400
weighted avg      0.255     0.161     0.156      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  104 |                  936 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  239 |                  121 |
+-----------------+----------------------+----------------------+


### AdaBoosting

In [None]:
np.random.seed(32)

ada_classifier = AdaBoostClassifier(random_state=32)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(estimator=ada_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 1.0, 'n_estimators': 50}
Best Score: 0.8023164002767587


In [None]:
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME', random_state=32)

ada_classifier.fit(X_downsampled, y_downsampled)

y_pred_ada = ada_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred_ada) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_ada) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_ada, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_ada)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  13.86 %
Misclassification rate of this model:  86.14 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.223     0.064     0.100      1040
           1      0.115     0.353     0.174       360

    accuracy                          0.139      1400
   macro avg      0.169     0.209     0.137      1400
weighted avg      0.196     0.139     0.119      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   67 |                  973 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  233 |                  127 |
+-----------------+----------------------+----------------------+


### Gradient Boosting

In [None]:
np.random.seed(32)

gb_classifier = GradientBoostingClassifier(random_state=32)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 120}
Best Score: 0.8060689222110458


In [None]:
gb_classifier = GradientBoostingClassifier(n_estimators=120, learning_rate=0.1, max_depth = 3, random_state=32)
gb_classifier.fit(X_downsampled, y_downsampled)
y_pred = gb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  16.29 %
Misclassification rate of this model:  83.71 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.307     0.101     0.152      1040
           1      0.116     0.342     0.173       360

    accuracy                          0.163      1400
   macro avg      0.212     0.221     0.163      1400
weighted avg      0.258     0.163     0.157      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  105 |                  935 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  237 |                  123 |
+-----------------+----------------------+----------------------+


### MLP

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_downsampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',alpha=0.0001, max_iter=1000, random_state=32)
mlp.fit(X_train_scaled, y_downsampled)
y_pred = mlp.predict(X_test_scaled)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  29.29 %
Misclassification rate of this model:  70.71 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.545     0.291     0.380      1040
           1      0.127     0.297     0.178       360

    accuracy                          0.293      1400
   macro avg      0.336     0.294     0.279      1400
weighted avg      0.437     0.293     0.328      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  303 |                  737 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  253 |                  107 |
+-----------------+----------------------+----------------------+


### kNN

In [None]:
np.random.seed(32)

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier()

# Define parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_downsampled, y_downsampled)


# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
Best Score: 0.7866347657726276


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
knn_classifier = grid_search.best_estimator_
knn_classifier.fit(X_downsampled, y_downsampled)

y_pred_knn = knn_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_knn) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_knn) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_knn, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_knn)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  18.5 %
Misclassification rate of this model:  81.5 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.366     0.133     0.195      1040
           1      0.118     0.336     0.175       360

    accuracy                          0.185      1400
   macro avg      0.242     0.234     0.185      1400
weighted avg      0.302     0.185     0.190      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  138 |                  902 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  239 |                  121 |
+-----------------+----------------------+----------------------+


### SVM

In [None]:
np.random.seed(32)

g = [0.0001, 0.001, 0.01, 0.1]
hist = []
for val in g:
  clf = SVC(gamma=val)
  cross_val = cross_val_score(clf, X_downsampled, y_downsampled, cv=5)
  hist.append(np.mean(cross_val))
print(hist)

[0.626791518160834, 0.626791518160834, 0.626791518160834, 0.626791518160834]


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
svm_classifier = SVC(gamma=0.1)
svm_classifier.fit(X_downsampled, y_downsampled)

y_pred_svm = svm_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_svm) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_svm) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_svm, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_svm)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  74.29 %
Misclassification rate of this model:  25.71 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.743     1.000     0.852      1040
           1      0.000     0.000     0.000       360

    accuracy                          0.743      1400
   macro avg      0.371     0.500     0.426      1400
weighted avg      0.552     0.743     0.633      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                 1040 |                    0 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  360 |                    0 |
+-----------------+----------------------+----------------------+


### Naive Bayes

In [None]:
np.random.seed(32)

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Define parameter grid for Naive Bayes (even though GaussianNB has few hyperparameters)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_downsampled, y_downsampled)

# Display best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'var_smoothing': 9.999999999999999e-10}
Best Score: 0.7924368135541976


In [None]:
# Use the best estimator from GridSearchCV to predict on test data
nb_classifier = grid_search.best_estimator_
nb_classifier.fit(X_downsampled, y_downsampled)

y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy and misclassification rate
accuracy = round(metrics.accuracy_score(y_test, y_pred_nb) * 100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred_nb) * 100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

# Print classification report
print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred_nb, digits=3))

# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred_nb)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  16.14 %
Misclassification rate of this model:  83.86 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.315     0.110     0.163      1040
           1      0.108     0.311     0.160       360

    accuracy                          0.161      1400
   macro avg      0.211     0.210     0.161      1400
weighted avg      0.262     0.161     0.162      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  114 |                  926 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  248 |                  112 |
+-----------------+----------------------+----------------------+


### XGBoost

In [None]:
np.random.seed(32)

xgb_classifier = XGBClassifier(random_state=32)

param_grid = {
    'n_estimators': [50, 80, 100, 120, 150, 200,None],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, None],
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_downsampled, y_downsampled)

print("Best Hyperparameters:", grid_search.best_params_)

print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150}
Best Score: 0.806063107953323


In [None]:
xgb_classifier = XGBClassifier(n_estimators=150, learning_rate=0.05, max_depth = 3, random_state=32)
xgb_classifier.fit(X_downsampled, y_downsampled)
y_pred = xgb_classifier.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  14.64 %
Misclassification rate of this model:  85.36 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.252     0.076     0.117      1040
           1      0.116     0.350     0.174       360

    accuracy                          0.146      1400
   macro avg      0.184     0.213     0.145      1400
weighted avg      0.217     0.146     0.132      1400

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                   79 |                  961 |
+-----------------+----------------------+----------------------+
| Actual Positive |                  234 |                  126 |
+-----------------+----------------------+----------------------+
