In [None]:
# import libraries

# to handle the data
import pandas as pd
import numpy as np

# to visualize the dataset
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# to preprocess the data
from sklearn.preprocessing import MinMaxScaler, LabelEncoder    

# machine learning
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

# sampling
from imblearn.combine import SMOTETomek

# model
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

# evaluation
from sklearn.metrics import matthews_corrcoef as MCC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1

# adjust parameters
import optuna
from pyswarm import pso

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# adjust parameters
import optuna

# max columns 
pd.set_option('display.max_columns', None)

from collections import Counter
from math import *

In [None]:
# Read the semicolon-separated file "bank-full.csv" and shuffle its rows with
# a fixed seed, so every run sees the rows in the same order.
df = shuffle(pd.read_csv("bank-full.csv", sep=';'), random_state=42)

In [None]:
def label_binary(df_train, df_test):
    """Label-encode the binary columns of a train/test pair.

    For each of the columns 'default', 'housing', 'loan' and 'y', a fresh
    ``LabelEncoder`` is fitted on ``df_train`` and the same fitted mapping is
    applied to ``df_test``. ``LabelEncoder`` numbers classes in sorted order,
    so a column holding only 'no'/'yes' becomes no = 0, yes = 1.

    Both inputs are copied before encoding, so the frames passed in by the
    caller (which may be slices of a larger frame) are never modified.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; the encoders are fitted on its columns.
    df_test : pandas.DataFrame
        Held-out frame; only transformed, never used for fitting.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` as encoded copies.

    Raises
    ------
    ValueError
        From ``LabelEncoder.transform`` when ``df_test`` contains a label that
        does not occur in ``df_train``.
    """
    binary_cols = ['default', 'housing', 'loan', 'y']

    # Work on copies: assigning into a slice of another frame is unreliable
    # and silently changes data the caller still holds.
    df_train = df_train.copy()
    df_test = df_test.copy()

    for col in binary_cols:
        # one encoder per column keeps each column's mapping independent
        encoder = LabelEncoder()
        df_train[col] = encoder.fit_transform(df_train[col])
        df_test[col] = encoder.transform(df_test[col])
    return df_train, df_test

def onehot(df):
    """One-hot encode the multi-class categorical columns of ``df``.

    The columns 'marital', 'education', 'contact', 'poutcome', 'month' and
    'job' are replaced by indicator columns via ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A new encoded frame, or ``df`` itself (unchanged) when the categorical
        columns are not all present; in that case a message is printed.
    """
    cat_cols = ['marital', 'education', 'contact', 'poutcome', 'month', 'job']
    try:
        return pd.get_dummies(df, columns=cat_cols)
    except KeyError:
        # get_dummies raises KeyError when a requested column is missing.
        # Only that case is treated as "nothing to encode"; any other error
        # (or a keyboard interrupt) is allowed to propagate.
        print('there is no cat_cols in the df')
        return df

----
## Data processing

Code for data exploration is in bank_EDA.ipynb

In [None]:
# Build k positional folds from the shuffled frame. For every fold, scaling and
# label encoding are fitted on the training part only and then applied to the
# held-out part; only the training part is resampled with SMOTETomek.
X_kos_folds = []      # resampled training features, one entry per fold
y_kos_folds = []      # resampled training targets, one entry per fold
df_test_folds = []    # processed held-out frame (features + 'y'), one per fold
feat_cols_folds = []  # feature column names used by that fold
k = 5

# raw numerical columns; the models use the "<col>_scaled" versions instead
numeric_cols = ['age','balance','duration','campaign','pdays','previous','day']
numeirc_cols = numeric_cols  # original (misspelled) name, kept in case another cell reads it

# Fold size. Rows beyond k*n (the division remainder) are never held out and
# therefore stay in every training split.
n = df.shape[0]//k

for i in range(k):

    # positional split: rows [i*n, (i+1)*n) are held out, the rest is training.
    # .copy() makes the held-out part an independent frame so that columns can
    # be added to it without writing into a slice of df.
    df_train = pd.concat([df[:i*n],df[(i+1)*n:]])
    df_test = df[i*n:(i+1)*n].copy()

    # process numerical variables: min-max scaling fitted on the training part only
    for col in numeric_cols:
        sc = MinMaxScaler()
        df_train[col+"_scaled"] = sc.fit_transform(df_train[[col]]).ravel()
        df_test[col+"_scaled"] = sc.transform(df_test[[col]]).ravel()

    # process categorical variables
    df_train,df_test = label_binary(df_train,df_test)
    df_train = onehot(df_train)
    df_test = onehot(df_test)

    # The two frames are one-hot encoded independently, so a category that is
    # absent from one of them would give them different columns. Force the
    # held-out frame onto the training layout (missing indicators become 0,
    # indicators unknown to training are dropped) and the training dtypes.
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
    df_test = df_test.astype(df_train.dtypes.to_dict())

    # Selecting Columns For use: everything except the target and the raw
    # (unscaled) numerical columns
    feat_cols = df_train.columns.drop(['y']).drop(numeric_cols)

    # choose predictor and responser
    X=df_train[feat_cols]
    y=df_train['y']

    # combining sampling (SMOTE over-sampling + Tomek-link cleaning), training part only
    kos = SMOTETomek(random_state=0)
    X_kos, y_kos = kos.fit_resample(X, y)
    print('y_kos class:{}'.format(Counter(y_kos)))
    print(f'The Shape Of X is {X_kos.shape}')
    print(f'The Shape Of y is {y_kos.shape}')

    X_kos_folds.append(X_kos)
    y_kos_folds.append(y_kos)
    df_test_folds.append(df_test)
    feat_cols_folds.append(feat_cols)

print('Finish')

---
## First iteration

In [None]:
# Naive Bayes
# Fit a GaussianNB on each fold's resampled training data and score it on that
# fold's held-out rows; per-fold metrics are collected in the nb_* lists.
nb_mcc = []
nb_cm = []
nb_accuracy = []
nb_precision = []
nb_recall = []
nb_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    nb_model = GaussianNB()
    nb_model.fit(X_kos,y_kos)

    test_labels = nb_model.predict(df_test[feat_cols])
    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    nb_cm.append(cm)
    nb_mcc.append(mcc)
    print(cm,mcc)

    nb_accuracy.append(accuracy(y_true, test_labels))
    nb_precision.append(precision(y_true, test_labels))
    nb_recall.append(recall(y_true, test_labels))
    nb_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(nb_mcc)/len(nb_mcc))
print("Mean accuracy:", sum(nb_accuracy)/len(nb_accuracy))
print("Mean precision:", sum(nb_precision)/len(nb_precision))
print("Mean recall:", sum(nb_recall)/len(nb_recall))
print("Mean f1:", sum(nb_f1)/len(nb_f1))

In [None]:
# Decision Tree
# Fit an entropy-criterion decision tree on each fold's resampled training data
# and score it on that fold's held-out rows; per-fold metrics go to the tr_* lists.
tr_mcc = []
tr_cm = []
tr_accuracy = []
tr_precision = []
tr_recall = []
tr_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # random_state pins the tree's internal randomness so that re-running the
    # notebook reproduces the same scores (numbers from earlier unseeded runs
    # may differ slightly)
    tr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state=42)
    tr_model.fit(X_kos,y_kos)

    test_labels = tr_model.predict(df_test[feat_cols])
    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    tr_cm.append(cm)
    tr_mcc.append(mcc)
    print(cm,mcc)

    tr_accuracy.append(accuracy(y_true, test_labels))
    tr_precision.append(precision(y_true, test_labels))
    tr_recall.append(recall(y_true, test_labels))
    tr_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(tr_mcc)/len(tr_mcc))
print("Mean accuracy:", sum(tr_accuracy)/len(tr_accuracy))
print("Mean precision:", sum(tr_precision)/len(tr_precision))
print("Mean recall:", sum(tr_recall)/len(tr_recall))
print("Mean f1:", sum(tr_f1)/len(tr_f1))

In [None]:
# SVM
# Fit an RBF-kernel SVC on each fold's resampled training data and score it on
# that fold's held-out rows; per-fold metrics are collected in the svm_* lists.
svm_mcc = []
svm_cm = []
svm_accuracy = []
svm_precision = []
svm_recall = []
svm_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # train the model
    svm_model = svm.SVC(kernel='rbf')
    svm_model.fit(X_kos,y_kos)

    test_labels = svm_model.predict(df_test[feat_cols])
    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    svm_cm.append(cm)
    svm_mcc.append(mcc)
    print(cm,mcc)

    svm_accuracy.append(accuracy(y_true, test_labels))
    svm_precision.append(precision(y_true, test_labels))
    svm_recall.append(recall(y_true, test_labels))
    svm_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(svm_mcc)/len(svm_mcc))
print("Mean accuracy:", sum(svm_accuracy)/len(svm_accuracy))
print("Mean precision:", sum(svm_precision)/len(svm_precision))
print("Mean recall:", sum(svm_recall)/len(svm_recall))
print("Mean f1:", sum(svm_f1)/len(svm_f1))

---
## Second iteration

The hyperparameter tuning code (Optuna and PSO) is in bank_tunning.ipynb

In [None]:
# Random Forest
# Fit a random forest with the fixed hyperparameters below on each fold's
# resampled training data and score it on that fold's held-out rows.
rf_params = {
    'n_estimators': 414,
    'max_features': 14,
    # pin the forest's randomness so a re-run reproduces the same scores
    # (numbers from earlier unseeded runs may differ slightly)
    'random_state': 42,
         }

rf_mcc = []
rf_cm = []
rf_accuracy = []
rf_precision = []
rf_recall = []
rf_auc = []
rf_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # train the model
    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(X_kos,y_kos)

    # probability of the positive class, thresholded at 0.5 for the hard labels
    test_predictions = rf_model.predict_proba(df_test[feat_cols])[:, 1]
    test_labels = (test_predictions >= 0.5).astype(int)

    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    rf_cm.append(cm)
    rf_mcc.append(mcc)
    print(cm,mcc)

    rf_accuracy.append(accuracy(y_true, test_labels))
    rf_precision.append(precision(y_true, test_labels))
    rf_recall.append(recall(y_true, test_labels))
    # ROC AUC is computed from the probabilities, not from the hard labels
    rf_auc.append(auc(y_true, test_predictions))
    rf_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(rf_mcc)/len(rf_mcc))
print("Mean accuracy:", sum(rf_accuracy)/len(rf_accuracy))
print("Mean precision:", sum(rf_precision)/len(rf_precision))
print("Mean recall:", sum(rf_recall)/len(rf_recall))
print("Mean roc_auc:", sum(rf_auc)/len(rf_auc))
print("Mean f1:", sum(rf_f1)/len(rf_f1))

In [None]:
# LightGBM
# Fit an LGBMClassifier with the fixed hyperparameters below on each fold's
# resampled training data and score it on that fold's held-out rows.
lgb_params = {
        'n_estimators': 1100,
        'num_leaves': 104,
        'max_depth': 23, 
        'learning_rate': 0.005,
        'min_child_weight': 1e-06,
        'min_child_samples': 19, 
        # NOTE(review): the LightGBM docs say row subsampling only takes effect
        # when 'subsample_freq' is non-zero; it is not set here, so this value
        # is presumably inert - confirm whether bagging was intended.
        'subsample': 0.9,
        'colsample_bytree': 1, 
        'verbose':-1
            }

lgb_mcc = []
lgb_cm = []
lgb_accuracy = []
lgb_precision = []
lgb_recall = []
lgb_auc = []
lgb_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # train the model
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_kos,y_kos)

    # probability of the positive class, thresholded at 0.5 for the hard labels
    test_predictions = lgb_model.predict_proba(df_test[feat_cols])[:, 1]
    test_labels = (test_predictions >= 0.5).astype(int)

    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    lgb_cm.append(cm)
    lgb_mcc.append(mcc)
    print(cm,mcc)

    lgb_accuracy.append(accuracy(y_true, test_labels))
    lgb_precision.append(precision(y_true, test_labels))
    lgb_recall.append(recall(y_true, test_labels))
    # ROC AUC is computed from the probabilities, not from the hard labels
    lgb_auc.append(auc(y_true, test_predictions))
    lgb_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(lgb_mcc)/len(lgb_mcc))
print("Mean accuracy:", sum(lgb_accuracy)/len(lgb_accuracy))
print("Mean precision:", sum(lgb_precision)/len(lgb_precision))
print("Mean recall:", sum(lgb_recall)/len(lgb_recall))
print("Mean roc_auc:", sum(lgb_auc)/len(lgb_auc))
print("Mean f1:", sum(lgb_f1)/len(lgb_f1))

In [None]:
# CatBoostClassifier
# Fit a CatBoostClassifier with the fixed hyperparameters below on each fold's
# resampled training data and score it on that fold's held-out rows.
cat_params = {
    'eval_metric':'AUC',
    'iterations':1000,
    'learning_rate':0.025,
    'depth':6,
    'rsm':0.11,
    'subsample':1,
    'verbose':False
}

cat_mcc = []
cat_cm = []
cat_accuracy = []
cat_precision = []
cat_recall = []
cat_auc = []
cat_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # Cat_features: positions of every column whose dtype is not float64;
    # these are handed to CatBoost as categorical features
    cat_features = np.where(X_kos.dtypes != np.float64)[0]

    # Train the model on this fold's resampled training split
    cat_model = CatBoostClassifier(**cat_params)
    train_pool = Pool(X_kos, y_kos, cat_features=cat_features)
    cat_model.fit(train_pool)

    # Make predictions on the held-out rows: positive-class probability,
    # thresholded at 0.5 for the hard labels
    test_pool = Pool(df_test[feat_cols], cat_features=cat_features)
    test_predictions = cat_model.predict_proba(test_pool)[:, 1]
    test_labels = (test_predictions >= 0.5).astype(int)

    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    cat_cm.append(cm)
    cat_mcc.append(mcc)
    print(cm,mcc)

    cat_accuracy.append(accuracy(y_true, test_labels))
    cat_precision.append(precision(y_true, test_labels))
    cat_recall.append(recall(y_true, test_labels))
    # ROC AUC is computed from the probabilities, not from the hard labels
    cat_auc.append(auc(y_true, test_predictions))
    cat_f1.append(f1(y_true, test_labels))


# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(cat_mcc)/len(cat_mcc))
print("Mean accuracy:", sum(cat_accuracy)/len(cat_accuracy))
print("Mean precision:", sum(cat_precision)/len(cat_precision))
print("Mean recall:", sum(cat_recall)/len(cat_recall))
print("Mean roc_auc:", sum(cat_auc)/len(cat_auc))
print("Mean f1:", sum(cat_f1)/len(cat_f1))

In [None]:
# XGBoost Parameters
# Fit an XGBClassifier with the fixed hyperparameters below on each fold's
# resampled training data and score it on that fold's held-out rows.
#
# 'num_leaves' (994) and 'min_child_sampes' (23) were removed from this dict:
# they are LightGBM-style / misspelled names that XGBoost does not recognise,
# so it reported them as unused and they never influenced the model.
xgb_params = {
    'n_estimators': 736,
    'max_depth': 11,
    'learning_rate': 0.01,
    'min_child_weight': 0.0001,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}
xgb_mcc = []
xgb_cm = []
xgb_accuracy = []
xgb_precision = []
xgb_recall = []
xgb_auc = []
xgb_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # train the model
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_kos,y_kos)

    # probability of the positive class, thresholded at 0.5 for the hard labels
    test_predictions = xgb_model.predict_proba(df_test[feat_cols])[:, 1]
    test_labels = (test_predictions >= 0.5).astype(int)

    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    xgb_cm.append(cm)
    xgb_mcc.append(mcc)
    print(cm,mcc)

    xgb_accuracy.append(accuracy(y_true, test_labels))
    xgb_precision.append(precision(y_true, test_labels))
    xgb_recall.append(recall(y_true, test_labels))
    # ROC AUC is computed from the probabilities, not from the hard labels
    xgb_auc.append(auc(y_true, test_predictions))
    xgb_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(xgb_mcc)/len(xgb_mcc))
print("Mean accuracy:", sum(xgb_accuracy)/len(xgb_accuracy))
print("Mean precision:", sum(xgb_precision)/len(xgb_precision))
print("Mean recall:", sum(xgb_recall)/len(xgb_recall))
print("Mean roc_auc:", sum(xgb_auc)/len(xgb_auc))
print("Mean f1:", sum(xgb_f1)/len(xgb_f1))

---
## Third iteration

In [None]:
# CatBoostClassifier with PSO
# Same evaluation loop as the plain CatBoost run, with a different set of
# hyperparameters. Note that this rebinds the name cat_params.
cat_params = {
    'eval_metric':'AUC',
    'iterations':1000,
    'learning_rate':0.0098,
    'depth':9,
    'rsm':0.1287,
    'subsample':0.8294,
    'verbose':False
}

catP_mcc = []
catP_cm = []
catP_accuracy = []
catP_precision = []
catP_recall = []
catP_auc = []
catP_f1 = []

for i in range(k):
    X_kos = X_kos_folds[i]
    y_kos = y_kos_folds[i]
    df_test = df_test_folds[i]
    feat_cols = feat_cols_folds[i]
    y_true = df_test['y']

    # Cat_features: positions of every column whose dtype is not float64;
    # these are handed to CatBoost as categorical features
    catP_features = np.where(X_kos.dtypes != np.float64)[0]

    # Train the model on this fold's resampled training split
    catP_model = CatBoostClassifier(**cat_params)
    train_pool = Pool(X_kos, y_kos, cat_features=catP_features)
    catP_model.fit(train_pool)

    # Make predictions on the held-out rows: positive-class probability,
    # thresholded at 0.5 for the hard labels
    test_pool = Pool(df_test[feat_cols], cat_features=catP_features)
    test_predictions = catP_model.predict_proba(test_pool)[:, 1]
    test_labels = (test_predictions >= 0.5).astype(int)

    cm = confusion_matrix(y_true, test_labels)
    mcc = MCC(y_true, test_labels)
    catP_cm.append(cm)
    catP_mcc.append(mcc)
    print(cm,mcc)

    catP_accuracy.append(accuracy(y_true, test_labels))
    catP_precision.append(precision(y_true, test_labels))
    catP_recall.append(recall(y_true, test_labels))
    # ROC AUC is computed from the probabilities, not from the hard labels
    catP_auc.append(auc(y_true, test_predictions))
    catP_f1.append(f1(y_true, test_labels))

# average over the folds that were actually evaluated instead of a hard-coded 5
print("Mean MCC:", sum(catP_mcc)/len(catP_mcc))
print("Mean accuracy:", sum(catP_accuracy)/len(catP_accuracy))
print("Mean precision:", sum(catP_precision)/len(catP_precision))
print("Mean recall:", sum(catP_recall)/len(catP_recall))
print("Mean roc_auc:", sum(catP_auc)/len(catP_auc))
print("Mean f1:", sum(catP_f1)/len(catP_f1))

In [None]:
# Create data for the plot
# Each classifier label is paired with its list of per-fold MCC values, so a
# label can never drift out of step with its score.
fold_mcc_by_classifier = {
    'CatBoost with PSO': catP_mcc,
    'LightGBM': lgb_mcc,
    'CatBoost': cat_mcc,
    'XGBoost': xgb_mcc,
    'RandomForest': rf_mcc,
    'SVM': svm_mcc,
    'NaiveBayes': nb_mcc,
    'DecisionTree': tr_mcc,
}
classifiers = list(fold_mcc_by_classifier)
# mean over the folds that were actually evaluated instead of a hard-coded 5
mcc_scores = [sum(scores)/len(scores) for scores in fold_mcc_by_classifier.values()]

# Create the figure: one bar per classifier, bar height = mean MCC
fig = go.Figure(data=[go.Bar(
    x=classifiers,
    y=mcc_scores,
    name='MCC',  # the trace shows MCC; it was previously mislabeled 'Accuracy'
)])

# Update layout with optimized settings
fig.update_layout(
    title='MCC Comparison',
    xaxis_title='Classifier',
    yaxis_title='MCC Score',
    template='plotly_white',
    font=dict(family="Arial", size=12),
    width=600,
    margin=dict(l=50, r=50, t=50, b=50)
)

# Add gridlines
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#E0E0E0')

# Show the plot
fig.show()