In [None]:
# import libraries

# to handle the data
import pandas as pd
import numpy as np

# to visualize the dataset
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# to preprocess the data
from sklearn.preprocessing import MinMaxScaler, LabelEncoder    

# machine learning
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

# model
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.metrics import matthews_corrcoef as MCC
from sklearn.metrics import roc_curve, auc, confusion_matrix

# max columns 
pd.set_option('display.max_columns', None)

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# hyperparameter tuning
import optuna
from pyswarm import pso

# sampling
from imblearn.combine import SMOTETomek

from collections import Counter
from math import *

In [None]:
# Read the semicolon-separated bank data and shuffle the rows once with a
# fixed seed, so the shuffled order is the same on every run.
df = shuffle(
    pd.read_csv("bank-full.csv", sep=';'),
    random_state=42,
)

In [None]:
def label_binary(df_train,df_test):
    """Integer-encode the yes/no columns, fitting on train and reusing on test.

    The columns 'default', 'housing', 'loan' and the target 'y' are
    overwritten in place on both frames. For each column the encoder is
    fitted on the training values only and the same fitted mapping is then
    applied to the test values. scikit-learn's LabelEncoder numbers classes
    in sorted order, so 'no' -> 0 and 'yes' -> 1 when both are present.

    Returns the two (mutated) frames as a ``(df_train, df_test)`` tuple.
    """
    for name in ('default', 'housing', 'loan', 'y'):
        encoder = LabelEncoder().fit(df_train[name])
        df_train[name] = encoder.transform(df_train[name])
        df_test[name] = encoder.transform(df_test[name])
    return df_train, df_test

def onehot(df):
    """One-hot encode the multi-class categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain the columns 'marital', 'education',
        'contact', 'poutcome', 'month' and 'job'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which those columns are replaced by indicator
        columns. If the columns are not present, a notice is printed and the
        input frame is returned unchanged.
    """
    cat_cols = ['marital','education','contact','poutcome','month','job']
    try:
        return pd.get_dummies(df, columns=cat_cols)
    except KeyError:
        # get_dummies raises KeyError when the requested columns are absent
        # (for example, the frame was already encoded). Only that case is
        # treated as "nothing to do"; any other failure now propagates
        # instead of being hidden by a bare ``except``.
        print('there is no cat_cols in the df')
        return df

In [None]:
# Build the k cross-validation folds once; the tuning cells reuse these lists.
X_kos_folds = []      # resampled training features, one entry per fold
y_kos_folds = []      # resampled training labels, one entry per fold
df_test_folds = []    # processed hold-out frame (features + 'y'), one per fold
feat_cols_folds = []  # feature column index used for that fold
k = 5

# Rows per hold-out fold. Derived from k (previously hard-coded as // 5).
# Any remainder rows after the last fold always stay in the training part.
n = df.shape[0] // k

numeric_cols = ['age','balance','duration','campaign','pdays','previous','day']
numeirc_cols = numeric_cols  # original (misspelled) name kept as an alias

for i in range(k):

    # Contiguous positional split of the already-shuffled frame.
    df_train = pd.concat([df[:i*n], df[(i+1)*n:]])
    # Explicit copy: the hold-out rows are a slice of `df`, and new columns
    # are written to them below.
    df_test = df[i*n:(i+1)*n].copy()

    # Scale numeric variables; the scaler is fitted on the training part only
    # and then applied to the hold-out part.
    for col in numeric_cols:
        sc = MinMaxScaler()
        df_train[col+"_scaled"] = sc.fit_transform(df_train[[col]]).ravel()
        df_test[col+"_scaled"] = sc.transform(df_test[[col]]).ravel()

    # Encode categorical variables.
    df_train, df_test = label_binary(df_train, df_test)
    df_train = onehot(df_train)
    # The two parts are one-hot encoded independently, so a category missing
    # from one of them would yield different columns. Align the hold-out frame
    # to the training columns (absent indicators become 0, extras are dropped)
    # so that df_test[feat_cols] is always valid.
    df_test = onehot(df_test).reindex(columns=df_train.columns, fill_value=0)

    # Features = everything except the target and the unscaled numeric columns.
    feat_cols = df_train.columns.drop(['y'])
    feat_cols = feat_cols.drop(numeric_cols)

    # Predictors and response for this fold's training part.
    X = df_train[feat_cols]
    y = df_train['y']

    # Combined over/under-sampling, applied to the training part only.
    kos = SMOTETomek(random_state=0)
    X_kos, y_kos = kos.fit_resample(X, y)
    print(f'The Shape Of X is {X_kos.shape}')
    print(f'The Shape Of y is {y_kos.shape}')

    X_kos_folds.append(X_kos)
    y_kos_folds.append(y_kos)
    df_test_folds.append(df_test)
    feat_cols_folds.append(feat_cols)

In [None]:
# LightGBM hyper-parameter search
def objective(trial):
    """Optuna objective: mean MCC of a LightGBM classifier over the prepared folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters below.

    Returns
    -------
    float
        Average Matthews correlation coefficient on the hold-out part of
        each fold, using a 0.5 probability threshold.
    """
    Params = {
        'n_estimators': trial.suggest_int('n_estimators',800, 1200),
        'num_leaves': trial.suggest_int('num_leaves',100,300),
        'learning_rate': trial.suggest_categorical('learning_rate',[0.01,0.005,0.015,0.02,0.025]),
        # A stray "4" in front of this call made the whole cell a syntax error.
        'max_depth': trial.suggest_int('max_depth',20,80),
        'min_child_weight': trial.suggest_categorical('min_child_weight',[0.001,0.0001,0.00001,0.000001,0.0000001,0]),
        'min_child_samples': trial.suggest_int('min_child_samples',5, 32),
        'subsample': trial.suggest_categorical('subsample',[0.8,0.9,1]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree',[0.8,0.9,1]),
        'verbose':-1
    }

    lgb_mcc = []
    for fold in range(k):
        X_kos = X_kos_folds[fold]
        y_kos = y_kos_folds[fold]
        df_test = df_test_folds[fold]
        feat_cols = feat_cols_folds[fold]

        # Train on the resampled training part, score on the untouched hold-out.
        lgb_model = lgb.LGBMClassifier(**Params)
        lgb_model.fit(X_kos,y_kos)
        test_predictions = lgb_model.predict_proba(df_test[feat_cols])[:, 1]
        test_labels = (test_predictions >= 0.5).astype(int)
        lgb_mcc.append(MCC(df_test['y'], test_labels))

    # Average over the folds actually evaluated (was a hard-coded / 5).
    return sum(lgb_mcc)/len(lgb_mcc)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50,show_progress_bar=True)

In [None]:
# Report the best trial of the current study and render Optuna's diagnostics.
best_params = study.best_params
print(f"Best Params: {best_params}")
for plot_fn in (
    optuna.visualization.plot_param_importances,
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_slice,
):
    plot_fn(study).show()

In [None]:
# CatBoostClassifier hyper-parameter search
def objective(trial):
    """Optuna objective: mean MCC of a CatBoost classifier over the prepared folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample depth, learning_rate, subsample and rsm.

    Returns
    -------
    float
        Average Matthews correlation coefficient on the hold-out part of
        each fold, using a 0.5 probability threshold.
    """
    Params = {
        'eval_metric':'AUC',
        'iterations': 1000,
        'depth': trial.suggest_int('depth',1,16),
        'learning_rate': trial.suggest_categorical('learning_rate',[0.005 * i for i in range(1,21)]),
        'subsample': trial.suggest_categorical('subsample',[0.8,0.9,1]),
        'rsm': trial.suggest_categorical('rsm',[0.01*i for i in range(1,21)]),
        'verbose': False
             }

    cat_mcc = []
    for fold in range(k):
        X_kos = X_kos_folds[fold]
        y_kos = y_kos_folds[fold]
        df_test = df_test_folds[fold]
        feat_cols = feat_cols_folds[fold]

        # Every non-float64 column is passed to CatBoost as categorical.
        cat_features = np.where(X_kos.dtypes != np.float64)[0]

        # Train once per fold. The previous version trained a second model
        # without cat_features and appended a second MCC, so the returned
        # "mean" was the sum of 2*k scores divided by 5.
        cat_model = CatBoostClassifier(**Params)
        train_pool = Pool(X_kos, y_kos, cat_features=cat_features)
        cat_model.fit(train_pool)

        # Score on the hold-out part of this fold.
        test_pool = Pool(df_test[feat_cols], cat_features=cat_features)
        test_predictions = cat_model.predict_proba(test_pool)[:, 1]
        test_labels = (test_predictions >= 0.5).astype(int)
        cat_mcc.append(MCC(df_test['y'], test_labels))

    # Average over the folds actually evaluated (was a hard-coded / 5).
    return sum(cat_mcc)/len(cat_mcc)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50,show_progress_bar=True)

In [None]:
# Report the best trial of the current study and render Optuna's diagnostics.
best_params = study.best_params
print(f"Best Params: {best_params}")
for plot_fn in (
    optuna.visualization.plot_param_importances,
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_slice,
):
    plot_fn(study).show()

In [None]:
# XGBoost hyper-parameter search
def objective(trial):
    """Optuna objective: mean MCC of an XGBoost classifier over the prepared folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters below.

    Returns
    -------
    float
        Average Matthews correlation coefficient on the hold-out part of
        each fold, using a 0.5 probability threshold.
    """
    Params = {
        # Was `ttrial.suggest_int(...)`, which raised NameError on every trial.
        'n_estimators': trial.suggest_int('n_estimators',10,1000),
        # NOTE(review): 'num_leaves' and 'min_child_samples' are LightGBM
        # parameter names; XGBoost's documented counterparts appear to be
        # 'max_leaves' / 'min_child_weight'. Confirm whether these two entries
        # have any effect before interpreting their importances.
        'num_leaves': trial.suggest_int('num_leaves', 100, 1000),
        'learning_rate': trial.suggest_categorical('learning_rate',[0.005 * i for i in range(1,21)]),
        'max_depth':trial.suggest_int('max_depth',10,50),
        'min_child_weight': trial.suggest_categorical('min_child_weight',[0.01,0.001,0.0001,0.00001,0.000001,0]),
        'min_child_samples': trial.suggest_int('min_child_samples',10, 50),
        'subsample': trial.suggest_categorical('subsample',[0.8,0.9,1]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree',[0.8,0.9,1]),
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
             }

    xgb_mcc = []
    for fold in range(k):
        X_kos = X_kos_folds[fold]
        y_kos = y_kos_folds[fold]
        df_test = df_test_folds[fold]
        feat_cols = feat_cols_folds[fold]

        # Train on the resampled training part, score on the untouched hold-out.
        xgb_model = xgb.XGBClassifier(**Params)
        xgb_model.fit(X_kos,y_kos)
        test_predictions = xgb_model.predict_proba(df_test[feat_cols])[:, 1]
        test_labels = (test_predictions >= 0.5).astype(int)
        xgb_mcc.append(MCC(df_test['y'], test_labels))

    # Average over the folds actually evaluated (was a hard-coded / 5).
    return sum(xgb_mcc)/len(xgb_mcc)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50,show_progress_bar=True)

In [None]:
# Report the best trial of the current study and render Optuna's diagnostics.
best_params = study.best_params
print(f"Best Params: {best_params}")
for plot_fn in (
    optuna.visualization.plot_param_importances,
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_slice,
):
    plot_fn(study).show()

In [None]:
# Random Forest hyper-parameter search
def objective(trial):
    """Optuna objective: mean MCC of a random forest over the prepared folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample n_estimators and max_features.

    Returns
    -------
    float
        Average Matthews correlation coefficient on the hold-out part of
        each fold, using a 0.5 probability threshold.
    """
    Params = {
        'n_estimators': trial.suggest_int('n_estimators',10,1000),
        # Registered under its real name. It was previously sampled as
        # 'num_leaves', so study.best_params and the Optuna plots labelled
        # this value with a parameter RandomForestClassifier does not have.
        'max_features': trial.suggest_int('max_features', 5, 32),
             }

    rf_mcc = []
    for fold in range(k):
        X_kos = X_kos_folds[fold]
        y_kos = y_kos_folds[fold]
        df_test = df_test_folds[fold]
        feat_cols = feat_cols_folds[fold]

        # Train on the resampled training part, score on the untouched hold-out.
        rf_model = RandomForestClassifier(**Params)
        rf_model.fit(X_kos,y_kos)
        test_predictions = rf_model.predict_proba(df_test[feat_cols])[:, 1]
        test_labels = (test_predictions >= 0.5).astype(int)
        rf_mcc.append(MCC(df_test['y'], test_labels))

    # Average over the folds actually evaluated (was a hard-coded / 5).
    return sum(rf_mcc)/len(rf_mcc)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50,show_progress_bar=True)

In [None]:
# Report the best trial of the current study and render Optuna's diagnostics.
best_params = study.best_params
print(f"Best Params: {best_params}")
for plot_fn in (
    optuna.visualization.plot_param_importances,
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_slice,
):
    plot_fn(study).show()

---
## CatBoost tuned with PSO: performance visualization and feature importances

In [None]:
# PSO objective: negated mean MCC of CatBoost over the prepared folds
def objective(params):
    """Score one PSO particle.

    ``params`` is unpacked as (depth, learning_rate, subsample, rsm); depth is
    truncated to an int before being handed to CatBoost. The return value is
    the negated average MCC over the k folds, because pso() minimizes.
    """
    depth, learning_rate, subsample, rsm = params
    cat_params = dict(
        eval_metric='AUC',
        iterations=1000,
        depth=int(depth),
        learning_rate=learning_rate,
        subsample=subsample,
        rsm=rsm,
        verbose=False,
    )

    fold_scores = []
    for fold in range(k):
        train_X, train_y = X_kos_folds[fold], y_kos_folds[fold]
        holdout = df_test_folds[fold]
        columns = feat_cols_folds[fold]

        # Every non-float64 column is passed to CatBoost as categorical.
        categorical_idx = np.where(train_X.dtypes != np.float64)[0]

        # Fit on the resampled training part of this fold.
        model = CatBoostClassifier(**cat_params)
        model.fit(Pool(train_X, train_y, cat_features=categorical_idx))

        # Positive-class probabilities on the hold-out part, cut at 0.5.
        holdout_pool = Pool(holdout[columns], cat_features=categorical_idx)
        positive_proba = model.predict_proba(holdout_pool)[:, 1]
        predicted = [p >= 0.5 for p in positive_proba]

        fold_scores.append(MCC(holdout['y'], predicted))

    return -sum(fold_scores)/k

# Search box, in the order depth, learning_rate, subsample, rsm.
lb = [1, 0.005, 0.8, 0.01]
ub = [16, 0.1, 1.0, 0.2]

# Run the swarm; the second return value is the minimized (negated) score.
best_params, best_mcc = pso(objective, lb, ub, swarmsize=1, maxiter=10,debug=True)

print(f"Best parameters: {best_params}")
print(f"Best MCC: {-best_mcc}")


In [None]:
# Visualization: ROC curve, confusion matrix, feature importance, AUC score change
def full_visualization(best_params):
    """Train CatBoost on every fold and draw a 2x2 summary figure.

    Parameters
    ----------
    best_params : sequence
        Indexed positionally as (depth, learning_rate, subsample, rsm);
        depth is truncated to an int.

    The four panels are: per-fold and mean ROC curves, a confusion matrix,
    a feature-importance bar chart, and the AUC of each fold. Reads the
    notebook-level fold lists and ``k``; returns nothing and shows the figure.
    """
    cat_params = {
        'eval_metric': 'AUC',
        'iterations': 1000,
        'depth': int(best_params[0]),
        'learning_rate': best_params[1],
        'subsample': best_params[2],
        'rsm': best_params[3],
        'verbose': False
    }

    # Common FPR grid so the per-fold TPR curves can be averaged point-wise.
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    feature_importances = None
    plt.figure(figsize=(14, 12))

    # 1. Plot ROC curves and collect AUC scores
    plt.subplot(2, 2, 1)
    for i in range(k):
        X_kos = X_kos_folds[i]
        y_kos = y_kos_folds[i]
        df_test = df_test_folds[i]
        feat_cols = feat_cols_folds[i]

        # Training model
        cat_model = CatBoostClassifier(**cat_params)
        # Every non-float64 column is passed to CatBoost as categorical.
        cat_features = np.where(X_kos.dtypes != np.float64)[0]
        train_pool = Pool(X_kos, y_kos, cat_features=cat_features)
        cat_model.fit(train_pool)

        # Record the importance of features and save them only once
        # (i.e. the values come from the first fold's model).
        if feature_importances is None:
            feature_importances = cat_model.get_feature_importance()

        # Calculated ROC curve
        test_pool = Pool(df_test[feat_cols], cat_features=cat_features)
        y_proba = cat_model.predict_proba(test_pool)[:, 1]
        fpr, tpr, _ = roc_curve(df_test['y'], y_proba)
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        # Resample this fold's curve onto the common grid and pin it to (0, 0).
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        
        # Plot the ROC curve of this fold
        plt.plot(fpr, tpr, lw=2, alpha=0.3, label=f'Fold {i+1} ROC (AUC = {roc_auc:.4f})')

    # Plot the mean ROC curve across folds, plus the chance diagonal
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='b', lw=2, alpha=0.8, label=f'Mean ROC (AUC = {mean_auc:.4f})')
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=0.8)
    plt.title('ROC Curves for 5-Fold Cross-Validation', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.legend(loc="lower right", fontsize=10)
    plt.grid(True)

    # 2. Plot confusion matrix
    plt.subplot(2, 2, 2)
    # Visualization of the confusion matrix using the last folded test set:
    # y_proba and df_test still hold the values from the final loop iteration.
    y_pred = [1 if x >= 0.5 else 0 for x in y_proba]
    cm = confusion_matrix(df_test['y'], y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.title(f'Confusion Matrix for Last Fold', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)

    # 3. Plot feature importance
    # NOTE(review): the importances come from the first fold's model while
    # feat_cols is the last fold's column index; this assumes every fold has
    # the same feature columns in the same order - confirm.
    plt.subplot(2, 2, 3)
    sns.barplot(x=feature_importances, y=feat_cols)
    plt.title('Feature Importance', fontsize=14)
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)

    # 4. Plot the AUC score as the fold changes
    plt.subplot(2, 2, 4)
    plt.plot(range(1, k+1), aucs, marker='o')
    plt.title('AUC Scores per Fold', fontsize=14)
    plt.xlabel('Fold', fontsize=12)
    plt.ylabel('AUC Score', fontsize=12)
    plt.grid(True)

    plt.tight_layout()
    plt.show()


full_visualization(best_params)

In [None]:
# Horizontal bar chart of the 20 largest feature importances, with value labels
def plot_top_feature_importances(feature_importances, feat_cols):
    """Draw the 20 most important features as an annotated bar chart.

    ``feature_importances`` is indexed with an integer array, and
    ``feat_cols`` supplies the matching feature names by position.
    Nothing is returned; the figure is shown.
    """
    # Indices of the importances from largest to smallest, truncated to 20.
    descending = np.argsort(feature_importances)[::-1]
    keep = descending[:20]
    names = np.array(feat_cols)[keep]
    scores = feature_importances[keep]

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x=scores, y=names, palette='coolwarm', ax=ax)
    ax.set_title('Top 20 Feature Importances', fontsize=16, fontweight='bold')
    ax.set_xlabel('Importance', fontsize=14)
    ax.set_ylabel('Feature', fontsize=14)

    # Write each bar's value just to the right of the bar.
    for row in range(len(scores)):
        score = scores[row]
        ax.text(score + 0.005, row, f'{score:.4f}', va='center', fontsize=12)

    plt.tight_layout()
    plt.show()

# Fit CatBoost on the first fold and chart its feature importances
def get_feature_importances_and_plot(best_params):
    """Train one CatBoost model on fold 0 and plot its top feature importances.

    ``best_params`` is indexed positionally as
    (depth, learning_rate, subsample, rsm); depth is truncated to an int.
    The raw importance array is printed before plotting.
    """
    cat_params = dict(
        eval_metric='AUC',
        iterations=1000,
        depth=int(best_params[0]),
        learning_rate=best_params[1],
        subsample=best_params[2],
        rsm=best_params[3],
        verbose=False,
    )

    # Only the first fold's resampled training data is used here.
    train_X, train_y, columns = X_kos_folds[0], y_kos_folds[0], feat_cols_folds[0]

    # Every non-float64 column is passed to CatBoost as categorical.
    categorical_idx = np.where(train_X.dtypes != np.float64)[0]
    model = CatBoostClassifier(**cat_params)
    model.fit(Pool(train_X, train_y, cat_features=categorical_idx))

    feature_importances = model.get_feature_importance()
    print(feature_importances)
    plot_top_feature_importances(feature_importances, columns)


get_feature_importances_and_plot(best_params)