In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# IMPORTS
!pip install shap -q

# GPU Specific Installation for LightGBM
!pip uninstall -y lightgbm
!pip install lightgbm --config-settings=cmake.define.USE_GPU=ON

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import shap

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import Pipeline

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

# Models
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Metrics
from sklearn.metrics import (
    classification_report,
    ConfusionMatrixDisplay,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score
)

print("Libraries imported successfully.")

Found existing installation: lightgbm 4.6.0
Uninstalling lightgbm-4.6.0:
  Successfully uninstalled lightgbm-4.6.0
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Libraries imported successfully.


In [None]:
# DATA LOADING AND FEATURE ENGINEERING
# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/My Drive/train.csv'
df = pd.read_csv(path)
print("Data loaded successfully.")

def feature_engineer(data):
    """Creates interaction features from key business variables."""
    df_eng = data.copy()
    if 'id' in df_eng.columns:
        df_eng = df_eng.drop('id', axis=1)
    df_eng['balance_per_age'] = df_eng['balance'] / (df_eng['age'] + 1)
    df_eng['duration_x_campaign'] = df_eng['duration'] * df_eng['campaign']
    return df_eng

df_engineered = feature_engineer(df)
print("Feature engineering complete.")

Mounted at /content/drive
Data loaded successfully.
Feature engineering complete.


In [None]:
# PREPROCESSING PIPELINE DEFINITION
X = df_engineered.drop('y', axis=1)
y = df_engineered['y']

numeric_features = X.select_dtypes(include=np.number).columns.tolist()
ordinal_features = ['education']
nominal_features = X.select_dtypes(include=['object']).columns.drop(ordinal_features, errors='ignore').tolist()
education_order = ['unknown', 'primary', 'secondary', 'tertiary']

numeric_transformer = StandardScaler()
ordinal_transformer = OrdinalEncoder(categories=[education_order], handle_unknown='use_encoded_value', unknown_value=-1)
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ],
    remainder='drop'
)
print("Preprocessing pipeline created successfully.")

Preprocessing pipeline created successfully.


In [None]:
# DATA SPLITTING
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print(f"Data split into training set ({X_train.shape[0]} samples) and test set ({X_test.shape[0]} samples).")


Data split into training set (600000 samples) and test set (150000 samples).


In [None]:
# PREPROCESSING AND RESAMPLING
print("\n--- Applying Preprocessing and SMOTE Resampling ONCE ---")

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE
smote = SMOTE(random_state=42, sampling_strategy=0.8)
X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

print(f"Data processing and resampling complete.")
print(f"Shape of final training data (X_resampled): {X_resampled.shape}")


--- Applying Preprocessing and SMOTE Resampling ONCE ---
Data processing and resampling complete.
Shape of final training data (X_resampled): (949698, 50)


In [None]:
# HYPERPARAMETER TUNING
advanced_models = {
    'LightGBM': lgb.LGBMClassifier(random_state=42, device='gpu'),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', tree_method='gpu_hist'),
    'CatBoost': cb.CatBoostClassifier(random_state=42, verbose=0, task_type='GPU')
}

param_grids = {
    'LightGBM': {'n_estimators': [300, 500], 'learning_rate': [0.05, 0.1], 'num_leaves': [50, 70], 'class_weight': ['balanced']},
    'XGBoost': {'n_estimators': [300, 500], 'learning_rate': [0.05, 0.1], 'max_depth': [6, 8], 'scale_pos_weight': [len(y_train[y_train==0]) / len(y_train[y_train==1])]},
    'CatBoost': {'iterations': [300, 500], 'learning_rate': [0.05, 0.1], 'depth': [6, 8], 'auto_class_weights': ['Balanced']}
}
tuned_models = {}
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for name, model in advanced_models.items():
    print(f"\n--- Tuning {name} on GPU ---")
    search = RandomizedSearchCV(model, param_distributions=param_grids[name], n_iter=10, cv=kfold, scoring='average_precision', n_jobs=-1, random_state=42, verbose=1)
    search.fit(X_resampled, y_resampled)
    tuned_models[name] = search.best_estimator_
    print(f"Best PR-AUC for {name}: {search.best_score_:.4f}")


--- Tuning LightGBM on GPU ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits




[LightGBM] [Info] Number of positive: 422088, number of negative: 527610
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12567
[LightGBM] [Info] Number of data points in the train set: 949698, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (10.87 MB) transferred to GPU in 0.016723 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best PR-AUC for LightGBM: 0.9929

--- Tuning XGBoost on GPU ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best PR-AUC for XGBoost: 0.9912

--- Tuning CatBoost on GPU ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best PR-AUC for CatBoost: 0.9921


In [None]:
# STACKING ENSEMBLE
print("\n--- Building and Training the Stacking Ensemble ---")
base_estimators = [
    ('lgbm', tuned_models['LightGBM']),
    ('xgb', tuned_models['XGBoost']),
    ('cat', tuned_models['CatBoost'])
]

stacking_classifier = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(class_weight='balanced'),
    cv=5,
    n_jobs=1,
    passthrough=True
)

# The stacker is trained on the same resampled data
stacking_classifier.fit(X_resampled, y_resampled)
print("Stacking Ensemble trained successfully.")


--- Building and Training the Stacking Ensemble ---
[LightGBM] [Info] Number of positive: 422088, number of negative: 527610
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12567
[LightGBM] [Info] Number of data points in the train set: 949698, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (10.87 MB) transferred to GPU in 0.016204 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 337670, number of negative: 422088
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12572
[LightGBM] [Info] Number of data points in the train set: 759758, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (8.69 MB) transferred to GPU in 0.022905 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 337670, number of negative: 422088
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12555
[LightGBM] [Info] Number of data points in the train set: 759758, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (8.69 MB) transferred to GPU in 0.012496 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 337670, number of negative: 422088
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12556
[LightGBM] [Info] Number of data points in the train set: 759758, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (8.69 MB) transferred to GPU in 0.011841 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 337671, number of negative: 422088
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12561
[LightGBM] [Info] Number of data points in the train set: 759759, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (8.69 MB) transferred to GPU in 0.012668 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 337671, number of negative: 422088
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12562
[LightGBM] [Info] Number of data points in the train set: 759759, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (8.69 MB) transferred to GPU in 0.023896 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method =

Stacking Ensemble trained successfully.


In [None]:
# FINAL MODEL EVALUATION

all_models_to_eval = {}
all_models_to_eval.update(tuned_models)
all_models_to_eval['Stacking Ensemble'] = stacking_classifier

evaluation_results = []
def evaluate_model(name, model, X_test_processed_data, y_test_data):
    y_proba = model.predict_proba(X_test_processed_data)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test_data, y_proba)
    f1 = 2 * (precision * recall) / (precision + recall)
    f1 = np.nan_to_num(f1) # Avoid division by zero issues
    best_threshold = thresholds[np.argmax(f1)]
    y_pred = (y_proba >= best_threshold).astype(int)

    print(f"\n--- Evaluation Report for: {name} ---")
    print(f"Best Threshold (for F1-Score): {best_threshold:.4f}")
    print(classification_report(y_test_data, y_pred, digits=4))

    return {
        "Model": name, "Accuracy": accuracy_score(y_test_data, y_pred),
        "Precision (Yes)": precision_score(y_test_data, y_pred, pos_label=1),
        "Recall (Yes)": recall_score(y_test_data, y_pred, pos_label=1),
        "F1-Score (Yes)": f1_score(y_test_data, y_pred, pos_label=1),
        "ROC-AUC": roc_auc_score(y_test_data, y_proba)
    }

print("\n--- Generating Final Model Leaderboard ---")
for name, model in all_models_to_eval.items():
    evaluation_results.append(evaluate_model(name, model, X_test_processed, y_test))

leaderboard = pd.DataFrame(evaluation_results).sort_values(by="F1-Score (Yes)", ascending=False)
pd.set_option('display.float_format', '{:.4f}'.format)
print("\n--- Final Performance Leaderboard ---")
print(leaderboard)

In [None]:
# EXPLAINABILITY WITH SHAP
print("\n--- Generating SHAP Explainability Plot for the Stacking Model ---")
# SHAP needs a DataFrame with feature names for plotting
feature_names = preprocessor.get_feature_names_out()
X_resampled_df = pd.DataFrame(X_resampled, columns=feature_names)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Use a sample for speed, especially with KernelExplainer
X_shap_sample = shap.sample(X_resampled_df, 150, random_state=42)
X_test_shap_sample = X_test_processed_df.sample(150, random_state=42)

explainer = shap.KernelExplainer(stacking_classifier.predict_proba, X_shap_sample)
shap_values = explainer.shap_values(X_test_shap_sample)

shap.summary_plot(shap_values[1], X_test_shap_sample, max_display=15)


--- Generating SHAP Explainability Plot for the Stacking Model ---



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


  0%|          | 0/150 [00:00<?, ?it/s]



AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [None]:
# SAVE
final_deployment_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacker', stacking_classifier)
])

# Save the single pipeline object to your Google Drive
save_path = "/content/drive/MyDrive/Bank_Marketing_Final_Model_GPU/"
os.makedirs(save_path, exist_ok=True)
joblib.dump(final_deployment_pipeline, os.path.join(save_path, "final_model_pipeline.joblib"))

print(f"\nFinal deployment pipeline saved successfully to your Google Drive folder: '{save_path}'")


Final deployment pipeline saved successfully to your Google Drive folder: '/content/drive/MyDrive/Bank_Marketing_Final_Model_GPU/'
