In [34]:
# Core
import numpy as np
import pandas as pd

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix
)


In [35]:
data_df = pd.read_csv("bank.csv")
data_df.shape

(11162, 17)

In [36]:
data_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [37]:
data_df = pd.read_csv("bank.csv")
dataset_source = "Kaggle (https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset/data)" 
n_samples = data_df.shape[0]
n_features = data_df.shape[1]

In [38]:
display(data_df.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [39]:
data_df["deposit"].value_counts()

deposit
no     5873
yes    5289
Name: count, dtype: int64

In [40]:
data_df.shape

(11162, 17)

In [41]:
data_df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [42]:
data_df["deposit"] = data_df["deposit"].map({"yes": 1, "no": 0})

In [43]:
target = "deposit"

features = list(data_df.columns)
features.remove(target)

In [44]:
features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [11]:
X = data_df[features]
y = data_df[target]

print(X.shape, y.shape)


(11162, 16) (11162,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [13]:
categorical_cols = data_df.select_dtypes(include=["object", "category"]).columns.tolist()

from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le


In [14]:
categorical_cols 

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [15]:
features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [16]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    # Probability needed for AUC
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = model.decision_function(X_test)

    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
        "TP": tp,
        "FP": fp,
        "FN": fn,
        "TN": tn
    }

    return metrics


In [17]:
log_reg_pl = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])

log_reg_pl.fit(X_train, y_train)
lgr_metrics = evaluate_model(log_reg_pl, X_test, y_test)
lgr_metrics


{'Accuracy': 0.7971339005821765,
 'AUC': 0.8729348831597151,
 'Precision': 0.7956989247311828,
 'Recall': 0.7693761814744802,
 'F1 Score': 0.7823161941374339,
 'MCC': 0.5927735737935427,
 'TP': np.int64(814),
 'FP': np.int64(209),
 'FN': np.int64(244),
 'TN': np.int64(966)}

In [18]:
dt = DecisionTreeClassifier(
    max_depth=5,
    random_state=42
)

dt.fit(X_train, y_train)
dt_metrics = evaluate_model(dt, X_test, y_test)
dt_metrics


{'Accuracy': 0.8069861173309449,
 'AUC': 0.8733024976873266,
 'Precision': 0.7619047619047619,
 'Recall': 0.8620037807183365,
 'F1 Score': 0.8088691796008869,
 'MCC': 0.6202138848318366,
 'TP': np.int64(912),
 'FP': np.int64(285),
 'FN': np.int64(146),
 'TN': np.int64(890)}

In [19]:
knn_pl = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5))
])

knn_pl.fit(X_train, y_train)
knn_metrics = evaluate_model(knn_pl, X_test, y_test)
knn_metrics

{'Accuracy': 0.7899686520376176,
 'AUC': 0.8520387724731529,
 'Precision': 0.796576032225579,
 'Recall': 0.7476370510396976,
 'F1 Score': 0.7713310580204779,
 'MCC': 0.5784811934309146,
 'TP': np.int64(791),
 'FP': np.int64(202),
 'FN': np.int64(267),
 'TN': np.int64(973)}

In [20]:
nb = GaussianNB()

nb.fit(X_train, y_train)
nb_metrics = evaluate_model(nb, X_test, y_test)
nb_metrics

{'Accuracy': 0.7514554411106136,
 'AUC': 0.8109528214616095,
 'Precision': 0.7215859030837004,
 'Recall': 0.774102079395085,
 'F1 Score': 0.746922024623803,
 'MCC': 0.5045412758760058,
 'TP': np.int64(819),
 'FP': np.int64(316),
 'FN': np.int64(239),
 'TN': np.int64(859)}

In [21]:
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)
rf_metrics = evaluate_model(rf, X_test, y_test)
rf_metrics


{'Accuracy': 0.8557993730407524,
 'AUC': 0.9157917387282308,
 'Precision': 0.8262411347517731,
 'Recall': 0.8809073724007561,
 'F1 Score': 0.8526989935956084,
 'MCC': 0.7131558008789023,
 'TP': np.int64(932),
 'FP': np.int64(196),
 'FN': np.int64(126),
 'TN': np.int64(979)}

In [22]:
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)
xgb_metrics = evaluate_model(xgb, X_test, y_test)
xgb_metrics


{'Accuracy': 0.8571428571428571,
 'AUC': 0.9264135462333588,
 'Precision': 0.8319856244384546,
 'Recall': 0.8752362948960303,
 'F1 Score': 0.8530631045601106,
 'MCC': 0.7151072494153867,
 'TP': np.int64(926),
 'FP': np.int64(187),
 'FN': np.int64(132),
 'TN': np.int64(988)}

In [23]:
results = pd.DataFrame.from_dict({
    "Logistic Regression": lgr_metrics,
    "Decision Tree": dt_metrics,
    "KNN": knn_metrics,
    "Naive Bayes": nb_metrics,
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics
}, orient="index")

results


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC,TP,FP,FN,TN
Logistic Regression,0.797134,0.872935,0.795699,0.769376,0.782316,0.592774,814,209,244,966
Decision Tree,0.806986,0.873302,0.761905,0.862004,0.808869,0.620214,912,285,146,890
KNN,0.789969,0.852039,0.796576,0.747637,0.771331,0.578481,791,202,267,973
Naive Bayes,0.751455,0.810953,0.721586,0.774102,0.746922,0.504541,819,316,239,859
Random Forest,0.855799,0.915792,0.826241,0.880907,0.852699,0.713156,932,196,126,979
XGBoost,0.857143,0.926414,0.831986,0.875236,0.853063,0.715107,926,187,132,988


In [24]:
results.to_csv("model_comparison.csv")

In [25]:
import joblib
import os

os.makedirs("model", exist_ok=True)

joblib.dump(label_encoders, "model/label_encoders.pkl")
joblib.dump(log_reg_pl, "model/logistic_regression.pkl")
joblib.dump(dt, "model/decision_tree.pkl")
joblib.dump(knn_pl, "model/knn.pkl")
joblib.dump(nb, "model/naive_bayes.pkl")
joblib.dump(rf, "model/random_forest.pkl")
joblib.dump(xgb, "model/xgboost.pkl")


['model/xgboost.pkl']

In [26]:
train_df = X_train.copy()
reverse_map = {1: "yes", 0: "no"}
train_df[target] = pd.Series(y_train).map(reverse_map)
train_df.to_csv("train.csv")

In [27]:
test_df = pd.DataFrame(X_test)
test_df.to_csv("test.csv")