In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, matthews_corrcoef)

Load the Dataset

In [5]:
data = pd.read_csv('Bank_Marketing_data_cleaned.csv')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


Classification Models Implementation

In [6]:
# Preprocessing
X = data.drop('y', axis=1)
y = data['y'].map({'yes': 1, 'no': 0})

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability estimates for AUC

    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    return accuracy, auc_score, precision, recall, f1, mcc

# Initialize models
models = {
    "Logistic Regression": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ]),
    "Decision Tree": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=42))
    ]),
    "K-Nearest Neighbors": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier())
    ]),
    "Naive Bayes": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianNB())
    ]),
    "Random Forest": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]),
    "XGBoost": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ])
}

# Evaluate each model
results = {}
for model_name, model in models.items():
    results[model_name] = evaluate_model(model, X_train, X_test, y_train, y_test)

# Print the evaluation metrics for each model
for model_name, metrics in results.items():
    print(f"{model_name}:")
    print(f"  Accuracy: {metrics[0]:.4f}")
    print(f"  AUC Score: {metrics[1]:.4f}")
    print(f"  Precision: {metrics[2]:.4f}")
    print(f"  Recall: {metrics[3]:.4f}")
    print(f"  F1 Score: {metrics[4]:.4f}")
    print(f"  MCC Score: {metrics[5]:.4f}")
    print()

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Logistic Regression:
  Accuracy: 0.9060
  AUC Score: 0.9306
  Precision: 0.6661
  Recall: 0.4068
  F1 Score: 0.5051
  MCC Score: 0.4735

Decision Tree:
  Accuracy: 0.8878
  AUC Score: 0.7352
  Precision: 0.5237
  Recall: 0.5355
  F1 Score: 0.5295
  MCC Score: 0.4659

K-Nearest Neighbors:
  Accuracy: 0.8983
  AUC Score: 0.8619
  Precision: 0.5982
  Recall: 0.4171
  F1 Score: 0.4915
  MCC Score: 0.4457

Naive Bayes:
  Accuracy: 0.8120
  AUC Score: 0.8308
  Precision: 0.3435
  Recall: 0.6519
  F1 Score: 0.4499
  MCC Score: 0.3756

Random Forest:
  Accuracy: 0.9105
  AUC Score: 0.9378
  Precision: 0.6806
  Recall: 0.4542
  F1 Score: 0.5448
  MCC Score: 0.5099

XGBoost:
  Accuracy: 0.9097
  AUC Score: 0.9442
  Precision: 0.6421
  Recall: 0.5283
  F1 Score: 0.5797
  MCC Score: 0.5328

