In [1]:
import os
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tabulate import tabulate

**Load The Data**

In [2]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data.csv')

preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

**Data Spliting**

In [3]:
x_train = preprocessed_train_data.drop(["satisfaction"], axis = 1)
y_train = preprocessed_train_data["satisfaction"]

x_test = preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = preprocessed_test_data["satisfaction"]

y_test = y_test.replace({"satisfied":1, "neutral or dissatisfied":0})

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Create a list of classifiers**

In [4]:
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Multi-layer Perceptron", MLPClassifier()),
    ("XGBoost", XGBClassifier()),
    ("CatBoost", CatBoostClassifier()), 
    ("AdaBoost", AdaBoostClassifier())
]

In [5]:
# Initialize an empty DataFrame to store the results
results_df = pd.DataFrame(columns=["Classifier", "Balanced Accuracy", "Training Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])

**Train and evaluate each classifier**

In [6]:
for name, clf in classifiers:
    print(name)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_prob = clf.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    accuracy_train = accuracy_score(y_train, clf.predict(x_train))
    accuracy_test = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Create a temporary Series with results for this classifier
    new_row = pd.Series({
        "Classifier": name,
        "Balanced Accuracy": auc_score,
        "Training Accuracy": accuracy_train,
        "Testing Accuracy": accuracy_test,
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Append the Series as a new row to the results DataFrame
    results_df = pd.concat([results_df, new_row.to_frame().T], ignore_index=True)

# Sort the DataFrame by Balanced Accuracy in descending order
results_df = results_df.sort_values(by="Balanced Accuracy", ascending=False)

# Print the results table with styling
styled_results = results_df.style.background_gradient(cmap='Blues', subset=["Balanced Accuracy", "Training Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])
#styled_results = styled_results.hide_index() 

Logistic Regression
Decision Tree
Random Forest
Gradient Boosting
K-Nearest Neighbors
Gaussian Naive Bayes
Multi-layer Perceptron




XGBoost
CatBoost
Learning rate set to 0.074823
0:	learn: 0.5925347	total: 80.5ms	remaining: 1m 20s
1:	learn: 0.4853586	total: 114ms	remaining: 56.7s
2:	learn: 0.4290905	total: 139ms	remaining: 46.2s
3:	learn: 0.3833252	total: 163ms	remaining: 40.6s
4:	learn: 0.3513727	total: 189ms	remaining: 37.5s
5:	learn: 0.3089257	total: 221ms	remaining: 36.7s
6:	learn: 0.2883640	total: 249ms	remaining: 35.3s
7:	learn: 0.2711136	total: 277ms	remaining: 34.3s
8:	learn: 0.2577476	total: 307ms	remaining: 33.9s
9:	learn: 0.2402199	total: 333ms	remaining: 33s
10:	learn: 0.2314451	total: 358ms	remaining: 32.2s
11:	learn: 0.2185457	total: 383ms	remaining: 31.5s
12:	learn: 0.2103959	total: 413ms	remaining: 31.4s
13:	learn: 0.2041481	total: 443ms	remaining: 31.2s
14:	learn: 0.1956322	total: 466ms	remaining: 30.6s
15:	learn: 0.1903867	total: 493ms	remaining: 30.3s
16:	learn: 0.1835245	total: 523ms	remaining: 30.3s
17:	learn: 0.1753302	total: 549ms	remaining: 29.9s
18:	learn: 0.1715682	total: 573ms	remaining: 

In [9]:
styled_results

Unnamed: 0,Classifier,Balanced Accuracy,Training Accuracy,Testing Accuracy,F1 Score,Precision,Recall
8,CatBoost,0.994832,0.973658,0.962786,0.956157,0.972635,0.940229
6,Multi-layer Perceptron,0.993274,0.968192,0.958796,0.951558,0.965868,0.937666
2,Random Forest,0.993101,1.0,0.961403,0.954535,0.970844,0.938764
3,Gradient Boosting,0.987055,0.940878,0.941216,0.930771,0.946447,0.915606
9,AdaBoost,0.975945,0.925566,0.924347,0.911297,0.922449,0.900412
7,XGBoost,0.973716,0.972744,0.867025,0.821432,0.976909,0.70865
4,K-Nearest Neighbors,0.972847,0.953139,0.932367,0.919161,0.949283,0.890892
1,Decision Tree,0.941447,1.0,0.942441,0.933376,0.932566,0.934188
0,Logistic Regression,0.923548,0.874192,0.871568,0.847978,0.866826,0.829931
5,Gaussian Naive Bayes,0.913202,0.849515,0.84506,0.817768,0.830424,0.805492


In [10]:
'''
CatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy
'''

'\nCatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy\n'