In [15]:
import os
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tabulate import tabulate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


**Load The Data**

In [16]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data.csv')

preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

**Data Spliting**

In [17]:
# Split the preprocessed data into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(
    preprocessed_train_data.drop(["satisfaction"], axis=1),  # Features for training
    preprocessed_train_data["satisfaction"],  # Labels for training
    test_size=0.2,  # Ratio of validation set size to total data size
    random_state=42  # Random seed for reproducibility
)

x_test = preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = preprocessed_test_data["satisfaction"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Create a list of classifiers**

In [18]:
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Multi-layer Perceptron", MLPClassifier()),
    ("XGBoost", XGBClassifier()),
    ("CatBoost", CatBoostClassifier()), 
    ("AdaBoost", AdaBoostClassifier())
]

In [19]:
# Initialize an empty DataFrame to store the results
results_df = pd.DataFrame(columns=["Classifier", "Balanced Accuracy", "Training Accuracy", "Validation Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])


**Train and evaluate each classifier**

In [20]:
for name, clf in classifiers:
    print(name)
    
    # Fit the classifier on the entire training set
    clf.fit(x_train, y_train)
    
    # Calculate training accuracy
    accuracy_train = accuracy_score(y_train, clf.predict(x_train))
    
    # Calculate validation accuracy using k-fold cross-validation
    #accuracy_validation = cross_val_score(clf, x_train, y_train, cv=k_fold, scoring='accuracy').mean()
    accuracy_validation = accuracy_score(y_valid, clf.predict(x_valid))
    y_prob = clf.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    # Make predictions on the test set
    y_pred = clf.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Create a temporary Series with results for this classifier
    new_row = pd.Series({
        "Classifier": name,
        "Balanced Accuracy": auc_score,
        "Training Accuracy": accuracy_train,
        "Validation Accuracy": accuracy_validation,
        "Testing Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Append the Series as a new row to the results DataFrame
    results_df = pd.concat([results_df, new_row.to_frame().T], ignore_index=True)

# Sort the DataFrame by Balanced Accuracy in descending order
results_df = results_df.sort_values(by="Balanced Accuracy", ascending=False)

Logistic Regression
Decision Tree
Random Forest
Gradient Boosting
K-Nearest Neighbors
Gaussian Naive Bayes
Multi-layer Perceptron




XGBoost
CatBoost
Learning rate set to 0.068023
0:	learn: 0.6023763	total: 1.41s	remaining: 23m 31s
1:	learn: 0.5079179	total: 1.49s	remaining: 12m 24s
2:	learn: 0.4523337	total: 1.61s	remaining: 8m 54s
3:	learn: 0.4072575	total: 1.64s	remaining: 6m 48s
4:	learn: 0.3558689	total: 1.68s	remaining: 5m 33s
5:	learn: 0.3287185	total: 1.71s	remaining: 4m 42s
6:	learn: 0.3066777	total: 1.74s	remaining: 4m 6s
7:	learn: 0.2903898	total: 1.77s	remaining: 3m 39s
8:	learn: 0.2724946	total: 1.8s	remaining: 3m 17s
9:	learn: 0.2594546	total: 1.83s	remaining: 3m
10:	learn: 0.2487500	total: 1.86s	remaining: 2m 47s
11:	learn: 0.2395237	total: 1.91s	remaining: 2m 37s
12:	learn: 0.2311329	total: 1.95s	remaining: 2m 28s
13:	learn: 0.2239853	total: 1.99s	remaining: 2m 20s
14:	learn: 0.2150287	total: 2.02s	remaining: 2m 12s
15:	learn: 0.2085634	total: 2.05s	remaining: 2m 6s
16:	learn: 0.2020303	total: 2.1s	remaining: 2m 1s
17:	learn: 0.1971781	total: 2.14s	remaining: 1m 56s
18:	learn: 0.1913298	total: 2.18s	

In [21]:
# Print the results table with styling
styled_results = results_df.style.background_gradient(cmap='Blues', subset=["Balanced Accuracy", "Training Accuracy", "Validation Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])

In [22]:
styled_results

Unnamed: 0,Classifier,Balanced Accuracy,Training Accuracy,Validation Accuracy,Testing Accuracy,F1 Score,Precision,Recall
8,CatBoost,0.994653,0.974387,0.962321,0.963102,0.956518,0.973283,0.94032
6,Multi-layer Perceptron,0.993183,0.967265,0.957942,0.955477,0.947788,0.959568,0.936293
2,Random Forest,0.992909,0.999964,0.962851,0.961048,0.95411,0.970552,0.938215
3,Gradient Boosting,0.987125,0.941027,0.941581,0.940821,0.930209,0.947244,0.913776
9,AdaBoost,0.975683,0.925941,0.925365,0.925216,0.912015,0.92644,0.898032
7,XGBoost,0.9726,0.974832,0.962273,0.874965,0.834648,0.972249,0.731167
4,K-Nearest Neighbors,0.971995,0.951566,0.932294,0.931103,0.91751,0.9493,0.88778
1,Decision Tree,0.94039,1.0,0.944517,0.941414,0.932181,0.931457,0.932906
0,Logistic Regression,0.923516,0.87391,0.875271,0.871726,0.848264,0.866527,0.830755
5,Gaussian Naive Bayes,0.913085,0.848947,0.848997,0.844388,0.817088,0.829218,0.805309


In [10]:
'''
CatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy
'''

'\nCatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy\n'

**MultinomialNB Naive Bayes**

scale train and test data

In [23]:
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_valid_scaled = scaler.transform(x_valid)

Train the MultinomialNB classifier

In [24]:
# Create an instance of the classifier
clf = MultinomialNB()
clf.fit(x_train_scaled, y_train)
y_pred = clf.predict(x_test_scaled)
y_prob = clf.predict_proba(x_test_scaled)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)
accuracy_train = accuracy_score(y_train, clf.predict(x_train_scaled))
accuracy_test = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

#accuracy_validation = cross_val_score(clf, x_train_scaled, y_train, cv=k_fold, scoring='accuracy').mean()
accuracy_validation = accuracy_score(y_valid, clf.predict(x_valid_scaled))   
print("Balanced Accuracy: ", auc_score)
print("Training Accuracy: ", accuracy_train)
print("Testing Accuracy: ", accuracy_test)
print("Validation Accuracy: ", accuracy_validation)
print("F1 Score: ", f1)
print("Precision: ", precision)
print("Recall: ", recall)

Balanced Accuracy:  0.8741034831339871
Training Accuracy:  0.7691493329162807
Testing Accuracy:  0.7649034093153716
Validation Accuracy:  0.7636302391607719
F1 Score:  0.7330791657322269
Precision:  0.7187335092348285
Recall:  0.7480091533180778


**MultinomialNB Naive Bayes with applying grouping on columns with continous values**

Load the data

In [25]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data_after_grouping.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data_after_grouping.csv')

new_preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
new_preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

In [26]:
# Split the preprocessed data into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(
    preprocessed_train_data.drop(["satisfaction"], axis=1),  # Features for training
    preprocessed_train_data["satisfaction"],  # Labels for training
    test_size=0.2,  # Ratio of validation set size to total data size
    random_state=42  # Random seed for reproducibility
)

x_test = preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = preprocessed_test_data["satisfaction"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

scale train and test data

In [27]:
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_valid_scaled = scaler.transform(x_valid)

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

Train the MultinomialNB classifier

In [28]:
# Create an instance of the classifier
clf = MultinomialNB()
clf.fit(x_train_scaled, y_train)
y_pred = clf.predict(x_test_scaled)
y_prob = clf.predict_proba(x_test_scaled)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)
accuracy_train = accuracy_score(y_train, clf.predict(x_train_scaled))
accuracy_test = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

#accuracy_validation = cross_val_score(clf, x_train_scaled, y_train, cv=k_fold, scoring='accuracy').mean()
accuracy_validation = accuracy_score(y_valid, clf.predict(x_valid_scaled))
    
print("Balanced Accuracy: ", auc_score)
print("Training Accuracy: ", accuracy_train)
print("Testing Accuracy: ", accuracy_test)
print("Validation Accuracy: ", accuracy_validation)
print("F1 Score: ", f1)
print("Precision: ", precision)
print("Recall: ", recall)

Balanced Accuracy:  0.8741034831339871
Training Accuracy:  0.7691493329162807
Testing Accuracy:  0.7649034093153716
Validation Accuracy:  0.7636302391607719
F1 Score:  0.7330791657322269
Precision:  0.7187335092348285
Recall:  0.7480091533180778
