In [1]:
import os
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tabulate import tabulate
from sklearn.model_selection import cross_val_score


**Load The Data**

In [37]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data.csv')

preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

**Data Spliting**

In [39]:
x_train = preprocessed_train_data.drop(["satisfaction"], axis = 1)
y_train = preprocessed_train_data["satisfaction"]

x_test = preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = preprocessed_test_data["satisfaction"]

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

**Create a list of classifiers**

In [17]:
classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Multi-layer Perceptron", MLPClassifier()),
    ("XGBoost", XGBClassifier()),
    ("CatBoost", CatBoostClassifier()), 
    ("AdaBoost", AdaBoostClassifier())
]

In [16]:
# Initialize an empty DataFrame to store the results
results_df = pd.DataFrame(columns=["Classifier", "Balanced Accuracy", "Training Accuracy", "Validation Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])


**Train and evaluate each classifier**

In [21]:
for name, clf in classifiers:
    print(name)
    
    # Fit the classifier on the entire training set
    clf.fit(x_train, y_train)
    
    # Calculate training accuracy
    accuracy_train = accuracy_score(y_train, clf.predict(x_train))
    
    # Calculate validation accuracy using k-fold cross-validation
    accuracy_validation = cross_val_score(clf, x_train, y_train, cv=k_fold, scoring='accuracy').mean()
    
    y_prob = clf.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    # Make predictions on the test set
    y_pred = clf.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Create a temporary Series with results for this classifier
    new_row = pd.Series({
        "Classifier": name,
        "Balanced Accuracy": auc_score,
        "Training Accuracy": accuracy_train,
        "Validation Accuracy": accuracy_validation,
        "Testing Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Append the Series as a new row to the results DataFrame
    results_df = pd.concat([results_df, new_row.to_frame().T], ignore_index=True)

# Sort the DataFrame by Balanced Accuracy in descending order
results_df = results_df.sort_values(by="Balanced Accuracy", ascending=False)

Logistic Regression
Decision Tree
Random Forest
Gradient Boosting
K-Nearest Neighbors
Gaussian Naive Bayes
Multi-layer Perceptron




XGBoost
CatBoost
Learning rate set to 0.074823
0:	learn: 0.5925347	total: 501ms	remaining: 8m 20s
1:	learn: 0.4853586	total: 544ms	remaining: 4m 31s
2:	learn: 0.4290905	total: 579ms	remaining: 3m 12s
3:	learn: 0.3833252	total: 613ms	remaining: 2m 32s
4:	learn: 0.3513727	total: 650ms	remaining: 2m 9s
5:	learn: 0.3089257	total: 684ms	remaining: 1m 53s
6:	learn: 0.2883640	total: 736ms	remaining: 1m 44s
7:	learn: 0.2711136	total: 784ms	remaining: 1m 37s
8:	learn: 0.2577476	total: 821ms	remaining: 1m 30s
9:	learn: 0.2402199	total: 860ms	remaining: 1m 25s
10:	learn: 0.2314451	total: 894ms	remaining: 1m 20s
11:	learn: 0.2185457	total: 927ms	remaining: 1m 16s
12:	learn: 0.2103959	total: 977ms	remaining: 1m 14s
13:	learn: 0.2041481	total: 1.02s	remaining: 1m 12s
14:	learn: 0.1956322	total: 1.06s	remaining: 1m 9s
15:	learn: 0.1903867	total: 1.1s	remaining: 1m 7s
16:	learn: 0.1835245	total: 1.13s	remaining: 1m 5s
17:	learn: 0.1753302	total: 1.17s	remaining: 1m 3s
18:	learn: 0.1715682	total: 1.22s

In [22]:
# Print the results table with styling
styled_results = results_df.style.background_gradient(cmap='Blues', subset=["Balanced Accuracy", "Training Accuracy", "Validation Accuracy", "Testing Accuracy", "F1 Score", "Precision", "Recall"])

In [10]:
'''
CatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy
'''

'\nCatBoost achieved the highest accuracy, F1 Score & Balanced Accuracy\n'

**Binominal Naive Bayes**

scale train and test data

In [40]:
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

Train the MultinomialNB classifier

In [41]:
# Create an instance of the classifier
clf = MultinomialNB()
clf.fit(x_train_scaled, y_train)
y_pred = clf.predict(x_test_scaled)
y_prob = clf.predict_proba(x_test_scaled)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)
accuracy_train = accuracy_score(y_train, clf.predict(x_train_scaled))
accuracy_test = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

accuracy_validation = cross_val_score(clf, x_train_scaled, y_train, cv=k_fold, scoring='accuracy').mean()
    
print("Balanced Accuracy: ", auc_score)
print("Training Accuracy: ", accuracy_train)
print("Testing Accuracy: ", accuracy_test)
print("Validation Accuracy: ", accuracy_validation)
print("F1 Score: ", f1)
print("Precision: ", precision)
print("Recall: ", recall)

Balanced Accuracy:  0.8741036230929793
Training Accuracy:  0.7680455035417308
Testing Accuracy:  0.7649034093153716
Validation Accuracy:  0.7680455332217699
F1 Score:  0.7330791657322269
Precision:  0.7187335092348285
Recall:  0.7480091533180778


**Binominal Naive Bayes with applying grouping on columns with continous values**

Load the data

In [3]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data_after_grouping.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data_after_grouping.csv')

new_preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
new_preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

In [4]:
x_train = new_preprocessed_train_data.drop(["satisfaction"], axis = 1)
y_train = new_preprocessed_train_data["satisfaction"]

x_test = new_preprocessed_test_data.drop(["satisfaction"], axis = 1)
y_test = new_preprocessed_test_data["satisfaction"]

combine the data

Split the data again to the train and test sets

scale train and test data

In [5]:
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

Train the MultinomialNB classifier

In [6]:
# Create an instance of the classifier
clf = MultinomialNB()
clf.fit(x_train_scaled, y_train)
y_pred = clf.predict(x_test_scaled)
y_prob = clf.predict_proba(x_test_scaled)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)
accuracy_train = accuracy_score(y_train, clf.predict(x_train_scaled))
accuracy_test = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

accuracy_validation = cross_val_score(clf, x_train_scaled, y_train, cv=k_fold, scoring='accuracy').mean()
    
print("Balanced Accuracy: ", auc_score)
print("Training Accuracy: ", accuracy_train)
print("Testing Accuracy: ", accuracy_test)
print("Validation Accuracy: ", accuracy_validation)
print("F1 Score: ", f1)
print("Precision: ", precision)
print("Recall: ", recall)

Balanced Accuracy:  0.8660401815904305
Training Accuracy:  0.7687865722205113
Testing Accuracy:  0.7656145063801209
Validation Accuracy:  0.7687288538824919
F1 Score:  0.7336236699142458
Precision:  0.7199506520972858
Recall:  0.7478260869565218
