<a href="https://colab.research.google.com/github/huyminh1115/Trip-Advisor-Hotel-Project/blob/main/Code/Final_ML_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
!pip install wordcloud



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare data

In [4]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/processed_data_v2.csv")

In [5]:
data.head()

Unnamed: 0,Review,Rating,Length,Word_count,cleaned_review,sentiment,tokenized_review,review_vector
0,nice hotel expensive parking got good deal sta...,4,593,87,nice hotel expensive parking got good deal sta...,0.208744,"['nice', 'hotel', 'expensive', 'parking', 'got...",[ 0.26746067 -0.39511007 0.2275819 0.329419...
1,ok nothing special charge diamond member hilto...,2,1689,250,ok nothing special charge diamond member hilto...,0.248633,"['ok', 'nothing', 'special', 'charge', 'diamon...",[-0.13580382 -0.10084884 -0.06840305 0.191020...
2,nice rooms not 4* experience hotel monaco seat...,3,1427,217,nice room experience hotel monaco seattle good...,0.29442,"['nice', 'room', 'experience', 'hotel', 'monac...",[ 0.12907822 -0.4086012 0.00375678 0.077708...
3,"unique, great stay, wonderful time hotel monac...",5,600,89,unique great stay wonderful time hotel monaco ...,0.504825,"['unique', 'great', 'stay', 'wonderful', 'time...",[ 0.14176802 -0.31248108 0.11944489 0.082050...
4,"great stay great stay, went seahawk game aweso...",5,1281,191,great stay great stay went seahawk game awesom...,0.469643,"['great', 'stay', 'great', 'stay', 'went', 'se...",[-0.13615511 -0.08891882 -0.0077215 0.260447...


In [6]:
data['sentiment'] = data['Rating'].apply(lambda x: 1 if x > 3 else 2 if x == 3 else 0)

def convert_to_float_array(s):
    # Bỏ dấu ngoặc rồi tách theo khoảng trắng
    s = s.strip('[]').split()
    # Chuyển từng phần tử trong danh sách thành float
    return [float(x) for x in s]


data['review_vector'] = data['review_vector'].apply(convert_to_float_array)
# Prepare data
X = np.array(data['review_vector'].tolist())
y = data['sentiment']

In [7]:
# Initialize 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Save the indices of each fold for reuse
fold_indices = [(train_idx, test_idx) for train_idx, test_idx in kf.split(X)]

In [22]:
from xgboost import XGBClassifier
# Danh sách các mô hình
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('SVC', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Bernoulli Naive Bayes', BernoulliNB()),
]

# Tạo các param_grid cho từng mô hình
param_grids = {
    'Decision Tree': {'max_depth': [3, 5, 10], 'criterion': ['gini', 'entropy']},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Random Forest': {'n_estimators': [10, 100], 'max_depth': [3, 5, 10]},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'Bernoulli Naive Bayes': {'alpha': [0.1, 1, 10]},
}


In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time

# Store results for each fold and model
results = {name: [] for name, _ in models}

# Perform 10-fold cross-validation manually
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nRunning Fold {fold_idx}...")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for name, model in models:
        print(f"Running GridSearchCV for {name} (Fold {fold_idx})...")

        # Initialize GridSearchCV
        param_grid = param_grids.get(name, {})
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                                   cv=5, n_jobs=-1, scoring='accuracy')

        # Measure time
        start_time = time.time()
        grid_search.fit(X_train, y_train)
        elapsed_time = time.time() - start_time

        # Predictions and metrics
        y_pred = grid_search.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        # Store results for this fold
        results[name].append({
            "Fold": fold_idx,
            "Best Params": grid_search.best_params_,
            "Best Score": grid_search.best_score_,
            "Time": elapsed_time,
            "Accuracy": accuracy,
            "F1-score": f1,
            "Precision": precision,
            "Recall": recall
        })

        # Print fold results
        print(f"Fold {fold_idx} - {name} Results:")
        print(f"  Best Score: {grid_search.best_score_:.4f}")
        print(f"  Best Params: {grid_search.best_params_}")
        print(f"  Accuracy: {accuracy:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
        print(f"  Time Taken: {elapsed_time:.2f} seconds\n")

# Calculate average metrics for each model
for name in results:
    print(f"\nAveraged Results for {name}:")
    metrics = {k: np.mean([r[k] for r in results[name]]) for k in ["Accuracy", "F1-score", "Precision", "Recall", "Best Score"]}
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")
    print(f"  F1-score: {metrics['F1-score']:.4f}")
    print(f"  Precision: {metrics['Precision']:.4f}")
    print(f"  Recall: {metrics['Recall']:.4f}\n")
    print(f"  Best Score: {metrics['Best Score']:.4f}\n")


Running Fold 1...
Running GridSearchCV for Decision Tree (Fold 1)...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 - Decision Tree Results:
  Best Score: 0.7927
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Accuracy: 0.8020, F1: 0.7496, Precision: 0.7102, Recall: 0.8020
  Time Taken: 56.42 seconds

Running GridSearchCV for SVC (Fold 1)...
Fold 1 - SVC Results:
  Best Score: 0.8411
  Best Params: {'C': 10, 'kernel': 'linear'}
  Accuracy: 0.8537, F1: 0.8216, Precision: 0.8352, Recall: 0.8537
  Time Taken: 502.21 seconds

Running GridSearchCV for Random Forest (Fold 1)...
Fold 1 - Random Forest Results:
  Best Score: 0.8210
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Accuracy: 0.8273, F1: 0.7777, Precision: 0.7902, Recall: 0.8273
  Time Taken: 173.19 seconds

Running GridSearchCV for Logistic Regression (Fold 1)...
Fold 1 - Logistic Regression Results:
  Best Score: 0.8426
  Best Params: {'C': 10, 'solver': 'lbfgs'}
  Accuracy: 0.8571, F1: 0.8341, Precision: 0.8352, Recall: 0.8571
  Time Taken: 53.04 seconds

Running GridSearchCV for K-Nearest Neighbors (Fold 1)...
Fold 1 -

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 - Decision Tree Results:
  Best Score: 0.7933
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Accuracy: 0.7931, F1: 0.7460, Precision: 0.7050, Recall: 0.7931
  Time Taken: 50.51 seconds

Running GridSearchCV for SVC (Fold 2)...


KeyboardInterrupt: 

In [23]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time

# Store results for each fold and model
results = {name: [] for name, _ in models}

# Perform 10-fold cross-validation manually
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nRunning Fold {fold_idx}...")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for name, model in models:
        print(f"Running GridSearchCV for {name} (Fold {fold_idx})...")

        # Initialize GridSearchCV
        param_grid = param_grids.get(name, {})
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                                   cv=5, n_jobs=-1, scoring='accuracy')

        # Measure time
        start_time = time.time()
        grid_search.fit(X_train, y_train)
        elapsed_time = time.time() - start_time

        # Predictions and metrics on training set
        y_train_pred = grid_search.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred, average='weighted')
        train_precision = precision_score(y_train, y_train_pred, average='weighted')
        train_recall = recall_score(y_train, y_train_pred, average='weighted')

        # Predictions and metrics on test set
        y_test_pred = grid_search.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')
        test_precision = precision_score(y_test, y_test_pred, average='weighted')
        test_recall = recall_score(y_test, y_test_pred, average='weighted')

        # Store results for this fold
        results[name].append({
            "Fold": fold_idx,
            "Best Params": grid_search.best_params_,
            "Best Score": grid_search.best_score_,
            "Time": elapsed_time,
            "Accuracy": accuracy,
            "F1-score": f1,
            "Precision": precision,
            "Recall": recall
        })

        # Store results for this fold
        results[name].append({
            "Fold": fold_idx,
            "Best Params": grid_search.best_params_,
            "Best Score": grid_search.best_score_,
            "Time": elapsed_time,
            "Train Accuracy": train_accuracy,
            "Train F1-score": train_f1,
            "Train Precision": train_precision,
            "Train Recall": train_recall,
            "Test Accuracy": test_accuracy,
            "Test F1-score": test_f1,
            "Test Precision": test_precision,
            "Test Recall": test_recall,
        })

        # Print fold results
        print(f"Fold {fold_idx} - {name} Results:")
        print(f"  Best Score: {grid_search.best_score_:.4f}")
        print(f"  Best Params: {grid_search.best_params_}")
        print(f"  Train -> Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}")
        print(f"  Test  -> Accuracy: {test_accuracy:.4f}, F1: {test_f1:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
        print(f"  Time Taken: {elapsed_time:.2f} seconds\n")


Running Fold 1...
Running GridSearchCV for Decision Tree (Fold 1)...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 - Decision Tree Results:
  Best Score: 0.7926
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8057, F1: 0.7540, Precision: 0.7147, Recall: 0.8057
  Test  -> Accuracy: 0.8020, F1: 0.7496, Precision: 0.7102, Recall: 0.8020
  Time Taken: 54.44 seconds

Running GridSearchCV for SVC (Fold 1)...
Fold 1 - SVC Results:
  Best Score: 0.8411
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8474, F1: 0.8132, Precision: 0.8233, Recall: 0.8474
  Test  -> Accuracy: 0.8537, F1: 0.8216, Precision: 0.8352, Recall: 0.8537
  Time Taken: 493.46 seconds

Running GridSearchCV for Random Forest (Fold 1)...
Fold 1 - Random Forest Results:
  Best Score: 0.8206
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8908, F1: 0.8702, Precision: 0.9006, Recall: 0.8908
  Test  -> Accuracy: 0.8302, F1: 0.7799, Precision: 0.7403, Recall: 0.8302
  Time Taken: 174.76 seconds

Running GridSearchCV for Logistic Regression (Fold 1)...
Fold 1 - Lo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 - Decision Tree Results:
  Best Score: 0.7933
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8062, F1: 0.7569, Precision: 0.7156, Recall: 0.8062
  Test  -> Accuracy: 0.7931, F1: 0.7460, Precision: 0.7050, Recall: 0.7931
  Time Taken: 50.85 seconds

Running GridSearchCV for SVC (Fold 2)...
Fold 2 - SVC Results:
  Best Score: 0.8426
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8483, F1: 0.8150, Precision: 0.8250, Recall: 0.8483
  Test  -> Accuracy: 0.8429, F1: 0.8060, Precision: 0.8086, Recall: 0.8429
  Time Taken: 469.71 seconds

Running GridSearchCV for Random Forest (Fold 2)...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 - Random Forest Results:
  Best Score: 0.8221
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8911, F1: 0.8705, Precision: 0.9006, Recall: 0.8911
  Test  -> Accuracy: 0.8248, F1: 0.7755, Precision: 0.7349, Recall: 0.8248
  Time Taken: 170.64 seconds

Running GridSearchCV for Logistic Regression (Fold 2)...
Fold 2 - Logistic Regression Results:
  Best Score: 0.8433
  Best Params: {'C': 10, 'solver': 'lbfgs'}
  Train -> Accuracy: 0.8492, F1: 0.8271, Precision: 0.8249, Recall: 0.8492
  Test  -> Accuracy: 0.8472, F1: 0.8255, Precision: 0.8234, Recall: 0.8472
  Time Taken: 54.28 seconds

Running GridSearchCV for K-Nearest Neighbors (Fold 2)...
Fold 2 - K-Nearest Neighbors Results:
  Best Score: 0.8116
  Best Params: {'n_neighbors': 7, 'weights': 'uniform'}
  Train -> Accuracy: 0.8405, F1: 0.8079, Precision: 0.8187, Recall: 0.8405
  Test  -> Accuracy: 0.8160, F1: 0.7758, Precision: 0.7611, Recall: 0.8160
  Time Taken: 23.54 seconds

Running GridSearchCV fo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3 - Decision Tree Results:
  Best Score: 0.7950
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8054, F1: 0.7561, Precision: 0.7145, Recall: 0.8054
  Test  -> Accuracy: 0.7984, F1: 0.7499, Precision: 0.7094, Recall: 0.7984
  Time Taken: 49.58 seconds

Running GridSearchCV for SVC (Fold 3)...
Fold 3 - SVC Results:
  Best Score: 0.8425
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8490, F1: 0.8155, Precision: 0.8252, Recall: 0.8490
  Test  -> Accuracy: 0.8419, F1: 0.8099, Precision: 0.8143, Recall: 0.8419
  Time Taken: 478.36 seconds

Running GridSearchCV for Random Forest (Fold 3)...
Fold 3 - Random Forest Results:
  Best Score: 0.8202
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8937, F1: 0.8740, Precision: 0.9030, Recall: 0.8937
  Test  -> Accuracy: 0.8199, F1: 0.7708, Precision: 0.7664, Recall: 0.8199
  Time Taken: 175.77 seconds

Running GridSearchCV for Logistic Regression (Fold 3)...
Fold 3 - Lo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5 - Decision Tree Results:
  Best Score: 0.7938
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8035, F1: 0.7527, Precision: 0.7130, Recall: 0.8035
  Test  -> Accuracy: 0.7814, F1: 0.7255, Precision: 0.6825, Recall: 0.7814
  Time Taken: 49.97 seconds

Running GridSearchCV for SVC (Fold 5)...
Fold 5 - SVC Results:
  Best Score: 0.8419
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8481, F1: 0.8141, Precision: 0.8215, Recall: 0.8481
  Test  -> Accuracy: 0.8380, F1: 0.8029, Precision: 0.8195, Recall: 0.8380
  Time Taken: 460.33 seconds

Running GridSearchCV for Random Forest (Fold 5)...
Fold 5 - Random Forest Results:
  Best Score: 0.8220
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8927, F1: 0.8724, Precision: 0.9022, Recall: 0.8927
  Test  -> Accuracy: 0.8170, F1: 0.7643, Precision: 0.7974, Recall: 0.8170
  Time Taken: 167.35 seconds

Running GridSearchCV for Logistic Regression (Fold 5)...
Fold 5 - Lo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 6 - Decision Tree Results:
  Best Score: 0.7944
  Best Params: {'criterion': 'entropy', 'max_depth': 5}
  Train -> Accuracy: 0.8019, F1: 0.7497, Precision: 0.7110, Recall: 0.8019
  Test  -> Accuracy: 0.7989, F1: 0.7461, Precision: 0.7098, Recall: 0.7989
  Time Taken: 50.56 seconds

Running GridSearchCV for SVC (Fold 6)...
Fold 6 - SVC Results:
  Best Score: 0.8422
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8478, F1: 0.8154, Precision: 0.8235, Recall: 0.8478
  Test  -> Accuracy: 0.8492, F1: 0.8168, Precision: 0.8148, Recall: 0.8492
  Time Taken: 480.66 seconds

Running GridSearchCV for Random Forest (Fold 6)...
Fold 6 - Random Forest Results:
  Best Score: 0.8209
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8913, F1: 0.8716, Precision: 0.9009, Recall: 0.8913
  Test  -> Accuracy: 0.8243, F1: 0.7747, Precision: 0.7367, Recall: 0.8243
  Time Taken: 173.31 seconds

Running GridSearchCV for Logistic Regression (Fold 6)...
Fold 6 -

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 7 - Decision Tree Results:
  Best Score: 0.7990
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8095, F1: 0.7621, Precision: 0.7824, Recall: 0.8095
  Test  -> Accuracy: 0.7892, F1: 0.7396, Precision: 0.6987, Recall: 0.7892
  Time Taken: 50.62 seconds

Running GridSearchCV for SVC (Fold 7)...
Fold 7 - SVC Results:
  Best Score: 0.8445
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8505, F1: 0.8194, Precision: 0.8281, Recall: 0.8505
  Test  -> Accuracy: 0.8355, F1: 0.8015, Precision: 0.7973, Recall: 0.8355
  Time Taken: 479.54 seconds

Running GridSearchCV for Random Forest (Fold 7)...
Fold 7 - Random Forest Results:
  Best Score: 0.8213
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8925, F1: 0.8721, Precision: 0.9021, Recall: 0.8925
  Test  -> Accuracy: 0.8145, F1: 0.7635, Precision: 0.7249, Recall: 0.8145
  Time Taken: 168.78 seconds

Running GridSearchCV for Logistic Regression (Fold 7)...
Fold 7 - Lo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 8 - Decision Tree Results:
  Best Score: 0.7949
  Best Params: {'criterion': 'entropy', 'max_depth': 5}
  Train -> Accuracy: 0.8009, F1: 0.7497, Precision: 0.7103, Recall: 0.8009
  Test  -> Accuracy: 0.7882, F1: 0.7338, Precision: 0.6928, Recall: 0.7882
  Time Taken: 51.71 seconds

Running GridSearchCV for SVC (Fold 8)...
Fold 8 - SVC Results:
  Best Score: 0.8429
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8494, F1: 0.8171, Precision: 0.8258, Recall: 0.8494
  Test  -> Accuracy: 0.8370, F1: 0.7996, Precision: 0.7966, Recall: 0.8370
  Time Taken: 455.28 seconds

Running GridSearchCV for Random Forest (Fold 8)...
Fold 8 - Random Forest Results:
  Best Score: 0.8214
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8934, F1: 0.8738, Precision: 0.9028, Recall: 0.8934
  Test  -> Accuracy: 0.8184, F1: 0.7680, Precision: 0.7614, Recall: 0.8184
  Time Taken: 171.99 seconds

Running GridSearchCV for Logistic Regression (Fold 8)...
Fold 8 -

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 9 - Decision Tree Results:
  Best Score: 0.7960
  Best Params: {'criterion': 'entropy', 'max_depth': 5}
  Train -> Accuracy: 0.8060, F1: 0.7576, Precision: 0.7164, Recall: 0.8060
  Test  -> Accuracy: 0.7877, F1: 0.7372, Precision: 0.6940, Recall: 0.7877
  Time Taken: 49.30 seconds

Running GridSearchCV for SVC (Fold 9)...
Fold 9 - SVC Results:
  Best Score: 0.8424
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8482, F1: 0.8140, Precision: 0.8229, Recall: 0.8482
  Test  -> Accuracy: 0.8404, F1: 0.8077, Precision: 0.8169, Recall: 0.8404
  Time Taken: 472.26 seconds

Running GridSearchCV for Random Forest (Fold 9)...
Fold 9 - Random Forest Results:
  Best Score: 0.8230
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8927, F1: 0.8731, Precision: 0.9021, Recall: 0.8927
  Test  -> Accuracy: 0.8165, F1: 0.7639, Precision: 0.7224, Recall: 0.8165
  Time Taken: 174.62 seconds

Running GridSearchCV for Logistic Regression (Fold 9)...
Fold 9 -

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 10 - Decision Tree Results:
  Best Score: 0.7924
  Best Params: {'criterion': 'gini', 'max_depth': 5}
  Train -> Accuracy: 0.8057, F1: 0.7577, Precision: 0.7158, Recall: 0.8057
  Test  -> Accuracy: 0.8111, F1: 0.7675, Precision: 0.7290, Recall: 0.8111
  Time Taken: 50.20 seconds

Running GridSearchCV for SVC (Fold 10)...
Fold 10 - SVC Results:
  Best Score: 0.8407
  Best Params: {'C': 10, 'kernel': 'linear'}
  Train -> Accuracy: 0.8463, F1: 0.8132, Precision: 0.8210, Recall: 0.8463
  Test  -> Accuracy: 0.8560, F1: 0.8269, Precision: 0.8294, Recall: 0.8560
  Time Taken: 469.60 seconds

Running GridSearchCV for Random Forest (Fold 10)...
Fold 10 - Random Forest Results:
  Best Score: 0.8199
  Best Params: {'max_depth': 10, 'n_estimators': 100}
  Train -> Accuracy: 0.8918, F1: 0.8724, Precision: 0.9016, Recall: 0.8918
  Test  -> Accuracy: 0.8355, F1: 0.7889, Precision: 0.7525, Recall: 0.8355
  Time Taken: 168.46 seconds

Running GridSearchCV for Logistic Regression (Fold 10)...
Fold 

KeyError: 'Train Accuracy'

In [31]:
# Calculate and display average metrics for each model
for name in results:
    print(f"\nAveraged Results for {name}:")

    # Compute average metrics across all folds
    avg_metrics = {
    metric: np.mean([
        result[metric] for result in results[name]
        if metric in result  # Ensure the key exists
    ])
    for metric in [
        "Train Accuracy", "Train F1-score", "Train Precision", "Train Recall",
        "Test Accuracy", "Test F1-score", "Test Precision", "Test Recall", "Best Score"
    ]
    }

    # Display average results
    print(f"  Train -> Accuracy: {avg_metrics['Train Accuracy']:.4f}, "
          f"F1: {avg_metrics['Train F1-score']:.4f}, "
          f"Precision: {avg_metrics['Train Precision']:.4f}, "
          f"Recall: {avg_metrics['Train Recall']:.4f}")

    print(f"  Test  -> Accuracy: {avg_metrics['Test Accuracy']:.4f}, "
          f"F1: {avg_metrics['Test F1-score']:.4f}, "
          f"Precision: {avg_metrics['Test Precision']:.4f}, "
          f"Recall: {avg_metrics['Test Recall']:.4f}")

    print(f"  Best Score: {avg_metrics['Best Score']:.4f}\n")


Averaged Results for Decision Tree:
  Train -> Accuracy: 0.8050, F1: 0.7557, Precision: 0.7256, Recall: 0.8050
  Test  -> Accuracy: 0.7946, F1: 0.7447, Precision: 0.7044, Recall: 0.7946
  Best Score: 0.7944


Averaged Results for SVC:
  Train -> Accuracy: 0.8484, F1: 0.8153, Precision: 0.8240, Recall: 0.8484
  Test  -> Accuracy: 0.8438, F1: 0.8104, Precision: 0.8140, Recall: 0.8438
  Best Score: 0.8424


Averaged Results for Random Forest:
  Train -> Accuracy: 0.8922, F1: 0.8723, Precision: 0.9018, Recall: 0.8922
  Test  -> Accuracy: 0.8222, F1: 0.7722, Precision: 0.7496, Recall: 0.8222
  Best Score: 0.8212


Averaged Results for Logistic Regression:
  Train -> Accuracy: 0.8486, F1: 0.8242, Precision: 0.8235, Recall: 0.8486
  Test  -> Accuracy: 0.8449, F1: 0.8202, Precision: 0.8175, Recall: 0.8449
  Best Score: 0.8439


Averaged Results for K-Nearest Neighbors:
  Train -> Accuracy: 0.8400, F1: 0.8070, Precision: 0.8169, Recall: 0.8400
  Test  -> Accuracy: 0.8146, F1: 0.7745, Precision