<a href="https://colab.research.google.com/github/huyminh1115/Trip-Advisor-Hotel-Project/blob/main/Build_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score


In [2]:
!pip install wordcloud



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare Data

In [4]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/processed_data_v2.csv")

In [5]:
data['sentiment'] = data['Rating'].apply(lambda x: 1 if x > 3 else 2 if x == 3 else 0)

def convert_to_float_array(s):
    # Bỏ dấu ngoặc rồi tách theo khoảng trắng
    s = s.strip('[]').split()
    # Chuyển từng phần tử trong danh sách thành float
    return [float(x) for x in s]


data['review_vector'] = data['review_vector'].apply(convert_to_float_array)
# Prepare data
X = np.array(data['review_vector'].tolist())
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# New Rating

## ML Models

In [None]:
from xgboost import XGBClassifier


# Danh sách các mô hình
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('SVC', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Bernoulli Naive Bayes', BernoulliNB()),
    ('XGBoost', XGBClassifier())
]

# Tạo các param_grid cho từng mô hình
param_grids = {
    'Decision Tree': {'max_depth': [3, 5, 10], 'criterion': ['gini', 'entropy']},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Random Forest': {'n_estimators': [10, 100], 'max_depth': [3, 5, 10]},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']},
    'KNeighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'BernoulliNB': {'alpha': [0.1, 1, 10]},
    'XGBoost': {'learning_rate': [0.01, 0.1, 0.3], 'n_estimators': [100, 200]}
}


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time


# accuracy = []

# for model in models:
#     cross_val = cross_val_score(model, X_train, y_train, scoring='accuracy',
#                                cv=StratifiedKFold(10)).mean()
#     accuracy.append(cross_val)



# Kết quả lưu lại
results = []

# Lặp qua từng mô hình và thực hiện GridSearchCV
for name, model in models:
    print(f"Running GridSearchCV for {name}...")

    # Lấy param_grid tương ứng
    param_grid = param_grids.get(name, {})

    # Khởi tạo GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

    # Bắt đầu đo thời gian
    start_time = time.time()

    # Thực hiện GridSearch
    grid_search.fit(X_train, y_train)

    # Dừng đo thời gian
    end_time = time.time()
    elapsed_time = end_time - start_time  # Tính thời gian chạy

    # Dự đoán trên tập huấn luyện
    y_pred = grid_search.predict(X_test)

    # Tính các chỉ số
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Lưu kết quả
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    results.append((name, best_params, best_score, elapsed_time, accuracy, f1, precision, recall))

    # In kết quả và thời gian chạy
    print(f"Best score for {name}: {best_score:.4f} with params: {best_params}")
    print(f"Time taken for {name}: {elapsed_time:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}\n")


Running GridSearchCV for Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best score for Decision Tree: 0.7942 with params: {'criterion': 'gini', 'max_depth': 5}
Time taken for Decision Tree: 56.96 seconds
Accuracy: 0.7936
F1-score: 0.7444
Precision: 0.7035
Recall: 0.7936

Running GridSearchCV for SVC...
Best score for SVC: 0.8412 with params: {'C': 10, 'kernel': 'linear'}
Time taken for SVC: 384.65 seconds
Accuracy: 0.8495
F1-score: 0.8143
Precision: 0.8247
Recall: 0.8495

Running GridSearchCV for Random Forest...
Best score for Random Forest: 0.8216 with params: {'max_depth': 10, 'n_estimators': 100}
Time taken for Random Forest: 151.38 seconds
Accuracy: 0.8251
F1-score: 0.7762
Precision: 0.7879
Recall: 0.8251

Running GridSearchCV for Logistic Regression...
Best score for Logistic Regression: 0.8444 with params: {'C': 10, 'solver': 'liblinear'}
Time taken for Logistic Regression: 46.57 seconds
Accuracy: 0.8480
F1-score: 0.8185
Precision: 0.8175
Recall: 0.8480

Running GridSearchCV for K-Nearest Neighbors...
Best score for K-Nearest Neighbors: 0.8084 with 

In [None]:
# acc = pd.DataFrame({'Model': models, 'Accuracy': accuracy})
# acc


Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier(),0.727184
1,SVC(),0.838153
2,RandomForestClassifier(),0.824488
3,LogisticRegression(max_iter=1000),0.842667
4,KNeighborsClassifier(),0.808992
5,BernoulliNB(),0.716876
6,"XGBClassifier(base_score=None, booster=None, c...",0.832358


## DL Model - MLP

In [None]:
!pip install scikeras[tensorflow] scikit-learn


Collecting scikeras[tensorflow]
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting keras>=3.2.0 (from scikeras[tensorflow])
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting tensorflow>=2.16.1 (from scikeras[tensorflow])
  Downloading tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting namex (from keras>=3.2.0->scikeras[tensorflow])
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras>=3.2.0->scikeras[tensorflow])
  Downloading optree-0.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes (from keras>=3.2.0->scikeras[tensorflow])
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboard<2.18,>=2.17 (from tensor

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


# Hàm tạo mô hình MLP
def create_mlp():
    model = Sequential()
    model.add(Dense(64, input_dim=100, activation='relu'))  # Lớp ẩn đầu tiên với 64 node và hàm kích hoạt ReLU
    model.add(Dense(64, activation='relu'))  # Lớp ẩn thứ hai với 64 node và hàm kích hoạt ReLU
    model.add(Dense(3, activation='softmax'))  # Lớp đầu ra với 3 node (tương ứng 3 lớp), hàm softmax cho phân loại
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile mô hình
    return model

# Tạo mô hình
model = create_mlp()

y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Huấn luyện mô hình
model.fit(X_train, y_train_cat, epochs=50, batch_size=10, verbose=1)

# Đánh giá mô hình trên tập kiểm tra
_, accuracy = model.evaluate(X_test, y_test_cat, verbose=1)
print(f'Accuracy: {accuracy:.2f}')

# import time
# import numpy as np
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# from sklearn.model_selection import GridSearchCV
# from scikeras.wrappers import KerasClassifier
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.utils import to_categorical

# # Hàm tạo mô hình MLP với các tham số được chuyển từ GridSearchCV
# def create_mlp(neurons=64, optimizer='adam'):
#     model = Sequential()
#     model.add(Dense(neurons, input_dim=100, activation='relu'))  # Lớp ẩn đầu tiên
#     model.add(Dense(neurons, activation='relu'))  # Lớp ẩn thứ hai
#     model.add(Dense(3, activation='softmax'))  # Lớp đầu ra cho phân loại
#     model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])  # Compile mô hình
#     return model

# # Đóng gói mô hình Keras trong KerasClassifier
# model = KerasClassifier(model=create_mlp, verbose=0)

# # Tạo lưới tham số để tinh chỉnh
# param_grid = {
#     'model__neurons': [32, 64, 128],
#     'model__optimizer': ['adam', 'rmsprop'],
#     'batch_size': [10, 20],
#     'epochs': [50, 100],
# }

# # Khởi tạo GridSearchCV
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# # Chuyển đổi nhãn thành one-hot encoding
# y_train_cat = to_categorical(y_train, num_classes=3)
# y_test_cat = to_categorical(y_test, num_classes=3)

# # Bắt đầu đo thời gian
# start_time = time.time()

# # Thực hiện GridSearchCV
# grid_search.fit(X_train, y_train_cat)

# # Kết thúc đo thời gian
# end_time = time.time()
# elapsed_time = end_time - start_time

# # Tìm ra các tham số tốt nhất
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# # Dự đoán trên tập kiểm tra
# y_pred_cat = best_model.predict(X_test)
# y_pred = np.argmax(y_pred_cat, axis=1)  # Chuyển từ one-hot về nhãn thực

# # Tính toán các chỉ số
# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred, average='weighted')
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')

# # In kết quả
# print(f"Best parameters: {best_params}")
# print(f"Time taken: {elapsed_time:.2f} seconds")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"F1-score: {f1:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8063 - loss: 0.5130
Epoch 2/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8325 - loss: 0.4302
Epoch 3/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8340 - loss: 0.4256
Epoch 4/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8408 - loss: 0.4165
Epoch 5/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8458 - loss: 0.4029
Epoch 6/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8437 - loss: 0.4027
Epoch 7/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8478 - loss: 0.3921
Epoch 8/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8411 - loss: 0.3955
Epoch 9/50
[1m1640/1640[0m [32m━

#### LSTM

In [None]:
data.head(3)

Unnamed: 0,Review,Rating,Length,Word_count,cleaned_review,sentiment,tokenized_review,review_vector
0,nice hotel expensive parking got good deal sta...,4,593,87,nice hotel expensive parking got good deal sta...,1,"['nice', 'hotel', 'expensive', 'parking', 'got...","[0.26746067, -0.39511007, 0.2275819, 0.3294193..."
1,ok nothing special charge diamond member hilto...,2,1689,250,ok nothing special charge diamond member hilto...,0,"['ok', 'nothing', 'special', 'charge', 'diamon...","[-0.13580382, -0.10084884, -0.06840305, 0.1910..."
2,nice rooms not 4* experience hotel monaco seat...,3,1427,217,nice room experience hotel monaco seattle good...,2,"['nice', 'room', 'experience', 'hotel', 'monac...","[0.12907822, -0.4086012, 0.00375678, 0.0777084..."


In [6]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



# Tách dữ liệu thành train và test
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['sentiment'], test_size=0.2, random_state=42)

In [11]:


# Tokenizer: Chuyển đổi văn bản thành các chuỗi token
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Chuyển văn bản thành số
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding để các chuỗi có cùng độ dài
max_len = 100  # Độ dài tối đa của một chuỗi
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Chuyển nhãn thành categorical (one-hot encoding)
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Xây dựng mô hình LSTM
def create_lstm_model():
    model = Sequential()
    # Lớp Embedding
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
    # LSTM Layer
    model.add(LSTM(64))
    # Lớp đầu ra
    model.add(Dense(3, activation='softmax'))
    # Compile mô hình
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Tạo mô hình
model = create_lstm_model()

# Huấn luyện mô hình và tính toán thời gian
start_time = time.time()
model.fit(X_train_padded, y_train_cat, epochs=5, batch_size=32, verbose=1, validation_data=(X_test_padded, y_test_cat))
training_time = time.time() - start_time
print(f"Thời gian huấn luyện: {training_time:.2f} giây")

# Đánh giá mô hình trên tập kiểm tra
y_test_pred_cat = model.predict(X_test_padded)
y_test_pred = y_test_pred_cat.argmax(axis=1)

# Tính toán các chỉ số
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred, average='weighted')
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')

# In kết quả
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")



Epoch 1/5




[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.7281 - loss: 0.7707 - val_accuracy: 0.7795 - val_loss: 0.6664
Epoch 2/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.7825 - loss: 0.6342 - val_accuracy: 0.7865 - val_loss: 0.6086
Epoch 3/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.7656 - loss: 0.6490 - val_accuracy: 0.7641 - val_loss: 0.5807
Epoch 4/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8294 - loss: 0.4754 - val_accuracy: 0.8395 - val_loss: 0.4524
Epoch 5/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8623 - loss: 0.3853 - val_accuracy: 0.7463 - val_loss: 0.7232
Thời gian huấn luyện: 31.27 giây
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 0.7463
F1-score: 0.6596
Precision: 0.6596
Recall: 0.7463


### BERT

In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00

In [None]:
!pip install -U tensorflow transformers


Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.1

In [15]:
import time
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import Dataset
from torch.nn import functional as F



# Khởi tạo tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize dữ liệu
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")

# Chuyển dữ liệu thành Dataset dạng Hugging Face
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': torch.tensor(y_train.values)})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': torch.tensor(y_test.values)})

# Khởi tạo mô hình BERT cho phân loại
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Hàm đánh giá
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(F.softmax(torch.tensor(logits), dim=-1), dim=1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Thiết lập tham số huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Huấn luyện và tính toán thời gian
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

start_time = time.time()
trainer.train()
training_time = time.time() - start_time

# Đánh giá mô hình
eval_results = trainer.evaluate()

# In kết quả
print(f"Thời gian huấn luyện: {training_time:.2f} giây")
print(f"Accuracy: {eval_results['eval_accuracy']:.2f}")
print(f"F1-Score: {eval_results['eval_f1']:.2f}")
print(f"Precision: {eval_results['eval_precision']:.2f}")
print(f"Recall: {eval_results['eval_recall']:.2f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3919,0.658046,0.839717,0.798238,0.765257,0.839717
2,0.1735,0.584161,0.854599,0.835927,0.826275,0.854599
3,0.6069,0.621382,0.854599,0.824346,0.820957,0.854599


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Thời gian huấn luyện: 1878.66 giây
Accuracy: 0.85
F1-Score: 0.82
Precision: 0.82
Recall: 0.85


# Old Rating

In [19]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'data' is your DataFrame and 'Rating' is your target column
y_old = data['Rating']

# Create a LabelEncoder object
le = LabelEncoder()

# Fit the encoder to your target variable and transform it
y_old = le.fit_transform(y_old)


X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(X, y_old, test_size=0.2, random_state=42)


## ML Models

In [None]:

# models = [DecisionTreeClassifier(),
#           SVC(),
#           RandomForestClassifier(),
#           KNeighborsClassifier(),
#           XGBClassifier()]

# accuracy = []

# for model in models:
#     cross_val = cross_val_score(model, X_old_train, y_old_train, scoring='accuracy',
#                                cv=StratifiedKFold(10)).mean()
#     accuracy.append(cross_val)

from xgboost import XGBClassifier


# Danh sách các mô hình
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('SVC', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Bernoulli Naive Bayes', BernoulliNB()),
    ('XGBoost', XGBClassifier())
]

# Tạo các param_grid cho từng mô hình
param_grids = {
    'Decision Tree': {'max_depth': [3, 5, 10], 'criterion': ['gini', 'entropy']},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Random Forest': {'n_estimators': [10, 100], 'max_depth': [3, 5, 10]},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']},
    'KNeighbors': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'BernoulliNB': {'alpha': [0.1, 1, 10]},
    'XGBoost': {'learning_rate': [0.01, 0.1, 0.3], 'n_estimators': [100, 200]}
}


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import time


# accuracy = []

# for model in models:
#     cross_val = cross_val_score(model, X_train, y_train, scoring='accuracy',
#                                cv=StratifiedKFold(10)).mean()
#     accuracy.append(cross_val)



# Kết quả lưu lại
results = []

# Lặp qua từng mô hình và thực hiện GridSearchCV
for name, model in models:
    print(f"Running GridSearchCV for {name}...")

    # Lấy param_grid tương ứng
    param_grid = param_grids.get(name, {})

    # Khởi tạo GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

    # Bắt đầu đo thời gian
    start_time = time.time()

    # Thực hiện GridSearch
    grid_search.fit(X_old_train, y_old_train)

    # Dừng đo thời gian
    end_time = time.time()
    elapsed_time = end_time - start_time  # Tính thời gian chạy

    # Dự đoán trên tập huấn luyện
    y_pred = grid_search.predict(X_old_test)

    # Tính các chỉ số
    accuracy = accuracy_score(y_old_test, y_pred)
    f1 = f1_score(y_old_test, y_pred, average='weighted')
    precision = precision_score(y_old_test, y_pred, average='weighted')
    recall = recall_score(y_old_test, y_pred, average='weighted')

    # Lưu kết quả
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    results.append((name, best_params, best_score, elapsed_time, accuracy, f1, precision, recall))

    # In kết quả và thời gian chạy
    print(f"Best score for {name}: {best_score:.4f} with params: {best_params}")
    print(f"Time taken for {name}: {elapsed_time:.2f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}\n")


Running GridSearchCV for Decision Tree...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best score for Decision Tree: 0.5068 with params: {'criterion': 'gini', 'max_depth': 5}
Time taken for Decision Tree: 53.31 seconds
Accuracy: 0.5111
F1-score: 0.4829
Precision: 0.4579
Recall: 0.5111

Running GridSearchCV for SVC...
Best score for SVC: 0.6015 with params: {'C': 1, 'kernel': 'linear'}
Time taken for SVC: 787.33 seconds
Accuracy: 0.6133
F1-score: 0.5986
Precision: 0.6025
Recall: 0.6133

Running GridSearchCV for Random Forest...
Best score for Random Forest: 0.5550 with params: {'max_depth': 10, 'n_estimators': 100}
Time taken for Random Forest: 151.41 seconds
Accuracy: 0.5618
F1-score: 0.5175
Precision: 0.5190
Recall: 0.5618

Running GridSearchCV for Logistic Regression...
Best score for Logistic Regression: 0.6034 with params: {'C': 10, 'solver': 'lbfgs'}
Time taken for Logistic Regression: 94.70 seconds
Accuracy: 0.6170
F1-score: 0.6032
Precision: 0.6025
Recall: 0.6170

Running GridSearchCV for K-Nearest Neighbors...
Best score for K-Nearest Neighbors: 0.5099 with param

In [None]:
# acc = pd.DataFrame({'Model': models, 'Accuracy': accuracy})
# acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier(),0.4393
1,SVC(),0.599622
2,RandomForestClassifier(),0.558506
3,KNeighborsClassifier(),0.507747
4,"XGBClassifier(base_score=None, booster=None, c...",0.566924


## DL Model - MLP

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


# Hàm tạo mô hình MLP
def create_mlp():
    model = Sequential()
    model.add(Dense(64, input_dim=100, activation='relu'))  # Lớp ẩn đầu tiên với 64 node và hàm kích hoạt ReLU
    model.add(Dense(64, activation='relu'))  # Lớp ẩn thứ hai với 64 node và hàm kích hoạt ReLU
    model.add(Dense(5, activation='softmax'))  # Lớp đầu ra với 3 node (tương ứng 3 lớp), hàm softmax cho phân loại
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile mô hình
    return model

# Tạo mô hình
model = create_mlp()

y_old_train_cat = to_categorical(y_old_train, num_classes=5)
y_old_test_cat = to_categorical(y_old_test, num_classes=5)

# Huấn luyện mô hình
model.fit(X_old_train, y_old_train_cat, epochs=50, batch_size=10, verbose=1)

# Đánh giá mô hình trên tập kiểm tra
_, accuracy = model.evaluate(X_old_test, y_old_test_cat, verbose=1)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5215 - loss: 1.0753
Epoch 2/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5852 - loss: 0.9330
Epoch 3/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5997 - loss: 0.9107
Epoch 4/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5952 - loss: 0.9022
Epoch 5/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6017 - loss: 0.9063
Epoch 6/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5992 - loss: 0.8952
Epoch 7/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5954 - loss: 0.8966
Epoch 8/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6130 - loss: 0.8876
Epoch 9/50
[1m1640/1640

### LSTM

In [20]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



# Tách dữ liệu thành train và test
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], y_old, test_size=0.2, random_state=42)

In [21]:


# Tokenizer: Chuyển đổi văn bản thành các chuỗi token
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Chuyển văn bản thành số
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding để các chuỗi có cùng độ dài
max_len = 100  # Độ dài tối đa của một chuỗi
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Chuyển nhãn thành categorical (one-hot encoding)
y_train_cat = to_categorical(y_train, num_classes=5)
y_test_cat = to_categorical(y_test, num_classes=5)

# Xây dựng mô hình LSTM
def create_lstm_model():
    model = Sequential()
    # Lớp Embedding
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
    # LSTM Layer
    model.add(LSTM(64))
    # Lớp đầu ra
    model.add(Dense(5, activation='softmax'))
    # Compile mô hình
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Tạo mô hình
model = create_lstm_model()

# Huấn luyện mô hình và tính toán thời gian
start_time = time.time()
model.fit(X_train_padded, y_train_cat, epochs=5, batch_size=32, verbose=1, validation_data=(X_test_padded, y_test_cat))
training_time = time.time() - start_time
print(f"Thời gian huấn luyện: {training_time:.2f} giây")

# Đánh giá mô hình trên tập kiểm tra
y_test_pred_cat = model.predict(X_test_padded)
y_test_pred = y_test_pred_cat.argmax(axis=1)

# Tính toán các chỉ số
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred, average='weighted')
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')

# In kết quả
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")



Epoch 1/5




[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.4416 - loss: 1.3618 - val_accuracy: 0.4860 - val_loss: 1.1987
Epoch 2/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.4680 - loss: 1.2508 - val_accuracy: 0.4830 - val_loss: 1.1220
Epoch 3/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.5320 - loss: 1.0622 - val_accuracy: 0.5301 - val_loss: 1.0142
Epoch 4/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.6043 - loss: 0.9131 - val_accuracy: 0.5745 - val_loss: 0.9696
Epoch 5/5
[1m513/513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6695 - loss: 0.7746 - val_accuracy: 0.5980 - val_loss: 0.9481
Thời gian huấn luyện: 37.33 giây
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 0.5980
F1-score: 0.5878
Precision: 0.5838
Recall: 0.5980


### BERT

In [23]:
import time
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import Dataset
from torch.nn import functional as F



# Khởi tạo tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize dữ liệu
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")

# Chuyển dữ liệu thành Dataset dạng Hugging Face
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': torch.tensor(y_train)})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': torch.tensor(y_test)})

# Khởi tạo mô hình BERT cho phân loại
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Hàm đánh giá
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(F.softmax(torch.tensor(logits), dim=-1), dim=1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Thiết lập tham số huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Huấn luyện và tính toán thời gian
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

start_time = time.time()
trainer.train()
training_time = time.time() - start_time

# Đánh giá mô hình
eval_results = trainer.evaluate()

# In kết quả
print(f"Thời gian huấn luyện: {training_time:.2f} giây")
print(f"Accuracy: {eval_results['eval_accuracy']:.2f}")
print(f"F1-Score: {eval_results['eval_f1']:.2f}")
print(f"Precision: {eval_results['eval_precision']:.2f}")
print(f"Recall: {eval_results['eval_recall']:.2f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6879,0.89429,0.646743,0.62486,0.634975,0.646743
2,0.3632,0.832124,0.659185,0.647434,0.661091,0.659185
3,0.5694,1.133409,0.66626,0.663996,0.664054,0.66626


Thời gian huấn luyện: 1890.86 giây
Accuracy: 0.67
F1-Score: 0.66
Precision: 0.66
Recall: 0.67
