<a href="https://colab.research.google.com/github/huyminh1115/Trip-Advisor-Hotel-Project/blob/main/Code/Build_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare Data

In [3]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/processed_data_v2.csv")

In [4]:
data['sentiment'] = data['Rating'].apply(lambda x: 1 if x > 3 else 2 if x == 3 else 0)

def convert_to_float_array(s):
    # Bỏ dấu ngoặc rồi tách theo khoảng trắng
    s = s.strip('[]').split()
    # Chuyển từng phần tử trong danh sách thành float
    return [float(x) for x in s]


data['review_vector'] = data['review_vector'].apply(convert_to_float_array)
# Prepare data
X = np.array(data['review_vector'].tolist())
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# New Rating

## ML Models

In [17]:
from xgboost import XGBClassifier


models = [DecisionTreeClassifier(),
          SVC(),
          RandomForestClassifier(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB(),
          XGBClassifier()]

In [18]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, X_train, y_train, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)

In [19]:
acc = pd.DataFrame({'Model': models, 'Accuracy': accuracy})
acc


Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier(),0.727184
1,SVC(),0.838153
2,RandomForestClassifier(),0.824488
3,LogisticRegression(max_iter=1000),0.842667
4,KNeighborsClassifier(),0.808992
5,BernoulliNB(),0.716876
6,"XGBClassifier(base_score=None, booster=None, c...",0.832358


## DL Model - MLP

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


# Hàm tạo mô hình MLP
def create_mlp():
    model = Sequential()
    model.add(Dense(64, input_dim=100, activation='relu'))  # Lớp ẩn đầu tiên với 64 node và hàm kích hoạt ReLU
    model.add(Dense(64, activation='relu'))  # Lớp ẩn thứ hai với 64 node và hàm kích hoạt ReLU
    model.add(Dense(3, activation='softmax'))  # Lớp đầu ra với 3 node (tương ứng 3 lớp), hàm softmax cho phân loại
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile mô hình
    return model

# Tạo mô hình
model = create_mlp()

y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Huấn luyện mô hình
model.fit(X_train, y_train_cat, epochs=50, batch_size=10, verbose=1)

# Đánh giá mô hình trên tập kiểm tra
_, accuracy = model.evaluate(X_test, y_test_cat, verbose=1)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8086 - loss: 0.5100
Epoch 2/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8297 - loss: 0.4388
Epoch 3/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8389 - loss: 0.4176
Epoch 4/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8442 - loss: 0.3982
Epoch 5/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8445 - loss: 0.4050
Epoch 6/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8450 - loss: 0.3985
Epoch 7/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8477 - loss: 0.3955
Epoch 8/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8503 - loss: 0.3882
Epoch 9/50
[1m1640/1640

# Old Rating

In [28]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'data' is your DataFrame and 'Rating' is your target column
y_old = data['Rating']

# Create a LabelEncoder object
le = LabelEncoder()

# Fit the encoder to your target variable and transform it
y_old = le.fit_transform(y_old)


X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(X, y_old, test_size=0.2, random_state=42)


## ML Models

In [29]:

models = [DecisionTreeClassifier(),
          SVC(),
          RandomForestClassifier(),
          KNeighborsClassifier(),
          XGBClassifier()]

accuracy = []

for model in models:
    cross_val = cross_val_score(model, X_old_train, y_old_train, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)

In [30]:
acc = pd.DataFrame({'Model': models, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier(),0.4393
1,SVC(),0.599622
2,RandomForestClassifier(),0.558506
3,KNeighborsClassifier(),0.507747
4,"XGBClassifier(base_score=None, booster=None, c...",0.566924


## DL Model - MLP

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


# Hàm tạo mô hình MLP
def create_mlp():
    model = Sequential()
    model.add(Dense(64, input_dim=100, activation='relu'))  # Lớp ẩn đầu tiên với 64 node và hàm kích hoạt ReLU
    model.add(Dense(64, activation='relu'))  # Lớp ẩn thứ hai với 64 node và hàm kích hoạt ReLU
    model.add(Dense(5, activation='softmax'))  # Lớp đầu ra với 3 node (tương ứng 3 lớp), hàm softmax cho phân loại
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile mô hình
    return model

# Tạo mô hình
model = create_mlp()

y_old_train_cat = to_categorical(y_old_train, num_classes=5)
y_old_test_cat = to_categorical(y_old_test, num_classes=5)

# Huấn luyện mô hình
model.fit(X_old_train, y_old_train_cat, epochs=50, batch_size=10, verbose=1)

# Đánh giá mô hình trên tập kiểm tra
_, accuracy = model.evaluate(X_old_test, y_old_test_cat, verbose=1)
print(f'Accuracy: {accuracy:.2f}')

Epoch 1/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5215 - loss: 1.0753
Epoch 2/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5852 - loss: 0.9330
Epoch 3/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5997 - loss: 0.9107
Epoch 4/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5952 - loss: 0.9022
Epoch 5/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6017 - loss: 0.9063
Epoch 6/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5992 - loss: 0.8952
Epoch 7/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5954 - loss: 0.8966
Epoch 8/50
[1m1640/1640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6130 - loss: 0.8876
Epoch 9/50
[1m1640/1640