<a href="https://colab.research.google.com/github/huyminh1115/Trip-Advisor-Hotel-Project/blob/main/Code/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
import torch
if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!nvidia-smi

GPU is not available.
/bin/bash: line 1: nvidia-smi: command not found


In [3]:
df = pd.read_csv('./processed_data.csv')
df.head()

Unnamed: 0,Review,Rating,words,word_count,tokenized,tokenized_words,new_rating,label
0,nice hotel expensive parking got good deal sta...,4,"['nice', 'hotel', 'expensive', 'parking', 'got...",87,nice hotel expensive parking get good deal sta...,"['nice', 'hotel', 'expensive', 'parking', 'get...",average,2
1,ok nothing special charge diamond member hilto...,2,"['ok', 'nothing', 'special', 'charge', 'diamon...",250,ok nothing special charge diamond member hilto...,"['ok', 'nothing', 'special', 'charge', 'diamon...",poor,0
2,nice rooms not 4* experience hotel monaco seat...,3,"['nice', 'rooms', 'not', '4*', 'experience', '...",217,nice room not experience hotel monaco seattle ...,"['nice', 'room', 'not', 'experience', 'hotel',...",average,1
3,"unique, great stay, wonderful time hotel monac...",5,"['unique,', 'great', 'stay,', 'wonderful', 'ti...",89,unique great stay wonderful time hotel monaco ...,"['unique', 'great', 'stay', 'wonderful', 'time...",excellent,2
4,"great stay great stay, went seahawk game aweso...",5,"['great', 'stay', 'great', 'stay,', 'went', 's...",191,great stay great stay go seahawk game awesome ...,"['great', 'stay', 'great', 'stay', 'go', 'seah...",excellent,2


# Raw data

## With old rating

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['Rating'], test_size=0.2)

In [None]:
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train)
test_tfid_matrix = tfid.transform(X_test)

In [None]:
models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, train_tfid_matrix, y_train, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)


In [None]:
models_name = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC',
         'LogisticRegression', 'KNeighborsClassifier', 'BernoulliNB']

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.413797
1,RandomForestClassifier,0.504384
2,SVC,0.589211
3,LogisticRegression,0.59039
4,KNeighborsClassifier,0.453069
5,BernoulliNB,0.457908


## With new rating

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['new_rating'], test_size=0.2)

In [None]:
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train)
test_tfid_matrix = tfid.transform(X_test)

In [None]:
models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [None]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, train_tfid_matrix, y_train, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)


In [None]:
models_name = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC',
         'LogisticRegression', 'KNeighborsClassifier', 'BernoulliNB']

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.545755
1,RandomForestClassifier,0.670755
2,SVC,0.733774
3,LogisticRegression,0.730784
4,KNeighborsClassifier,0.576011
5,BernoulliNB,0.589678


# Processed data

In [None]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(df['tokenized'], df['Rating'], test_size=0.2)

In [None]:
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train_2)
test_tfid_matrix = tfid.transform(X_test_2)

In [None]:
models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [None]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, train_tfid_matrix, y_train_2, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)


In [None]:
models_name = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC',
         'LogisticRegression', 'KNeighborsClassifier', 'BernoulliNB']

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.413797
1,RandomForestClassifier,0.504384
2,SVC,0.589211
3,LogisticRegression,0.59039
4,KNeighborsClassifier,0.453069
5,BernoulliNB,0.457908


## With new rating

In [8]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(df['tokenized'], df['new_rating'], test_size=0.2)

In [9]:
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train_2)
test_tfid_matrix = tfid.transform(X_test_2)

In [10]:
models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [11]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, train_tfid_matrix, y_train_2, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)


In [12]:
models_name = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC',
         'LogisticRegression', 'KNeighborsClassifier', 'BernoulliNB']

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.545205
1,RandomForestClassifier,0.666057
2,SVC,0.73054
3,LogisticRegression,0.728405
4,KNeighborsClassifier,0.575097
5,BernoulliNB,0.598646
