# Sentiment Prediction of Tripadvisor Hotel Reviews
* Inspired by https://www.kaggle.com/shahraizanwar/hotel-reviews-sentiment-prediction
* Using pandas and TensorFlow


In [62]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [53]:
# Load the Tripadvisor hotel review dataset from the project repository.
DATA_URL = 'https://raw.githubusercontent.com/innovad/ml-playground/main/tripadvisor_hotel_reviews.csv'
data = pd.read_csv(DATA_URL)

# Quick sanity figures. Each label is printed together with its value so the
# value cannot end up detached from its label in the cell output.
print('Number of lines: ' + str(len(data)))
print('Possible rating numbers: ' + str(data['Rating'].unique()))

# Word count per review, splitting on single spaces; cells are stringified
# first so that non-string values do not raise.
max_words = max(data["Review"].apply(lambda x: len(str(x).split(' '))))
print('Max words in Review: ' + str(max_words))

# Preview of the first rows; as the last expression of the cell it is the
# value the notebook renders.
data.head()

Number of lines: 20491
Possible rating numbers: [4 2 3 5 1]
Max words in Review: 

1933

In [None]:
# Turn every review into a sequence of integer word indices, report a few
# figures about the result, then pad all sequences to a common length.
reviewData = data["Review"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviewData)
review_sequences = tokenizer.texts_to_sequences(reviewData)

example = 2
# One more than the number of entries in word_index
# (presumably because index 0 is reserved for padding - confirm in Keras docs).
vocab_size = 1 + len(tokenizer.word_index)
longest_sequence = max(map(len, review_sequences))

print("Vocabulary size: {}".format(vocab_size))
print("max length of sentence: {}".format(longest_sequence))
print("Example Sentence: {}".format(reviewData[example]))
print("After tokenizing : {}".format(review_sequences[example]))

# Bring every sequence to length 350, adding the padding at the end.
reviewDataTokenized = pad_sequences(review_sequences, maxlen=350, padding='post')
print("After padding : \n{}".format(reviewDataTokenized[example]))

In [88]:
# Map the star ratings 1..5 onto the class indices 0..4.
encoding = {rating: rating - 1 for rating in range(1, 6)}

labels = ['1', '2', '3', '4', '5']

# replace() returns a new Series, so data['Rating'] itself is left untouched.
y = data['Rating'].replace(encoding)

In [89]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the padded reviews for testing. The split is stratified on
# the encoded rating and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviewDataTokenized, y, test_size=0.33, random_state=67, stratify=y
)

# Report the size of each of the four parts, one per line.
for part in (X_train, X_test, y_train, y_test):
    print(len(part))

13728
6763
13728
6763


In [90]:
import tensorflow.keras.layers as L
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# hyper parameters
EPOCHS = 3
BATCH_SIZE = 100
embedding_dim = 16
units = 76

# All padded reviews share one length; it fixes the width of the model input.
sequence_length = reviewDataTokenized.shape[1]

# Embedding -> bidirectional LSTM -> 1D convolution + pooling -> dense
# classifier head with one softmax output per rating class.
model = tf.keras.Sequential([
    # Declaring the input explicitly builds the model right away (so that
    # summary() works) without the deprecated Embedding(input_length=...).
    tf.keras.Input(shape=(sequence_length,), dtype="int32"),
    L.Embedding(vocab_size, embedding_dim),
    L.Bidirectional(L.LSTM(units, return_sequences=True)),
    L.Conv1D(64, 3),
    L.MaxPool1D(),
    L.Flatten(),
    L.Dropout(0.5),
    L.Dense(128, activation="relu"),
    L.Dropout(0.5),
    L.Dense(64, activation="relu"),
    L.Dropout(0.5),
    L.Dense(5, activation="softmax"),
])

# Sparse categorical cross-entropy because the targets are integer class
# indices (0..4), not one-hot vectors.
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=Adam(),
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 350, 16)           835392    
                                                                 
 bidirectional_3 (Bidirectio  (None, 350, 152)         56544     
 nal)                                                            
                                                                 
 conv1d_3 (Conv1D)           (None, 348, 64)           29248     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 174, 64)          0         
 1D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 11136)             0         
                                                                 
 dropout_9 (Dropout)         (None, 11136)            

In [None]:
# Train the model on the training split, passing validation_split=0.12 so
# part of it is held back for validation; the returned object is kept.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.12,
    verbose=2,
)

Epoch 1/3
