In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten, Embedding, Bidirectional, Input
from tensorflow.keras import Sequential
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('/content/tripadvisor_hotel_reviews.csv')
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


Clean the text by removing links, symbols and similiar..

In [3]:
def get_clean_text(x):
    x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x) 
    #regex to remove to emails(above)
    x = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)
    #regex to remove URLs
    x = re.sub('RT', "", x)
    #substitute the 'RT' retweet tags with empty spaces
    x = re.sub('[^A-Z a-z]+', '', x)
    return x
data['Review'] = data['Review'].apply(lambda x:get_clean_text(x))
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not experience hotel monaco seattl...,3
3,unique great stay wonderful time hotel monaco ...,5
4,great stay great stay went seahawk game awesom...,5


Split the data, print the shapes to confirm everything's ok

In [4]:
X = data.Review
y = data.Rating.map({1:0, 2:1, 3:2, 4:3, 5:4})

train_size = int(len(data)*0.8)

X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]
#Print the shapes
print("X_train shape: ",X_train.shape)
print("X_test shape: ",X_test.shape)
print("y_train shape: ",y_train.shape)
print("y_test shape: ",y_test.shape)

X_train shape:  (16392,)
X_test shape:  (4099,)
y_train shape:  (16392,)
y_test shape:  (4099,)


Print out some statistics about text data

In [5]:
print("Max sentence length:", X.map(len).max())
print("Min sentence length:", X.map(len).min())
print("Average sentence length:", X.map(len).mean())

chars = sorted(list(set(X)))
print('Total words:', len(chars))

Max sentence length: 13056
Min sentence length: 42
Average sentence length: 701.4552242447904
Total words: 20491


In [6]:
vocab_size = 8000
max_length = 200
embedding_dim = 32
batch_size = 32

tokenizer = Tokenizer(vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen = max_length, padding='pre', truncating='pre')

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding='pre', truncating='pre')

In [7]:
model = Sequential([
                   Embedding(vocab_size, embedding_dim, input_length=max_length),
                   Bidirectional(LSTM(100, return_sequences=True)),
                   LSTM(50),
                   Dropout(0.1),
                   Dense(100, kernel_regularizer=tf.keras.regularizers.l2(0.0015), activation='relu'),
                   Dense(5, activation='softmax') 
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           256000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 200)          106400    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                50200     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 100)               5100      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 418,205
Trainable params: 418,205
Non-trainable params: 0
__________________________________________________

In [8]:
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,
    decay_rate=0.96,
    staircase=True)

In [9]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(train_padded, y_train, epochs=10, batch_size=batch_size,
                    validation_data=(test_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
