In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Bidirectional, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
# Load in preprocessed reviews
reviews = pd.read_csv("C:/Users/Kelvin Chumbe/Anaconda Projects/Hotel Review Mining/Preprocessed_Reviews.csv")

In [4]:
reviews.head()

Unnamed: 0,hotel,review,title,rating,date,month,year
0,Sarova Whitesands Beach Resort & Spa,land country first lunch lido restaurant besid...,refresh getaway,5.0,2021-03-01,3,2021
1,Sarova Whitesands Beach Resort & Spa,high speed wifi myth complaints dismiss advise...,meet expectations,3.0,2021-03-01,3,2021
2,Sarova Whitesands Beach Resort & Spa,thank white sand staff amaze stay delicious fo...,relax spoil,4.0,2020-12-01,12,2020
3,Sarova Whitesands Beach Resort & Spa,travel family days together children pool face...,fantastic stay,5.0,2021-02-01,2,2021
4,Sarova Whitesands Beach Resort & Spa,beautiful place despite fact covid19 measure s...,five days stay sarova whitesands hotel,5.0,2021-02-01,2,2021


In [5]:
GLOVE_DIR = "C:/Users/Kelvin Chumbe/Downloads/Datasets/GloVe_archive"

In [6]:
# Create glove embeddings dict from glove file
glove_path = os.path.join(GLOVE_DIR, "glove.6B.50d.txt")
glove_embeddings= {}
EMBEDDING_DIM = 50

with open(glove_path, encoding='utf8') as file:
    for line in file:
        line = line.split()
        word = line[0]
        embedding = line[1:]

        glove_embeddings[word] = np.asarray(embedding, dtype='float32')

In [7]:
reviews['review'].str.len().max()

630

In [8]:
# Tokenize and pad tweets
reviews_text = reviews['review']

print(reviews_text[:3])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews_text)
sequences = tokenizer.texts_to_sequences(reviews_text)

reviews_text = pad_sequences(sequences, maxlen=630, padding='post')

print(reviews_text[:3])

print('Found {} unique tokens.'.format(len(tokenizer.word_index)))

0    land country first lunch lido restaurant besid...
1    high speed wifi myth complaints dismiss advise...
2    thank white sand staff amaze stay delicious fo...
Name: review, dtype: object
[[ 928  862   61 ...    0    0    0]
 [ 189 1097  134 ...    0    0    0]
 [  71  384  628 ...    0    0    0]]
Found 13015 unique tokens.


In [9]:
# Create weight matrix
embedding_matrix = np.random.random((len(tokenizer.word_index) + 1, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    embedding_vec = glove_embeddings.get(word)
        
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec 


In [10]:
embedding_matrix[0]

array([0.1591819 , 0.01237097, 0.50161303, 0.65444941, 0.8653697 ,
       0.43536641, 0.40103026, 0.54993968, 0.49634609, 0.43810147,
       0.15930271, 0.94339484, 0.12688883, 0.65970393, 0.65094589,
       0.90598402, 0.96283628, 0.93093658, 0.5651829 , 0.15018774,
       0.45109595, 0.16819062, 0.58813657, 0.86094654, 0.73591601,
       0.98564941, 0.20038017, 0.91608363, 0.34201921, 0.39795953,
       0.68914367, 0.98707441, 0.55379934, 0.55464175, 0.52262259,
       0.98903742, 0.3133119 , 0.66334374, 0.90694546, 0.9258202 ,
       0.77031272, 0.994302  , 0.38275044, 0.86364007, 0.11049155,
       0.94488149, 0.35296381, 0.30147595, 0.36208429, 0.01445401])

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label_enc = LabelEncoder()

reviews['rating'] = label_enc.fit_transform(reviews['rating'].values)
# reviews['rating'] = reviews['rating'].apply(lambda x: int(x))

In [13]:
reviews['rating'].unique()

array([4, 2, 3, 0, 1], dtype=int64)

In [14]:
# reviews['Labels'] = reviews['Category'].apply(lambda x: labels_dict.get(x))

In [15]:
# Convert Categories to one-hot encoding
labels = to_categorical(reviews['rating'])
# labels = reviews['rating'].values

In [16]:
labels[:5]

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [17]:
reviews['rating'][:5]

0    4
1    2
2    3
3    4
4    4
Name: rating, dtype: int64

In [18]:
def reverseEncoded(labels):
    return np.argmax(labels, axis=1)

In [19]:
reverseEncoded(labels)[:5]

array([4, 2, 3, 4, 4], dtype=int64)

In [19]:
# Build a Classification Model
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, 
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=630,
              trainable=False),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(200, return_sequences=True)),
    Dropout(0.2),
#     Bidirectional(LSTM(200, return_sequences=True)),
#     Dropout(0.2),
    Bidirectional(LSTM(200)),
    Dropout(0.2),
    
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(5, activation='softmax')
])

In [20]:
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
             
model.compile(loss='categorical_crossentropy', 
              optimizer=sgd, 
              metrics=['accuracy'])

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 630, 50)           650800    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 630, 50)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 630, 400)          401600    
_________________________________________________________________
dropout (Dropout)            (None, 630, 400)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               961600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               2

In [22]:
# Split train and test data
train_split = int(len(reviews_text) * 0.8)

train_X = reviews_text[:train_split]
train_Y = labels[:train_split]

test_X = reviews_text[train_split:]
test_Y = labels[train_split:]

In [23]:
train_Y[:5]

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [24]:
train_X.shape, test_X.shape, train_Y.shape, test_Y.shape

((3980, 630), (995, 630), (3980, 5), (995, 5))

In [25]:
# Create a callback to stop training once validation loss stops improving
# earlystopping = EarlyStopping(
#     monitor="val_loss",N
#     min_delta=0,
#     patience=4,
#     verbose=0,
#     mode="auto",
# )

In [26]:
# filepath = "best_model_word2vec.h5"

# model_checkpoint = ModelCheckpoint(
#     filepath,
#     save_weights_only=True,
#     monitor='val_accuracy',
#     mode='max',
# )

In [None]:
EPOCHS = 25
BATCH_SIZE = 128

history = model.fit(train_X, train_Y, validation_split=0.8, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25

In [None]:
loss, accuracy = model.evaluate(test_X, test_Y)
print("Loss: {}\nAccuracy: {}".format(loss, accuracy))