## Text Classification on Restaurant Reviews using LSTM

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from tensorflow.keras.layers import Dense, Embedding, Dropout,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

### Load dataset

In [3]:
# The reviews file is tab-separated, one review and its 0/1 label per row.
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')

In [4]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
df.shape

(1000, 2)

In [6]:
df.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

### Clean data

In [7]:
#import nltk
#nltk.download('stopwords')

In [8]:
from nltk.corpus import stopwords     #stopwords
import re

In [9]:
# De-duplicated English stop words, with "not" kept out of the list so it
# survives cleaning (presumably because negation matters for sentiment).
stop_word = [word for word in set(stopwords.words('english')) if word != 'not']

In [10]:
#stop_word

In [11]:
def clean(X, stop_words=None):
    """Normalise raw review texts before tokenisation.

    For every text: strip ``<...>`` tags, replace each run of non-letter
    characters with a single space, lower-case the remaining words and drop
    the ones that are stop words.

    Parameters
    ----------
    X : iterable of str
        Raw texts (e.g. a pandas Series or a list of strings).
    stop_words : iterable of str, optional
        Lower-case words to remove. Defaults to the notebook-level
        ``stop_word`` collection.

    Returns
    -------
    numpy.ndarray
        1-D string array with one cleaned text per input, in input order.
    """
    # Resolve the stop words once, as a set, so each token lookup is O(1)
    # instead of a linear scan over a list.
    excluded = frozenset(stop_word if stop_words is None else stop_words)

    preprocessed_reviews = []
    for txt in X:
        txt = re.sub(r'<.*?>', '', txt)        # remove tags
        txt = re.sub('[^A-Za-z]+', ' ', txt)   # keep only A-Z / a-z
        # split() already collapses any whitespace run, so no separate
        # whitespace-normalising substitution is needed.
        words = (word.lower() for word in txt.split())
        preprocessed_reviews.append(' '.join(w for w in words if w not in excluded))

    # dtype=str keeps an empty input a string array rather than float64.
    return np.array(preprocessed_reviews, dtype=str)

In [12]:
# Cleaned version of every review, in the same row order as `df`.
reviews = clean(df['Review'])

### Split dataset into train and test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Features are the cleaned review strings; labels are the "Liked" column values.
X, y = reviews, df['Liked'].values

In [15]:
np.bincount(y)

array([500, 500])

In [16]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

### Convert texts into vectors

In [17]:
total_words = 1600

# Vocabulary is learned from the training texts only; `num_words` caps how
# many of the most frequent words are kept when texts are converted later.
tokenize = Tokenizer(num_words=total_words)
tokenize.fit_on_texts(x_train)


In [18]:
# Full word -> id mapping learned by the tokenizer.
word_index = tokenize.word_index
print('Found total %s unique tokens.' % (len(word_index),))

Found total 1645 unique tokens.


In [19]:
#word_index

In [20]:
# Convert each split's texts into lists of integer word ids.
X_train_new, X_test_new = (
    tokenize.texts_to_sequences(texts) for texts in (x_train, x_test)
)

In [21]:
X_train_new[0]

[13, 8, 637, 372]

In [22]:
#inv_index = {v: k for k, v in tokenize.word_index.items()}

In [23]:
# Every sequence is forced to exactly `max_words` ids: shorter ones are
# zero-padded at the end, longer ones are cut at the end.
max_words = 15
X_train = sequence.pad_sequences(
    X_train_new,
    maxlen=max_words,
    padding='post',
    truncating='post',
)
X_test = sequence.pad_sequences(
    X_test_new,
    maxlen=max_words,
    padding='post',
    truncating='post',
)

In [24]:
len(X_train[0])

15

In [25]:
#from keras.optimizers import Adam

### Model Building

In [26]:
# Import only the optimizer that is actually used instead of a wildcard,
# so it stays clear where `Adam` comes from.
from tensorflow.keras.optimizers import Adam

In [83]:
embed_vector_length = 32

# Stacked-LSTM binary classifier: embedding -> three LSTM layers -> dense head.
model = Sequential()

# Each of the `total_words` token ids is mapped to a trainable 32-d vector.
model.add(Embedding(total_words, embed_vector_length, input_length=max_words))

# The first two LSTMs return the whole sequence so the following LSTM gets
# one vector per timestep; the last LSTM returns only its final output.
model.add(LSTM(32, dropout=.2, return_sequences=True))
model.add(LSTM(64, dropout=.2, return_sequences=True))
model.add(LSTM(128))
model.add(Dropout(.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(rate=0.4))
# Single sigmoid unit: the output is the predicted probability of Liked == 1.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(.001), metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the cell output.
model.summary()


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 15, 32)            51200     
_________________________________________________________________
lstm_12 (LSTM)               (None, 15, 32)            8320      
_________________________________________________________________
lstm_13 (LSTM)               (None, 15, 64)            24832     
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               98816     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)              

### Model Training

In [73]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [84]:
filepath = 'saved_model/model.h5'

# Make sure the target directory exists so the first checkpoint write does
# not fail on a fresh checkout.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# With metrics=['accuracy'], tf.keras logs the validation metric as
# 'val_accuracy' from TF 2.0 on and as 'val_acc' in TF 1.x. Monitoring a key
# that is never logged makes save_best_only skip every save, so choose the
# key that matches the installed version.
# NOTE(review): confirm the logged key via the History object after one fit.
monitor_key = 'val_accuracy' if int(tf.__version__.split('.')[0]) >= 2 else 'val_acc'

# Keep only the weights from the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(filepath=filepath, mode='max', monitor=monitor_key, save_best_only=True)

In [85]:
# NOTE(review): the test split doubles as the validation set here, so the
# checkpoint callback selects the "best" epoch using test data.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint],
)

Train on 800 samples, validate on 200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fec1648b910>

In [76]:
model.evaluate(X_test,y_test)



[1.2730195140838623, 0.77]

### Save model

In [None]:
#model.save('restaurant_model.h5')

In [61]:
from tensorflow.keras.models import load_model

In [77]:
# Reload the model from the same path the ModelCheckpoint callback writes to.
loaded_model = load_model('saved_model/model.h5')

In [78]:
loaded_model.evaluate(X_test,y_test)



[0.5324648571014404, 0.8]

### Check model on test data

In [79]:
test1 = 'food was so amazing'  #+ve review
test2 = "I visited this outlet during Navratra. One of my friends recommended me to try their Navratra \
         special Mitthi lassi and Paneer roll. It's was so good to satisfy you in your fasts.\
         The outlet is quite spacious with a very nice ambience. You can also try many items from a long \
        list in the menu."

test_samples = [test1, test2]

# Run the samples through the same cleaning as the training reviews and
# tokenize the *cleaned* texts; tokenizing the raw strings would skip
# stop-word removal and feed the model differently prepared input.
review = clean(test_samples)
test_token = tokenize.texts_to_sequences(review)

In [80]:
# Pad/truncate the sample sequences exactly like the training data.
test_samples_pad = sequence.pad_sequences(
    test_token,
    maxlen=max_words,
    padding='post',
    truncating='post',
)

In [81]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output, thresholding the predicted probability at 0.5 gives the
# same int32 class labels.
(model.predict(test_samples_pad) > 0.5).astype('int32')

array([[1],
       [1]], dtype=int32)

In [82]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output, thresholding the predicted probability at 0.5 gives the
# same int32 class labels.
(loaded_model.predict(test_samples_pad) > 0.5).astype('int32')

array([[1],
       [1]], dtype=int32)