# LSTM
Used sources are https://towardsdatascience.com/sentiment-analysis-with-deep-learning-62d4d0166ef6 and https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/.

In [1]:

import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk

nltk.download('punkt')

# fix random seed for reproducibility
np.random.seed(7)


Using TensorFlow backend.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Judit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# load cleaned data
reviews = pd.read_csv('../Data/simpler_clean_data.csv')
reviews

Unnamed: 0,Reviewer_Score,Review
0,1,happy breakfast included expensive hotel overp...
1,0,disgusting attitude told brother serious car a...
2,1,bathroom much smaller expected tea coffee drin...
3,1,checking great big queue saturday lunchtime re...
4,1,spa basic paris visit spa arguably best positi...
5,1,bar across street open least weekend noise aud...
6,0,hotel dated whole hotel dirty room gloomy move...
7,1,convenient access via dockland rail modern spa...
8,1,nothing would definitely stay great location f...
9,1,breakfast need halal option overall great stay...


In [25]:
y = reviews.iloc[:, :-1] 
X = reviews.iloc[:, -1] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state = 5)

In [26]:
words = []

def word_cnt(text):
    tokens = nltk.word_tokenize(str(text))
    for word in tokens:
        if word not in words:
            words.append(word)

for text in X_train:
    word_cnt(text)
    

In [27]:
new_x_train = []
new_x_test = []

def text_to_intlist(text, new):
    tokens = nltk.word_tokenize(str(text))
    list = []
    for word in tokens:
        list.append(words.index(word))
    new.append(list)
   
def reviews_to_ints(df, new):
    for text in X_train:
        text_to_intlist(text, new)

reviews_to_ints(X_train, new_x_train)
reviews_to_ints(X_test, new_x_test)

In [28]:
X_train = np.array(new_x_train)
X_test = np.array(new_x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [29]:
# truncate and pad input sequences
max_review_length = 500

top_words = len(words)

X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 32)           483840    
_________________________________________________________________
dropout_11 (Dropout)         (None, 500, 32)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 537,141
Trainable params: 537,141
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2b640158f60>

In [30]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 90.98%
