# Sentiment Analysis with RNN

Reference:
- https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e

In [1]:
from keras.datasets import imdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
vocab_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [3]:
X_train.shape

(25000,)

In [4]:
X_test.shape

(25000,)

In [5]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}

In [6]:
[id2word.get(i, '') for i in X_train[6]]

['the',
 'and',
 'full',
 'involving',
 'to',
 'impressive',
 'boring',
 'this',
 'as',
 'and',
 'and',
 'br',
 'villain',
 'and',
 'and',
 'need',
 'has',
 'of',
 'costumes',
 'b',
 'message',
 'to',
 'may',
 'of',
 'props',
 'this',
 'and',
 'and',
 'concept',
 'issue',
 'and',
 'to',
 "god's",
 'he',
 'is',
 'and',
 'unfolds',
 'movie',
 'women',
 'like',
 "isn't",
 'surely',
 "i'm",
 'and',
 'to',
 'toward',
 'in',
 "here's",
 'for',
 'from',
 'did',
 'having',
 'because',
 'very',
 'quality',
 'it',
 'is',
 'and',
 'and',
 'really',
 'book',
 'is',
 'both',
 'too',
 'worked',
 'carl',
 'of',
 'and',
 'br',
 'of',
 'reviewer',
 'closer',
 'figure',
 'really',
 'there',
 'will',
 'and',
 'things',
 'is',
 'far',
 'this',
 'make',
 'mistakes',
 'and',
 'was',
 "couldn't",
 'of',
 'few',
 'br',
 'of',
 'you',
 'to',
 "don't",
 'female',
 'than',
 'place',
 'she',
 'to',
 'was',
 'between',
 'that',
 'nothing',
 'and',
 'movies',
 'get',
 'are',
 'and',
 'br',
 'yes',
 'female',
 'just

In [7]:
#max review length
len(max((X_train + X_test), key=len))

2569

In [8]:
#min review length
len(min((X_train + X_test), key=len))

69

## pad sequences

In [9]:
from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

## Model : RNN

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [11]:
embedding_size = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [14]:
batch_size = 64
num_epoches = 30
validation_split = 0.2

In [15]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

early_stopping_callback = EarlyStopping(monitor='val_loss', \
                                       patience=100)

In [16]:
# X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
# X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]
# history = model.fit(X_train2, y_train2,\
#                     validation_split=(X_valid, y_valid),\
#                    batch_size=batch_size, epochs=num_epoches)

In [17]:
history = model.fit(X_train, y_train,\
                   batch_size=batch_size, epochs=num_epoches,\
                   verbose=1, validation_split=validation_split,\
                   callbacks=[early_stopping_callback])

Train on 20000 samples, validate on 5000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [18]:
score = model.evaluate(X_test, y_test, verbose=1)
print('\n')
print('test score : ', score[0])
print('test accuracy : ', score[1])



test score :  1.0346745039898158
test accuracy :  0.82944
