# Sentiment Analysis -- More MOVIE Revies

_Natural Langauge Processing 
---



## RNNs


In [1]:
from keras.datasets import imdb  # import the built-in imdb dataset in Keras

vocabulary_size = 5000

# Loading the training and test data (note the difference in convention compared to scikit-learn)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
print("Loaded dataset with {} training samples, {} test samples".format(len(X_train), len(X_test)))

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [2]:
# Inspect a sample review and its label
print("--- Review ---")
print(X_train[7])
print("--- Label ---")
print(y_train[7])

--- Review ---
[1, 660, 6, 22, 9, 2279, 218, 2707, 6, 78, 155, 146, 55, 1688, 8, 106, 6, 2279, 22, 48, 50, 9, 6, 213, 8, 30, 93, 5, 14, 31, 434, 47, 31, 45, 4, 400, 2, 65, 7, 6, 1619, 132, 5, 27, 223, 17, 36, 585, 111, 153, 7, 199, 2, 5, 958, 21, 13, 215, 3081, 25, 15, 14, 238, 43, 30, 44, 4, 91, 2279, 22, 8, 216, 46, 7, 363, 11, 4, 3814, 88, 2, 113, 11, 2714, 16, 1211, 8, 135, 4, 222, 30, 2848, 8, 106, 3309, 44, 2, 341, 5, 3497, 10, 10, 44, 4, 64, 1566, 85, 74, 4, 2, 1163, 7, 4625, 9, 15, 4, 485, 9, 256, 34, 723, 2, 137, 29, 16, 35, 4843, 1020, 132, 45, 43, 6, 147, 902, 15, 363, 210, 177, 84, 7, 1874, 4776, 8, 297, 2188, 832, 315, 14, 999, 48, 25, 26, 2648, 692, 33, 32, 45, 184, 578, 15, 2, 9, 355, 18, 4, 173]
--- Label ---
1


Notice that the label is an integer (0 for negative, 1 for positive), and the review itself is stored as a sequence of integers. These are word IDs that have been preassigned to individual words. To map them back to the original words, you can use the dictionary returned by `imdb.get_word_index()`.

In [3]:
# Map word IDs back to words
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print("--- Review (with words) ---")
print([id2word.get(i, " ") for i in X_train[7]])
print("--- Label ---")
print(y_train[7])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
--- Review (with words) ---
['the', 'this', 'of', 'plot', 'many', 'was', 'one', 'life', 'styles', 'and', 'was', 'one', 'saw', 'in', 'is', 'intent', 'showed', 'scene', 'it', 'by', 'success', 'unrealistic', 'was', 'how', 'least', 'called', 'and', 'those', 'this', 'of', 'and', 'fast', 'original', 'oh', 'and', 'and', 'or', 'them', 'are', 'was', 'kelly', 'be', 'possible', 'factory', 'to', 'supposed', 'all', 'with', 'and', 'but', 'him', 'god', 'for', 'obvious', 'this', 'and', 'no', 'remains', 'and', 'eyes', 'made', 'try', 'and', 'this', 'is', 'you', 'oh', 'and', 'we', 'of', 'yes', 'br', 'and', 'and', 'lady', 'br', 'screen', 'be', 'runs', 'protagonist', 'in', 'sheriff', 'with', 'be', 'behavior', 'almost', 'this', 'decides', 'spoiler', 'and', 'was', 'one', 'point', 'between', 'all', 'with', 'world', 'turns', 'in', 'beyond', 'think', 'being', 'is', 'soundtrack', 'alive', 'for', 'it', 'friends', 'was', 'least', 'ma



### Pad sequences


In [4]:
from keras.preprocessing import sequence

max_words = 500

X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

###  Design an RNN model for sentiment analysis



In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout


embedding_size=32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))

model.add(Dense(1, activation="sigmoid"))



print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None




### Train and evaluate model



In [6]:

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


In [7]:

batch_size = 64
num_epochs = 3

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 =  X_train[batch_size:], y_train[batch_size:]

model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), epochs=num_epochs, batch_size=batch_size)

Train on 24936 samples, validate on 64 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6f58bbf048>

In [8]:

import os

model_file = "rnn_model.h5"  # HDF5 file
model.save(os.path.join(os.getcwd(), model_file))



In [9]:
# Evaluate the model
scores = model.evaluate(X_test, y_test, verbose=0)  # returns loss and other metrics specified in model.compile()
print("Test accuracy:", scores[1])  # scores[1] should correspond to accuracy if you passed in metrics=['accuracy']

Test accuracy: 0.88096
