In [19]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
import os

vocabulary_size = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size, skip_top=100, oov_char=None)
print("Loaded dataset with {} training samples, {} test samples".format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [20]:
review_sample_index = 7

# Inspect a sample review and its label
print("--- Review ---")
print(X_train[review_sample_index])
print("--- Label ---")
print(y_train[review_sample_index])

--- Review ---
[716, 689, 4367, 2343, 4804, 2315, 628, 150, 4069, 2909, 847, 313, 176, 138, 4434, 183, 192, 799, 588, 3231, 152, 339, 4869, 345, 4804, 142, 218, 208, 853, 659, 882, 183, 115, 172, 174, 1001, 398, 1001, 1055, 526, 3717, 1094, 871, 2030, 1109, 230, 4324, 251, 1034, 195, 301, 783, 2945, 103, 465, 845, 446, 1895, 184, 207, 110, 197, 601, 964, 2152, 595, 258, 1730, 338, 550, 728, 1196, 1839, 1546, 602, 120, 320, 786, 196, 786, 225, 373, 1009, 130, 1104, 1292, 225, 194, 1703, 803, 1004, 155, 3231, 853, 2029, 117, 430, 424, 2337, 178, 424, 1465, 178, 142, 424, 178, 662, 130, 898, 1686, 267, 185, 430, 118, 277, 1188, 100, 216, 357, 114, 367, 115, 788, 121, 278, 818, 162, 4165, 237, 600, 306, 157, 549, 628, 824, 4104, 138, 774, 1059, 159, 150, 229, 497, 1493, 175, 251, 453, 189, 127, 394, 292, 107, 2826, 1082, 1251, 906, 1134, 244, 2519, 135, 233, 466, 112, 398, 526, 1572, 4413, 1094, 225, 599, 133, 225, 227, 541, 4323, 171, 139, 539, 3231, 164, 426, 344, 624, 4617, 1082, 629, 2

In [21]:
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_test.shape: ", y_test.shape)

X_train.shape:  (25000,)
y_train.shape:  (25000,)
X_test.shape:  (25000,)
y_test.shape:  (25000,)


In [22]:
print(y_train[:200])

[1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0
 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 1
 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0
 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0]


In [23]:
longest_review_index = np.argmax(np.vectorize(len)(X_train))
review_sample_index = longest_review_index

In [24]:
# Map word IDs back to words
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}

print("--- Longest review ---")
print([id2word.get(i, " ") for i in X_train[review_sample_index]])
print("--- Label ---")
print(y_train[review_sample_index])

--- Longest review ---
['expecting', 'later', 'hurts', 'ways', 'caine', 'expecting', 'land', '1980s', 'manage', 'ill', 'photography', 'land', '1980s', 'body', 'seems', 'better', 'hurts', 'ways', 'caine', 'expecting', 'sort', 'ill', 'photography', 'soft', 'warm', 'expecting', 'give', 'through', "i'm", 'documentary', 'slasher', 'style', 'finds', 'hand', '1980s', 'land', 'exciting', 'strong', 'hits', 'none', '1980s', 'check', '3', 'none', 'go', 'gripping', 'tv', 'place', 'caine', 'ideas', '1980s', 'sky', 'show', 'try', 'clothes', 'ideas', 'watch', 'finds', 'laugh', '1980s', 'caine', 'sky', 'moved', 'place', 'place', 'suppose', 'husband', 'maybe', "i'm", 'caine', '1980s', 'caine', 'show', 'times', 'new', 'none', 'moments', 'things', 'funny', 'caine', 'gripping', 'check', 'committed', '1980s', "i'm", 'none', 'pace', 'leading', 'give', 'those', 'level', 'doll', '1980s', 'better', 'try', 'million', 'widely', '1980s', 'cole', 'watch', '1980s', 'expecting', 'worker', 'watch', '1980s', 'moments'

In [25]:
sorted(id2word.items())

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on'),
 (21, 'not'),
 (22, 'you'),
 (23, 'are'),
 (24, 'his'),
 (25, 'have'),
 (26, 'he'),
 (27, 'be'),
 (28, 'one'),
 (29, 'all'),
 (30, 'at'),
 (31, 'by'),
 (32, 'an'),
 (33, 'they'),
 (34, 'who'),
 (35, 'so'),
 (36, 'from'),
 (37, 'like'),
 (38, 'her'),
 (39, 'or'),
 (40, 'just'),
 (41, 'about'),
 (42, "it's"),
 (43, 'out'),
 (44, 'has'),
 (45, 'if'),
 (46, 'some'),
 (47, 'there'),
 (48, 'what'),
 (49, 'good'),
 (50, 'more'),
 (51, 'when'),
 (52, 'very'),
 (53, 'up'),
 (54, 'no'),
 (55, 'time'),
 (56, 'she'),
 (57, 'even'),
 (58, 'my'),
 (59, 'would'),
 (60, 'which'),
 (61, 'only'),
 (62, 'story'),
 (63, 'really'),
 (64, 'see'),
 (65, 'their'),
 (66, 'had'),
 (67, 'can'),
 (68, 'were'),
 (69, 'me'),
 (70, 'well'),
 (71, 'than'),


In [26]:
max_words = 500

X_train_sized = sequence.pad_sequences(X_train, maxlen=max_words)
X_test_sized = sequence.pad_sequences(X_test, maxlen=max_words)

In [27]:
print("--- Longest review ---")
print([id2word.get(i, " ") for i in X_train_sized[review_sample_index]])
print("--- Label ---")
print(y_train[review_sample_index])

--- Longest review ---
['perspective', 'maybe', 'give', 'budget', "i'm", 'caine', 'those', 'sky', 'thought', 'ever', 'understand', 'seem', 'ever', 'creators', 'capture', 'along', 'granted', 'perspective', 'school', 'try', 'nazis', 'welles', 'sequels', 'greatest', 'core', 'sky', 'beginning', 'hiding', 'eventually', 'know', 'b', 'here', 'new', 'after', 'died', 'dan', 'fill', 'b', 'often', 'daughter', 'fire', 'casting', 'core', 'eaten', 'screening', 'faults', 'fire', 'acting', 'best', 'saw', '1', 'cast', 'takes', 'kind', 'suddenly', 'days', 'seen', 'actors', 'until', 'takes', 'seeing', 'set', 'couple', 'shots', 'core', 'core', 'eaten', 'screening', 'clichéd', 'often', 'try', 'feminist', 'experience', 'dick', 'right', 'jay', 'takes', 'fire', 'guess', 'core', 'seen', 'does', 'having', 'takes', 'seen', 'true', 'cast', 'ok', 'better', 'known', 'core', 'true', 'coming', 'takes', 'core', 'turned', 'beating', 'takes', 'got', 'nothing', 'does', 'these', 'quite', 'murder', 'seen', 'worker', 'those

In [40]:
dropout1 = 0.5
dropout2 = 0.5
embedding_dims = 256
recurrent_nodes = 128

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dims, input_length=max_words))
model.add(Dropout(dropout1))
model.add(LSTM(recurrent_nodes))
model.add(Dropout(dropout2))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 256)          1280000   
_________________________________________________________________
dropout_7 (Dropout)          (None, 500, 256)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 1,477,249
Trainable params: 1,477,249
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
batch_size = 128
num_epochs = 10000
validation_frac = 0.1
interim_model_file = 'sentiment_rnn_running_best.h5'

checkpointer = ModelCheckpoint(filepath=interim_model_file, monitor='val_loss',
                               save_best_only=True, verbose=1)
earlystopper = EarlyStopping(monitor='val_loss', patience=5)

training_history = model.fit(X_train_sized, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1,
          callbacks=[checkpointer, earlystopper], validation_split=validation_frac)

Train on 22500 samples, validate on 2500 samples
Epoch 1/10000

In [None]:
cache_dir = ''

final_model_file = "sentiment_rnn_last.h5"  # HDF5 file
model.save(os.path.join(cache_dir, final_model_file))

In [None]:
model = load_model(os.path.join(cache_dir, 'sentiment_rnn_running_best.h5'))

In [None]:
scores = model.evaluate(X_test_sized, y_test, verbose=1)

In [None]:
print("Test accuracy:", scores)

In [None]:
scores