In [None]:
# RNN to classify IMDB movie reviews
#   predict: positive (1), negative (0)
#
# 1. Load data and inspect
# 2. Embeddings
# 3. Train RNN (LSTM) to classify the reviews

In [52]:
from keras.datasets import imdb

# load the reviews with the top 3000 words only
# skip top 5 most frequent words (like 'a', 'the'): stop words
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=3000,
                                                      skip_top=5)

In [58]:
# gets the word index json
text_to_index = imdb.get_word_index()

# maps word index json from term -> index to index -> term
index_to_text = dict((text_to_index[k], k) for k in text_to_index)

# converts first review from index to words 
print('label', y_train[0])
print('review (numbers):', X_train[0])
print('review (words):', " ".join([index_to_text[x] for x in X_train[0]]))

label 1
review (numbers): [2, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 2, 2, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 2, 172, 112, 167, 2, 336, 385, 39, 2, 172, 2, 1111, 17, 546, 38, 13, 447, 2, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 2, 1920, 2, 469, 2, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 2, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 2, 2223, 2, 16, 480, 66, 2, 33, 2, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 2, 107, 117, 2, 15, 256, 2, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 2, 2, 1029, 13, 104, 88, 2, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 2, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 2, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
review (words): and as you with out themselves powerful lets loves the

In [55]:
import numpy as np
lengths = np.array([len(X_train[i]) for i in range(len(X_train))])

# to find out what window size to use
# we'll have to tune this for speeding up training or improving accuracy
window_size = int(np.mean(lengths))
window_size

# or you can try the max (but very slow)
#lengths[np.argmax(lengths)]

238

In [108]:
# Create our RNN model
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense

vocab_size = 3000 # this can also be tuned (speed vs. accuracy)
embedding_size = 50 # another parameter to tune (speed vs. accuracy)
lstm_output_size = 32 # tune to how many output features

model = Sequential()
# featurizer
model.add(Embedding(vocab_size, embedding_size))
# return_sequences: output will be a sequence that you can
# feed into another LSTM
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))
# classifier
model.add(Dense(lstm_output_size, activation='relu')) # binary classifier
model.add(Dense(1, activation='sigmoid')) # binary classifier
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 50)          150000    
_________________________________________________________________
lstm_8 (LSTM)                (None, None, 64)          29440     
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_10 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 192,945
Trainable params: 192,945
Non-trainable params: 0
_________________________________________________________________


In [103]:
# Feature Engineering: make sure sequences are same length (window_size)
# before feeding into Keras
from keras.preprocessing import sequence

# pads or truncates the sequences
# Note: maybe truncating='post' because earlier words may be
# useful for semantic meaning
# Note: if you are doing text prediction, then later words are
# more useful for prediction
X_train = sequence.pad_sequences(X_train, maxlen=window_size)
X_test = sequence.pad_sequences(X_test, maxlen=window_size)

print(X_train.shape)

(25000, 238)


In [105]:
# Train
model.compile('rmsprop', loss='binary_crossentropy',
              metrics=['accuracy'])

In [106]:
from keras.callbacks import TensorBoard, EarlyStopping
import time

tensorboard = TensorBoard(log_dir='logs/imdb_lstm%d' % time.time())
earlystop = EarlyStopping(patience=1)

history = model.fit(X_train, y_train, batch_size=32, epochs=10,
                    callbacks=[tensorboard, earlystop],
                    validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10


In [107]:
# Score
model.evaluate(X_test, y_test)



[0.4200911285352707, 0.8476]

In [80]:
model.metrics_names

['loss', 'acc']

In [109]:
# Predict
pred = model.predict_classes(X_test)

In [110]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

             precision    recall  f1-score   support

          0       0.50      0.38      0.43     12500
          1       0.50      0.63      0.56     12500

avg / total       0.50      0.50      0.50     25000

[[4692 7808]
 [4607 7893]]


In [None]:
# To improve accuracy try increasing these:
#
# vocab_size = 3000 # this can also be tuned (speed vs. accuracy)
# embedding_size = 50 # another parameter to tune (speed vs. accuracy)
# lstm_output_size = 32 # tune to how many output features
# 
# Try recurrent_dropout (reduce overfit)
# Add another LSTM layer (learn more)

# To speed things up:
# 
# Increase batch size to 64, 128 (number of times per epoch)
# Reduce the hyperparameters above
#

In [96]:
# Predict
# Source: https://www.imdb.com/title/tt3104988/reviews

test1 = """I was so excited when I first learned that Kevin Kwan's "Crazy Rich Asians" was going to become a film! The book was way more appealing than I had first imagined it would be, and I'm happy to report that Jon Chu's screen version has surpassed my hopeful-but-wary expectations. Not to make it sound too simplistic, the movie was beautiful and very, very funny. Go see it!
Yes, it is a romantic comedy - but this has such intriguing social and cultural undercurrents that it tempts even the fairly observant watcher away from taking the "Cinderella" story at its glitzy face value. While the numerous characters had to have their backstories compressed to fit into just two hours, we are given enough great dialogue, effervescent or slightly evil portrayals, and sumptuous visual clues to make the friends and family members in Singapore come alive.
All the aunties, cousins and ladies-in-waiting may be slightly overwhelming for people who haven't read the book, but anyone with wacky friends and pompous relatives should get it, even if they are not Asian. 
I liked film's especially clever use of graphics, as well as the smooth-to-rocking score, the lush and verdant locations, the perfect designer costuming, and pretty much everything else. One of my favorite lines was about having attended Cal State Fullerton; but you must to watch it for yourself. I honestly have not laughed out loud during a film this much in decades. Oh, and I rather liked Chris Pang, too. A totally hot actor, even though I'm old enough to be his mother.
As soon as Crazy Rich Asians officially opens, I'm going to catch it again. The preview was not enough, and there were so many little moments that deserve a second or third look. Now we must all hope that a sequel with the same talented cast and Chu in charge is coming our way before too long. Thank you all, you fabulous Asian actors, crew, writers and backers who made this possible. And no, I'm not of even a little bit Asian ancestry.
"""
test2 = """what a boring movie. This was a very boring film. I fell asleep in the cinema. This movie deserves no attention! I do not recommend this movie because it's a waste of time."""

def clean_and_get_sequence(text):
    # https://keras.io/preprocessing/text/#text_to_word_sequence
    from keras.preprocessing.text import text_to_word_sequence

    test_sequence = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n   ',
                                          lower=True, split=' ')
    # drop words not in vocab
    test_sequence_cleaned = [s for s in test_sequence if s in text_to_index]

    # map to indices
    test_sequence_index = [text_to_index[s] for s in test_sequence_cleaned]
    #print('as index\n', sequence_index)

    # filter out top 3000
    test_sequence_index_3000 = [i for i in test_sequence_index if i <= 3000]
    #print('as index (top 3000 only)\n', test_sequence_index_3000)

    # look at review
    test_review = ' '.join([index_to_text[i] for i in test_sequence_index_3000])
    #print('as words (top 3000 only)\n', test_review)
    
    return test_sequence_index_3000

test1_index = clean_and_get_sequence(test1)
test2_index = clean_and_get_sequence(test2)

test_reviews = [test1_index, test2_index]

#print('Pad sequences (samples x time)')
test_reviews = sequence.pad_sequences(test_reviews, maxlen=window_size)
#print('test_reviews shape:', test_reviews.shape)

tests = [test1, test2]
pred_prob = model.predict(test_reviews)
pred_label = model.predict_classes(test_reviews)

for text, label, probability in zip(tests, pred_label, pred_prob):
    print(text)
    print('positive', label, 'probability', probability)
    print('-------')

I was so excited when I first learned that Kevin Kwan's "Crazy Rich Asians" was going to become a film! The book was way more appealing than I had first imagined it would be, and I'm happy to report that Jon Chu's screen version has surpassed my hopeful-but-wary expectations. Not to make it sound too simplistic, the movie was beautiful and very, very funny. Go see it!
Yes, it is a romantic comedy - but this has such intriguing social and cultural undercurrents that it tempts even the fairly observant watcher away from taking the "Cinderella" story at its glitzy face value. While the numerous characters had to have their backstories compressed to fit into just two hours, we are given enough great dialogue, effervescent or slightly evil portrayals, and sumptuous visual clues to make the friends and family members in Singapore come alive.
All the aunties, cousins and ladies-in-waiting may be slightly overwhelming for people who haven't read the book, but anyone with wacky friends and pomp

In [112]:
from keras.layers import Conv1D, GlobalMaxPooling1D, Activation

# Try 1D Convolution as a comparison
filters = 250 # rectangle depth
kernel_size = 3

# building CNN model
model_cnn = Sequential()

# featurizer
# 3000 vocab, 50-dimension embedding vector, review length 238
model_cnn.add(Embedding(vocab_size, embedding_size,
              input_length=window_size))
model_cnn.add(Conv1D(filters, kernel_size, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())

# classifier
model_cnn.add(Dense(filters, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 238, 50)           150000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 236, 250)          37750     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 250)               62750     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 251       
Total params: 250,751
Trainable params: 250,751
Non-trainable params: 0
_________________________________________________________________


In [114]:
model_cnn.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='./logs/imdb_cnn%d' % time.time())
earlystop = EarlyStopping(patience=1)

history = model_cnn.fit(X_train, y_train, batch_size=32, epochs=10,
                        validation_split=0.2,
                        callbacks=[tensorboard, earlystop])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10


In [115]:
model_cnn.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='./logs/imdb_cnn%d' % time.time())
earlystop = EarlyStopping(patience=1)

history = model_cnn.fit(X_train, y_train, batch_size=64, epochs=10,
                        validation_split=0.2,
                        callbacks=[tensorboard, earlystop])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
