We'll look at these:

https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py

https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

In [1]:
# 1. Load the IMDB dataset
#
# 2. Train a Convolutional neural net to classify text
#
# 3. Predict, evaluate
#
# 4. Try a LSTM to classify text


In [4]:
# 1. Load the IMDB dataset

from keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [5]:
word_index = imdb.get_word_index()
word_index

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [6]:
# gets the word index json
text_to_index = imdb.get_word_index()

# maps word index json from term -> index to index -> term
index_to_text = dict((text_to_index[k], k) for k in text_to_index)

# index_to_text = dict()
# for k in text_to_index:
#    index_to_text[text_to_index[k]] = k


# converts first review from index to words 
print(y_train[0]) # sentiment
print(x_train[0])
print(" ".join([index_to_text[x] for x in x_train[0]])) # review

# [1, 2, 3]

# [1, 0, 0], [0, 1, 0], [0, 0, 1]

print(y_train[1]) # sentiment
print(x_train[1])
print(" ".join([index_to_text[x] for x in x_train[1]])) # review

1
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
the as you with out themselves powerful lets loves their becomes reaching

In [7]:
index_to_text[2]

'and'

In [8]:
len(text_to_index)

88584

In [78]:
# 2. Train a Convolutional neural net to classify text

from keras.models import Sequential
from keras.layers import Embedding # New!
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Activation
from keras.preprocessing import sequence

max_features = 5000 # size of vocab
input_length = 400 # max length of review (must PAD reviews)

# (5000 one-hot -> 50 embedded features)
embedding_dims = 50 # size of embedding layer 

# Pad the sequences (make them same length for neural net.)
# Padding add <PAD> for missing words

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=input_length)
x_test = sequence.pad_sequences(x_test, maxlen=input_length)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

filters = 250 # rectangle depth
kernel_size = 3

# building CNN model
model = Sequential()

# featurizer
#                   5000 vocab, 50 compressed dimension, review length 400
model.add(Embedding(max_features, embedding_dims,
                    input_length=input_length))
# no dropout for now
model.add(Conv1D(filters, kernel_size, padding='valid', # no padding
                 activation='relu', strides=1))
model.add(GlobalMaxPooling1D()) # output size = 250

# classifier
model.add(Dense(filters)) # input size 250 -> output size 250
model.add(Activation('relu'))
model.add(Dense(1)) # input size 250 -> output size = 1 (-inf to inf)
model.add(Activation('sigmoid')) # sigmoid - binary 0 or 1 

model.summary()

Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 250)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 250)               62750     
_________________________________________________________________
activation_3 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 251       
________________________________________________________

In [79]:
# Exercise:

# Complete training following the example here:
# https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py

# Bonus: use tensorboard callbacks and early stopping

In [80]:
from keras.callbacks import TensorBoard, EarlyStopping
import time

epochs=10
batch_size=32

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='./logs/%d' % time.time())
earlystopping = EarlyStopping(patience=3)

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          callbacks=[tensorboard, earlystopping])

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x1681cb179b0>

In [76]:
model.save('imdb_lstm_2.h5')

In [60]:
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.4832260911282897
Test accuracy: 0.88976


In [71]:
# Predict
# Source: https://www.imdb.com/title/tt3104988/reviews

test1 = """I was so excited when I first learned that Kevin Kwan's "Crazy Rich Asians" was going to become a film! The book was way more appealing than I had first imagined it would be, and I'm happy to report that Jon Chu's screen version has surpassed my hopeful-but-wary expectations. Not to make it sound too simplistic, the movie was beautiful and very, very funny. Go see it!
Yes, it is a romantic comedy - but this has such intriguing social and cultural undercurrents that it tempts even the fairly observant watcher away from taking the "Cinderella" story at its glitzy face value. While the numerous characters had to have their backstories compressed to fit into just two hours, we are given enough great dialogue, effervescent or slightly evil portrayals, and sumptuous visual clues to make the friends and family members in Singapore come alive.
All the aunties, cousins and ladies-in-waiting may be slightly overwhelming for people who haven't read the book, but anyone with wacky friends and pompous relatives should get it, even if they are not Asian. 
I liked film's especially clever use of graphics, as well as the smooth-to-rocking score, the lush and verdant locations, the perfect designer costuming, and pretty much everything else. One of my favorite lines was about having attended Cal State Fullerton; but you must to watch it for yourself. I honestly have not laughed out loud during a film this much in decades. Oh, and I rather liked Chris Pang, too. A totally hot actor, even though I'm old enough to be his mother.
As soon as Crazy Rich Asians officially opens, I'm going to catch it again. The preview was not enough, and there were so many little moments that deserve a second or third look. Now we must all hope that a sequel with the same talented cast and Chu in charge is coming our way before too long. Thank you all, you fabulous Asian actors, crew, writers and backers who made this possible. And no, I'm not of even a little bit Asian ancestry.
"""
test2 = """what a boring movie. This was a very boring film. I fell asleep in the cinema. This movie deserves no attention! I do not recommend this movie because it's a waste of time."""

def clean_and_get_sequence(text):
    # https://keras.io/preprocessing/text/#text_to_word_sequence
    from keras.preprocessing.text import text_to_word_sequence

    test_sequence = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\n   ',
                                          lower=True, split=' ')
    # print('before cleaning:', test_sequence)

    # drop words not in vocab
    test_sequence_cleaned = [s for s in test_sequence if s in text_to_index]
    #print('after cleaning:', len(test_sequence_cleaned))

    # words that got dropped
    #print('dropped words:', set(test_sequence) - set(test_sequence_cleaned))

    # map to indices
    test_sequence_index = [text_to_index[s] for s in test_sequence_cleaned]
    #print('as index\n', sequence_index)

    # filter out top 5000
    test_sequence_index_5000 = [i for i in test_sequence_index if i <= 5000]
    print('as index (top 5000 only)\n', test_sequence_index_5000)

    # look at review
    test_review = ' '.join([index_to_text[i] for i in test_sequence_index_5000])
    print('as words (top 5000 only)\n', test_review)
    
    return test_sequence_index_5000

test1_index = clean_and_get_sequence(test1)
test2_index = clean_and_get_sequence(test2)

as index (top 5000 only)
 [10, 13, 35, 2226, 51, 10, 83, 2048, 12, 1839, 929, 1023, 13, 167, 5, 410, 3, 19, 1, 271, 13, 93, 50, 2273, 71, 10, 66, 83, 3789, 9, 59, 27, 2, 143, 651, 5, 4433, 12, 2633, 265, 307, 44, 58, 18, 1395, 21, 5, 94, 9, 478, 96, 4271, 1, 17, 13, 304, 2, 52, 52, 160, 137, 64, 9, 419, 9, 6, 3, 728, 209, 18, 11, 44, 138, 1771, 1028, 2, 2664, 12, 9, 57, 1, 1015, 242, 36, 653, 1, 2376, 62, 30, 91, 390, 1104, 134, 1, 1939, 102, 66, 5, 25, 65, 5, 1180, 80, 40, 104, 631, 72, 23, 345, 192, 84, 411, 39, 1073, 442, 2, 1111, 3623, 5, 94, 1, 366, 2, 220, 1063, 8, 213, 1236, 29, 1, 2, 1911, 8, 1061, 200, 27, 1073, 3978, 15, 81, 34, 771, 329, 1, 271, 18, 256, 16, 4943, 366, 2, 4781, 141, 76, 9, 57, 45, 33, 23, 21, 2185, 10, 420, 595, 259, 1093, 358, 4, 2873, 14, 70, 14, 1, 3554, 5, 600, 1, 2, 1976, 1, 401, 2, 181, 73, 282, 331, 28, 4, 58, 511, 408, 13, 41, 257, 1107, 18, 22, 212, 5, 103, 9, 15, 621, 10, 1249, 25, 21, 1495, 43, 1289, 312, 3, 19, 11, 73, 8, 2737, 446, 2, 10, 244, 4

In [53]:
from keras.preprocessing import sequence

test_reviews = [test1_index, test2_index]

print('Pad sequences (samples x time)')
test_reviews = sequence.pad_sequences(test_reviews, maxlen=input_length)
print('test_reviews shape:', test_reviews.shape)

Pad sequences (samples x time)
test_reviews shape: (2, 400)


In [72]:
tests = [test1, test2]
pred_prob = model.predict(test_reviews)
pred_label = model.predict_classes(test_reviews)

for text, label, probability in zip(tests, pred_label, pred_prob):
    print(text)
    print('positive', label, 'probability', probability)
    print('-------')

I was so excited when I first learned that Kevin Kwan's "Crazy Rich Asians" was going to become a film! The book was way more appealing than I had first imagined it would be, and I'm happy to report that Jon Chu's screen version has surpassed my hopeful-but-wary expectations. Not to make it sound too simplistic, the movie was beautiful and very, very funny. Go see it!
Yes, it is a romantic comedy - but this has such intriguing social and cultural undercurrents that it tempts even the fairly observant watcher away from taking the "Cinderella" story at its glitzy face value. While the numerous characters had to have their backstories compressed to fit into just two hours, we are given enough great dialogue, effervescent or slightly evil portrayals, and sumptuous visual clues to make the friends and family members in Singapore come alive.
All the aunties, cousins and ladies-in-waiting may be slightly overwhelming for people who haven't read the book, but anyone with wacky friends and pomp

In [75]:
from keras.models import load_model

model2 = load_model('imdb_lstm')
model2.summary()

tests = [test1, test2]
pred_prob = model2.predict(test_reviews)
pred_label = model2.predict_classes(test_reviews)

for text, label, probability in zip(tests, pred_label, pred_prob):
    print(text)
    print('positive', label, 'probability', probability)
    print('-------')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total para

In [77]:
from keras.models import load_model

model3 = load_model('imdb_lstm_terence.h5')
model3.summary()

tests = [test1, test2]
pred_prob = model3.predict(test_reviews)
pred_label = model3.predict_classes(test_reviews)

for text, label, probability in zip(tests, pred_label, pred_prob):
    print(text)
    print('positive', label, 'probability', probability)
    print('-------')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total para

## RNN


Hidden layer at time step $t$ ($h_t$), is computed from using $h_{t-1}$ (from previous time step) ($\sigma$ = sigmoid):

$$h_t=\sigma(W_{xh}x_t+W_{hh}h_{t−1})$$

Output (only based on latest $h_t$):

$$y_t = softmax(W_{hy}h_t)$$

### RNN: Classic Unrolling
![rnn](https://chunml.github.io/images/projects/creating-text-generator-using-recurrent-neural-network/vanilla_RNN.png)


### RNN problem: Vanishing Gradient
- As we back-propagate, gradient of the cost function w.r.t. weights tends to diminish.
- This means we forget stuff from earlier time steps.
- Training becomes very slow because RNN doesn't remember much 

![vanishing gradient](https://cdn-images-1.medium.com/max/2000/1*FWy4STsp8k0M5Yd8LifG_Q.png)

### LSTM

- $h_{t-1}$ is the output at time step $t-1$
- Cell state ($C_t$) holds the "long-short term memory" and is controlled by 3 gates:
  - Input gate: decides which values to update ($i_t$)
  - Forget gate: decides which values to forget ($f_t$)
  - Output gate: decides which values to output ($o_t$)
  
- $h_t$ is the output at time step $t$

![lstm](https://chunml.github.io/images/projects/creating-text-generator-using-recurrent-neural-network/LSTM.png)

In [64]:
from keras.layers import LSTM

print('Build model...')
model_lstm = Sequential()
model_lstm.add(Embedding(max_features, 128))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model_lstm.compile(loss='binary_crossentropy', 
                   optimizer='adam',
                   metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='./logs/lstm_%d' % time.time())
earlystopping = EarlyStopping(patience=1)

print('Train...')
model_lstm.fit(x_train, y_train, 
               batch_size=batch_size,
               epochs=15,
               validation_data=(x_test, y_test),
               callbacks=[tensorboard, earlystopping])

Build model...
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15


<keras.callbacks.History at 0x16874e23a90>

In [65]:
score, acc = model_lstm.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.586160055847168
Test accuracy: 0.67104


In [128]:
index_to_text[0] = '<PAD>'
print(" ".join([index_to_text[x] for x in x_train[0]])) # review

<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [108]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

input_length=10 # sentence length
max_features=5000 # word vocab ['cat', 'dog', 'hello', 'world', .. , 'I']
output_dim=2 # output dimension 5000 -> 2

model = Sequential()
embedding = Embedding(input_dim=max_features, # vocab
                      output_dim=output_dim, # reduced dimension
                      input_length=input_length) # length sentence
model.add(embedding)

input_array = np.random.randint(max_features, size=(1, input_length))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

print(input_array)
print(output_array)

# [918] => [-1.09779835e-02  1.01016387e-02] # (2,) (embeddings: efficient)
# [918] => [0 0 0 0 0 0 0 .. 0 1 0 ..... 0] # (5000,) one-hot (inefficient!)

[[ 918  217  725  499 3686 4094 1907 3472  265  541]]
[[[-1.09779835e-02  1.01016387e-02]
  [ 2.35645883e-02  2.97786482e-02]
  [ 4.68710177e-02  1.79429390e-02]
  [-1.49728172e-02 -5.07915020e-03]
  [-2.50376817e-02 -3.04645300e-03]
  [-8.12233612e-03 -4.28397171e-02]
  [ 3.54124643e-02  4.47824113e-02]
  [-3.34480293e-02  3.93447615e-02]
  [ 2.95992829e-02 -2.69031879e-02]
  [ 8.04662704e-05 -9.21357423e-04]]]


In [111]:
embedding.get_weights()[0][918]

array([-0.01097798,  0.01010164], dtype=float32)

In [109]:
embedding.get_config()

{'name': 'embedding_29',
 'trainable': True,
 'batch_input_shape': (None, 10),
 'dtype': 'float32',
 'input_dim': 5000,
 'output_dim': 2,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'activity_regularizer': None,
 'embeddings_constraint': None,
 'mask_zero': False,
 'input_length': 10}