# importing basic libraries

In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

# Importing other important libraries from Keras

In [22]:
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [23]:
tf.__version__

'1.7.0'

In [24]:
tf.keras.__version__

'2.1.4-tf'

# Loading / downloading the data

In [25]:
import imdb

In [26]:
imdb.maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


# Splitting the data (training and test sets)

In [27]:
# Load the texts and their labels for each split via the imdb helper:
# train=True for the training set, train=False for the test set.
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

In [28]:
# Report how many texts each split contains.
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [29]:
# Combined train + test texts; the tokenizer vocabulary is fitted on this list.
data_text = x_train_text + x_test_text

In [30]:
x_train_text[1]

"Trapped: buried alive brings us to a resort that has just opened, and is soon to close.<br /><br />We start with a guy in gear blowing up drifts, to avoid the possibility of avalanches. somehow, that doesn't make sense. anyways, he's about to blow away one particularly big one, when he notices the resort is open. despite his best efforts, higher authority tells him his day is over.<br /><br />soon, as everyone expects, an avalanche hits.<br /><br />Look, i'm not gonna reveal any more, all i can say is this was a B-movie designed for the family channel (which i just saw it on, and the fact it had no commercials proves it's a B-movie) anyways, it's a pretty decent film, but it's partially unreal.<br /><br />firsthand, when people are buried by ice and snow, they're buried. not just traced by powder. or, what about a CD for a screwdriver? it's not possible. and finally, what i can't stress enough, is that an explosion cannot stop a avalanche, guaranteed.<br /><br />furthermore, it's wort

In [31]:
y_train[1]

1.0

# Tokenizer

In [32]:
# Vocabulary-size limit passed to the Tokenizer and reused as the
# Embedding layer's input_dim.
num_words = 10000

In [33]:
# Tokenizer limited to num_words words (None would mean no limit, see the
# fallback further down).
tokenizer = Tokenizer(num_words=num_words)

In [34]:
%%time
# Build the vocabulary from the combined train + test texts.
tokenizer.fit_on_texts(data_text)

CPU times: user 26.3 s, sys: 58 ms, total: 26.4 s
Wall time: 26.6 s


In [35]:
# If no limit was chosen, use the size of the full vocabulary so that
# later cells always see an integer num_words.
if num_words is None:
    num_words = len(tokenizer.word_index)

In [36]:
tokenizer.word_index

{'cameo': 2111,
 'barkeeps': 110508,
 'executors': 84348,
 'closet': 4312,
 'wheeeew': 94669,
 'voiceless': 54193,
 'charu': 60696,
 'maclaughlin': 57436,
 'muldoon': 52974,
 'schlock': 5715,
 'unchecked': 31823,
 'ormsby': 71031,
 'motherf': 40560,
 "1970's": 4331,
 'touristas': 80363,
 "'heart": 24107,
 'distasful': 121416,
 'kellogg': 58259,
 "'stellar": 114139,
 'yashimo': 109245,
 'favoirite': 103650,
 'filmschool': 49217,
 'jumpstart': 39411,
 "o'quinn": 37935,
 'ds9': 18632,
 'gravic': 55846,
 'avocation': 58989,
 'fanatics': 9908,
 'carlsen': 35602,
 'bombastic': 13762,
 'obstructs': 83186,
 'hooooot': 68692,
 '914': 66162,
 'round': 2147,
 'appropriating': 59570,
 'insinuates': 31798,
 'padilla': 56440,
 'typified': 32172,
 'wasted': 1002,
 'polio': 47584,
 "'inventing": 123119,
 'bristish': 106914,
 'osterlich': 42224,
 "fag'": 78497,
 'meats': 60686,
 'charities': 50802,
 'gulpilil': 25422,
 'kellogs': 112555,
 'pertinency': 112596,
 'production': 353,
 "marriages'": 110577,

In [37]:
# Convert every training text into a list of integer tokens.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [38]:
# Show one training text next to its token sequence for comparison.
print(x_train_text[1])
print(np.array(x_train_tokens[1]))

Trapped: buried alive brings us to a resort that has just opened, and is soon to close.<br /><br />We start with a guy in gear blowing up drifts, to avoid the possibility of avalanches. somehow, that doesn't make sense. anyways, he's about to blow away one particularly big one, when he notices the resort is open. despite his best efforts, higher authority tells him his day is over.<br /><br />soon, as everyone expects, an avalanche hits.<br /><br />Look, i'm not gonna reveal any more, all i can say is this was a B-movie designed for the family channel (which i just saw it on, and the fact it had no commercials proves it's a B-movie) anyways, it's a pretty decent film, but it's partially unreal.<br /><br />firsthand, when people are buried by ice and snow, they're buried. not just traced by powder. or, what about a CD for a screwdriver? it's not possible. and finally, what i can't stress enough, is that an explosion cannot stop a avalanche, guaranteed.<br /><br />furthermore, it's worth

In [39]:
# Convert every test text into a list of integer tokens, using the same
# fitted tokenizer as for the training set.
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

# Padding and Truncating Data

In [40]:
# Length (number of tokens) of every sequence in the train and test sets,
# collected into a single NumPy array.
num_tokens = np.array(
    [len(sequence) for sequence in x_train_tokens + x_test_tokens]
)

In [41]:
np.mean(num_tokens)

221.27716

In [42]:
np.max(num_tokens)

2209

In [43]:
np.min(num_tokens)

6

In [44]:

# Sequence length used for padding/truncating: the mean length plus two
# standard deviations, truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

544

In [45]:
# Fraction of sequences that are strictly shorter than max_tokens.
np.mean(num_tokens < max_tokens)

0.94528

In [46]:
# Used for both `padding` and `truncating` in pad_sequences below, so
# zeros are added at, and excess tokens removed from, the start of a sequence.
pad = 'pre'

In [47]:
# Bring every training sequence to exactly max_tokens entries.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [48]:
# Bring every test sequence to exactly max_tokens entries, with the same
# settings as for the training set.
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [49]:
x_train_pad.shape

(25000, 544)

In [50]:
x_test_pad.shape

(25000, 544)

In [51]:
np.array(x_train_tokens[1])

array([2499, 3625, 1126,  981,  176,    5,    3, 4399,   12,   45,   39,
       3014,    2,    6,  526,    5,  500,    7,    7,   73,  375,   16,
          3,  219,    8, 6418, 3690,   53,    5,  793,    1, 3820,    4,
        835,   12,  149,   94,  282, 3639,  237,   42,    5, 2310,  243,
         27,  583,  191,   27,   50,   28, 7017,    1, 4399,    6,  849,
        467,   24,  116, 1925, 2060, 4679,  717,   87,   24,  254,    6,
        121,    7,    7,  526,   14,  304, 5877,   32, 1903,    7,    7,
        163,  145,   21, 2174, 2517,   99,   51,   29,   10,   67,  131,
          6,   11,   13,    3,  492,   17, 2611,   15,    1,  236, 1204,
         60,   10,   39,  210,    9,   20,    2,    1,  192,    9,   66,
         54, 3478, 1658,   44,    3,  492,   17, 3639,   44,    3,  180,
        540,   19,   18,   44, 5719, 5176,    7,    7,   50,   83,   23,
       3625,   31, 1830,    2, 3015,  501, 3625,   21,   39,   31,   38,
         48,   42,    3, 4287,   15,    3,   44,   

In [52]:
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Tokenizer Inverse Map

In [53]:
# Reverse lookup table: integer token -> word.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}

In [54]:
def tokens_to_string(tokens):
    """
    Convert a sequence of integer tokens back into a text string.

    Tokens equal to 0 are skipped; every other token is looked up in
    the module-level inverse_map and the resulting words are joined
    with single spaces.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [55]:
x_train_text[1]

"Trapped: buried alive brings us to a resort that has just opened, and is soon to close.<br /><br />We start with a guy in gear blowing up drifts, to avoid the possibility of avalanches. somehow, that doesn't make sense. anyways, he's about to blow away one particularly big one, when he notices the resort is open. despite his best efforts, higher authority tells him his day is over.<br /><br />soon, as everyone expects, an avalanche hits.<br /><br />Look, i'm not gonna reveal any more, all i can say is this was a B-movie designed for the family channel (which i just saw it on, and the fact it had no commercials proves it's a B-movie) anyways, it's a pretty decent film, but it's partially unreal.<br /><br />firsthand, when people are buried by ice and snow, they're buried. not just traced by powder. or, what about a CD for a screwdriver? it's not possible. and finally, what i can't stress enough, is that an explosion cannot stop a avalanche, guaranteed.<br /><br />furthermore, it's wort

In [56]:
tokens_to_string(x_train_tokens[1])

"trapped buried alive brings us to a resort that has just opened and is soon to close br br we start with a guy in gear blowing up to avoid the possibility of somehow that doesn't make sense anyways he's about to blow away one particularly big one when he notices the resort is open despite his best efforts higher authority tells him his day is over br br soon as everyone expects an hits br br look i'm not gonna reveal any more all i can say is this was a b movie designed for the family channel which i just saw it on and the fact it had no commercials proves it's a b movie anyways it's a pretty decent film but it's partially unreal br br when people are buried by ice and snow they're buried not just by or what about a cd for a it's not possible and finally what i can't stress enough is that an explosion cannot stop a guaranteed br br furthermore it's worth a rental or a tv viewing but not 7 10 br br the movie is rated pg but maybe it should have received something a little more strong a

# Create the RNN

In [57]:
# Empty sequential model; the layers are added one per cell below.
model = Sequential()

### Embedding layer 1

In [58]:
# Length of the embedding vector produced for each token.
embedding_size = 8

In [59]:
# Embedding layer: maps each of the num_words integer tokens to a vector of
# embedding_size values. It is named so it can be fetched later with
# model.get_layer('layer_embedding').
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

#### layer 2

In [60]:
# First GRU layer (16 units); return_sequences=True so the following GRU
# receives an output for every position in the sequence.
model.add(GRU(units=16, return_sequences=True))

#### layer 3

In [61]:
# Second GRU layer (8 units), again returning the full output sequence.
model.add(GRU(units=8, return_sequences=True))

#### layer 4

In [62]:
# Third GRU layer (4 units); return_sequences is left at its default, so the
# summary below shows a single 4-value output per review.
model.add(GRU(units=4))

#### Fully-connected (last) layer

In [63]:
# Output layer: one sigmoid unit, i.e. a single value between 0 and 1.
model.add(Dense(1, activation='sigmoid'))

In [64]:
# Adam optimizer with a learning rate of 1e-3.
optimizer = Adam(lr=1e-3)

In [65]:
# Binary cross-entropy loss to match the single sigmoid output; accuracy is
# tracked as the only extra metric.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


# Train the RNN

In [None]:
%%time
# Train for 3 epochs in batches of 64, holding out 5% of the training data
# for validation.
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
 5056/23750 [=====>........................] - ETA: 9:24 - loss: 0.5165 - acc: 0.7486

# Performance on Test-Set

In [None]:
%%time
# Evaluate the trained model on the padded test set.
result = model.evaluate(x_test_pad, y_test)

In [None]:
# result[1] is presumably the accuracy (the one metric given to compile,
# following the loss at index 0) -- printed as a percentage.
print("Accuracy: {0:.2%}".format(result[1]))

# Example of Mis-Classified Text

In [None]:
%%time
# Predict for the first 1000 test sequences and keep the first row of the
# transposed result, giving one predicted value per sequence.
y_pred = model.predict(x=x_test_pad[0:1000]).T[0]

In [None]:
# Threshold the predictions at 0.5: values above it become class 1.0,
# everything else class 0.0.
cls_pred = (np.asarray(y_pred) > 0.5).astype(np.float64)

In [None]:
# True labels for the same first 1000 test texts that were predicted above.
cls_true = np.array(y_test[0:1000])

In [None]:
# Positions (within the first 1000 test texts) where the predicted class
# differs from the true class. np.where returns a tuple; take its first entry.
incorrect = np.where(cls_pred != cls_true)[0]

In [None]:
len(incorrect)

In [None]:
# Index of the first mis-classified text. Note: this rebinds `idx`, which an
# earlier cell used for tokenizer.word_index.
idx = incorrect[0]
idx

In [None]:
# Display the mis-classified test text itself.
text = x_test_text[idx]
text

In [None]:
y_pred[idx]

In [None]:
cls_true[idx]

# New Data

In [None]:
# Hand-written example reviews to run through the trained model.
texts = [
    "This movie is fantastic! I really like it because it is so good!",
    "Good movie!",
    "Maybe I like this movie.",
    "Meh ...",
    "If I were a drunk teenager then this movie might be good.",
    "Bad movie!",
    "Not a good movie!",
    "This movie really sucks! Can I get my money back please?",
]

# Keep the individual names available as well.
text1, text2, text3, text4, text5, text6, text7, text8 = texts


In [None]:
# Tokenize the example texts with the tokenizer fitted earlier.
tokens = tokenizer.texts_to_sequences(texts)

In [None]:
# Pad/truncate the example sequences with the same settings as the training
# and test data, then show the resulting array shape.
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

In [None]:
model.predict(tokens_pad)

# Embeddings

In [None]:
# Fetch the embedding layer by the name it was given when added to the model.
layer_embedding = model.get_layer('layer_embedding')

In [None]:
# First weight array of the embedding layer; later cells index it by
# integer token to get that token's embedding vector.
weights_embedding = layer_embedding.get_weights()[0]

In [None]:
weights_embedding.shape

In [None]:
# Integer token assigned to the word 'good'.
token_good = tokenizer.word_index['good']
token_good

In [None]:
# Integer token assigned to the word 'great'.
token_great = tokenizer.word_index['great']
token_great

In [None]:
weights_embedding[token_good]

In [None]:
weights_embedding[token_great]

In [None]:
# Integer tokens for two negative words, used to look up their embeddings below.
token_bad = tokenizer.word_index['bad']
token_horrible = tokenizer.word_index['horrible']

In [None]:
weights_embedding[token_bad]

In [None]:
weights_embedding[token_horrible]

# Sorted Words

In [None]:
def print_sorted_words(word, metric='cosine'):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.

    The 10 closest and the 10 most distant words are printed, each
    together with its distance.

    :param word:
        Word to compare against. It must be a key of
        tokenizer.word_index, otherwise a KeyError is raised.
    :param metric:
        Distance metric passed on to scipy's cdist,
        e.g. 'cosine' or 'euclidean'.
    :return:
        Nothing. The result is printed.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # Get the embedding for the given word. Note that the
    # embedding-weight-matrix is indexed by the word-tokens
    # which are integer IDs.
    embedding = weights_embedding[token]

    # Calculate the distance between the embeddings for
    # this word and every row of the embedding-matrix.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]

    # Get an index sorted according to the embedding-distances.
    # These are the tokens (integer IDs) for words in the vocabulary.
    sorted_index = np.argsort(distances)

    # Token 0 has no word in inverse_map, so it is removed from the
    # index BEFORE the distances are looked up. Removing it from the
    # word-list only would shift every word after it by one position
    # relative to the distances, so the wrong distance would be printed.
    sorted_index = sorted_index[sorted_index != 0]

    # Sort the embedding-distances (now aligned with the words).
    sorted_distances = distances[sorted_index]

    # Sort all the words in the vocabulary according to their
    # embedding-distance. This is a bit excessive because we
    # will only print the top and bottom words.
    sorted_words = [inverse_map[token] for token in sorted_index]

    # Helper-function for printing words and embedding-distances.
    def _print_words(words, distances):
        for word, distance in zip(words, distances):
            print("{0:.3f} - {1}".format(distance, word))

    # Number of words to print from the top and bottom of the list.
    k = 10

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(sorted_words[0:k], sorted_distances[0:k])

    print("...")

    # Print the words with highest embedding-distance.
    _print_words(sorted_words[-k:], sorted_distances[-k:])

In [None]:
print_sorted_words('great', metric='cosine')

In [None]:
print_sorted_words('worst', metric='cosine')