In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Global plot styling: apply seaborn's defaults first, then override the
# style with 'white'. The order matters, because the second call overrides
# the style chosen by the first.
sns.set()
sns.set_style('white')

In [2]:
# --- Configuration -----------------------------------------------------------
SEED = 42           # fixed seed so a fresh "Restart & Run All" is repeatable
vocab_size = 10000  # passed as num_words: only the most frequent words keep their own id
max_len = 100       # number of tokens kept per review (used for padding and the Embedding layer)

# Seed both NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every fresh run. Bit-exact results can still
# vary across hardware and TensorFlow versions.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Data --------------------------------------------------------------------
# Each review arrives as a list of integer word ids; labels are 0/1 sentiment.
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(
    num_words=vocab_size
)

In [4]:
# Build word <-> id lookup tables that match the ids used by load_data().
# get_word_index() returns word -> rank, while load_data() shifts every rank
# by 3 (its default index_from) to make room for the reserved ids below.
word_index = tf.keras.datasets.imdb.get_word_index()
word_index = {word: rank + 3 for word, rank in word_index.items()}

# Reserved ids. <PAD> (0) is the value pad_sequences() fills with by default;
# without it, decoding any padded sequence raises KeyError on id 0.
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

# Inverse mapping (id -> word) used to turn a sequence back into text.
reverse_word_index = {index: word for word, index in word_index.items()}

In [5]:
# Sanity check: show the first training review as ids, then decoded to words.
train_ex_sequence = train_sequences[0]
print(train_ex_sequence)

# Look every id up in the id -> word table (raises KeyError on an unknown id).
train_ex_text = list(map(reverse_word_index.__getitem__, train_ex_sequence))
print(train_ex_text)

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
['<START>', 'this', 'film', 'was', 'just', 'brilliant', 'cas

In [6]:
# Bring every review to exactly max_len tokens: longer reviews lose their tail
# (truncating='post'); shorter ones are padded using pad_sequences' defaults.
train_sequences, test_sequences = (
    pad_sequences(split, maxlen=max_len, truncating='post')
    for split in (train_sequences, test_sequences)
)

In [7]:
# Multi-hot ("bag of words") encoding for the model that has no embedding
# layer. Word ids are arbitrary labels rather than magnitudes, so feeding them
# in directly (or merely rescaling them) would impose a meaningless ordering
# on the vocabulary; one indicator column per word avoids that.
def vectorize(sequences, dimension=None, dtype=np.float64):
    """Multi-hot encode a batch of integer sequences.

    Parameters
    ----------
    sequences : sequence of integer sequences
        One row of word ids per sample. Ids are expected to lie in
        ``[0, dimension)``; NumPy raises ``IndexError`` for ids that are
        too large.
    dimension : int, optional
        Number of columns in the result. When omitted, the notebook-level
        ``vocab_size`` is looked up at call time, so re-running the config
        cell takes effect without re-defining this function.
    dtype : NumPy dtype, optional
        Element type of the result. Defaults to ``np.float64`` (what
        ``np.zeros`` produces by default); pass ``np.float32`` to halve the
        memory of the dense matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)`` in which
        ``results[i, j]`` is 1 if id ``j`` occurs in ``sequences[i]`` and 0
        otherwise. Word order and repeat counts are discarded. If a row
        contains the id 0 (e.g. from padding), column 0 is set as well.
    """
    if dimension is None:
        dimension = vocab_size
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # Integer-array indexing sets every listed column in one go;
        # repeated ids simply write the same 1 again.
        results[i, sequence] = 1
    return results
# Multi-hot encode both splits with the default output width.
train_sequences_vectorized, test_sequences_vectorized = map(
    vectorize, (train_sequences, test_sequences)
)

In [9]:
# Baseline without an embedding: a single 128-unit hidden layer on top of the
# multi-hot review vectors, followed by a sigmoid unit for the binary label.
num_epochs = 2

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(vocab_size,)))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# The test split is passed as validation data, so it is evaluated after every epoch.
model.fit(
    train_sequences_vectorized,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_sequences_vectorized, test_labels),
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 128)               1280128   
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 1,280,257
Trainable params: 1,280,257
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a4dc2d128>

In [11]:
# Embedding variant: each of the max_len token ids is mapped to a 16-dimensional
# vector, the vectors are flattened into one feature row, and a single sigmoid
# unit predicts the binary label. Input is the padded id sequences themselves.
num_epochs = 2

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 16, input_length=max_len))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# The test split is passed as validation data, so it is evaluated after every epoch.
model.fit(
    train_sequences,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_sequences, test_labels),
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 16)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 1601      
Total params: 161,601
Trainable params: 161,601
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a4f5f76d8>

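## Why use word embeddings?
Mainly for computational efficiency: the embedding model above has 161,601 parameters, compared with 1,280,257 for the dense model trained on multi-hot vectors.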

In [12]:
# plot the embedding vectors of words, or the similarity between them