In [1]:
from keras.datasets import imdb
from keras.utils import get_file
import pickle
import numpy as np
from keras.preprocessing import sequence
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, Convolution1D, MaxPooling1D, concatenate
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
# Load the word -> integer index mapping that ships with the Keras IMDB dataset.
idx = imdb.get_word_index()
# Spot-check one lookup; the cell displays the integer index stored for 'hello'.
idx['hello']

4822

In [3]:
# Reverse lookup table: integer index -> word (inverse of idx).
idx2word = {index: word for word, index in idx.items()}

In [4]:
# Words ordered by ascending integer index; show the first five.
sorted_idx = [word for word, _rank in sorted(idx.items(), key=lambda pair: pair[1])]
sorted_idx[:5]

['the', 'and', 'a', 'of', 'to']

In [5]:
#grab the imdb dataset
# get_file returns a local path for the pickle; the md5_hash argument is passed
# so the library can check the file against the expected digest.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(security): pickle.load can execute arbitrary code from the file, so this
# is only acceptable because the source is trusted.
# The context manager closes the file handle as soon as the data is loaded
# (the previous bare open() left it open for the life of the kernel).
with open(path, 'rb') as f:
    # Two (sequences, labels) pairs: the training split and the test split.
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [6]:
# Sanity check that the download/unpickle worked: the training split
# should contain 25000 reviews.
len(x_train)

25000

In [7]:
# Sanity check on the data format: the first training review should render
# as a comma-separated list of integer word ids.
', '.join(str(word_id) for word_id in x_train[0])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [8]:
# Sanity check on the reverse lookup: id 23022 (the first id of the first
# training review) should map to 'bromwell'.
idx2word[23022]

'bromwell'

In [9]:
# Further check of idx2word: decode every id of the first training review
# back to words; the result should read as a whole review.
' '.join(idx2word[word_id] for word_id in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [10]:
# Make sure the training labels are present: 1 marks a good (positive) review,
# 0 a negative one. Show the first ten.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [11]:
vocab_size = 5000
# Replace rare words: every id at or above vocab_size-1 is capped to
# vocab_size-1, so all rare words share that single id.
# Each review becomes its own numpy array (reviews still differ in length here).
trn = [np.array([min(word_id, vocab_size-1) for word_id in review]) for review in x_train]
test = [np.array([min(word_id, vocab_size-1) for word_id in review]) for review in x_test]

In [12]:
# Fixed review length fed to the models.
seq_len = 500
# Bring every review to exactly seq_len ids, using 0 as the fill value.
# Note that trn/test are rebound here from lists of arrays to padded arrays.
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [13]:
# Sanity check on the padded training data: shape should be (25000, 500),
# i.e. one row of seq_len ids per review.
trn.shape

(25000, 500)

In [14]:
#first model with our own 50 dimensional word embeddings
# Each sample is a sequence of seq_len integer word ids.
input_tensor = Input(shape=(seq_len,), dtype='int32', name='main_input')

# Trainable 50-d embedding table. The table size is tied to vocab_size (instead
# of a literal 5000) so it always matches the id range produced by the
# rare-word capping step.
embedding_layer = Embedding(vocab_size, 50, input_length=seq_len, name='main_embedding')(input_tensor)
embedding_layer = Dropout(0.2)(embedding_layer)

# Three parallel convolution branches over the embeddings, with kernel
# widths 2, 3 and 4; each is pooled and flattened before merging.
convs = []
for num in range(2, 5):
    x = Convolution1D(64, num, padding='same', activation="relu")(embedding_layer)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

# Merge the conv branches, then also feed the flattened (post-dropout)
# embeddings directly to the dense head as a shortcut.
conv_out = Concatenate(name='concatenated_convs')(convs)
shortcut = Flatten(name='shortcut_main_embedding')(embedding_layer)
dense_in = Concatenate(name='concat_main_embedding_plus_convs')([conv_out, shortcut])

# Dense head with dropout on both sides of the hidden layer.
nex = Dropout(0.2)(dense_in)
nex = Dense(100, activation="relu", name='dense_consolidator')(nex)
nex = Dropout(0.2)(nex)
# Single sigmoid unit for the binary (good/negative review) label.
full_out = Dense(1, activation='sigmoid', name='final_output')(nex)

local_embedding_only_50_model = Model(input_tensor, full_out)
local_embedding_only_50_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 500)           0                                            
____________________________________________________________________________________________________
main_embedding (Embedding)       (None, 500, 50)       250000      main_input[0][0]                 
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 500, 50)       0           main_embedding[0][0]             
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 500, 64)       6464        dropout_1[0][0]                  
___________________________________________________________________________________________

In [15]:
# Configure training: binary cross-entropy loss, Adam with its default
# arguments, and accuracy reported as a metric.
local_embedding_only_50_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [16]:
# Train for 2 epochs in batches of 64. The test split is passed as
# validation_data, so the reported validation metrics are test-set metrics.
local_embedding_only_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0b7ef37a58>

In [17]:
# Call fit again on the same model object for 8 more epochs (same data,
# batch size and validation data as the previous cell).
# NOTE(review): presumably this continues from the weights left by the earlier
# fit call rather than restarting -- confirm.
local_embedding_only_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=8, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0b78f75198>

In [18]:
#second model with our own 100 dimensional word embeddings
# Each sample is a sequence of seq_len integer word ids.
input_tensor = Input(shape=(seq_len,), dtype='int32', name='main_input')

# Trainable 100-d embedding table. The table size is tied to vocab_size
# (instead of a literal 5000) so it always matches the id range produced by
# the rare-word capping step.
embedding_layer = Embedding(vocab_size, 100, input_length=seq_len, name='main_embedding')(input_tensor)
embedding_layer = Dropout(0.2)(embedding_layer)

# Three parallel convolution branches over the embeddings, with kernel
# widths 2, 3 and 4; each is pooled and flattened before merging.
convs = []
for num in range(2, 5):
    x = Convolution1D(64, num, padding='same', activation="relu")(embedding_layer)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

# Merge the conv branches, then also feed the flattened (post-dropout)
# embeddings directly to the dense head as a shortcut.
conv_out = Concatenate(name='concatenated_convs')(convs)
shortcut = Flatten(name='shortcut_main_embedding')(embedding_layer)
dense_in = Concatenate(name='concat_main_embedding_plus_convs')([conv_out, shortcut])

# Dense head with dropout on both sides of the hidden layer.
nex = Dropout(0.2)(dense_in)
nex = Dense(100, activation="relu", name='dense_consolidator')(nex)
nex = Dropout(0.2)(nex)
# Single sigmoid unit for the binary (good/negative review) label.
full_out = Dense(1, activation='sigmoid', name='final_output')(nex)

local_embedding_only_100_model = Model(input_tensor, full_out)
local_embedding_only_100_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 500)           0                                            
____________________________________________________________________________________________________
main_embedding (Embedding)       (None, 500, 100)      500000      main_input[0][0]                 
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 500, 100)      0           main_embedding[0][0]             
____________________________________________________________________________________________________
conv1d_4 (Conv1D)                (None, 500, 64)       12864       dropout_4[0][0]                  
___________________________________________________________________________________________

In [19]:
# Configure training: binary cross-entropy loss, Adam with its default
# arguments, and accuracy reported as a metric.
local_embedding_only_100_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [20]:
# Train for 2 epochs in batches of 64. The test split is passed as
# validation_data, so the reported validation metrics are test-set metrics.
local_embedding_only_100_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0b487bbfd0>

In [21]:
# Call fit again on the same model object for 8 more epochs (same data,
# batch size and validation data as the previous cell).
# NOTE(review): presumably this continues from the weights left by the earlier
# fit call rather than restarting -- confirm.
local_embedding_only_100_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=8, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0b243806d8>

In [22]:
# Parse the 50-d GloVe text file into a dict: word -> float32 vector.
# Each non-empty line is "<word> <coef> <coef> ...", split on whitespace.
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing fails.
# The encoding is given explicitly so non-ASCII tokens decode the same way on
# every platform instead of depending on the locale default.
with open('glove/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) rather than raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [23]:
# Build the frozen GloVe weight matrix: row i holds the GloVe vector for the
# word whose IMDB index is i.
word_index = idx
# Keep only words whose index falls inside the model vocabulary
# (tied to vocab_size instead of a literal 5000).
truncated_word_index = {key: value for key, value in word_index.items() if value < vocab_size}
print(len(truncated_word_index))
glove_embedding_dimension = 50
# One row per possible token id 0..vocab_size-1. Sizing from vocab_size directly
# (rather than len(truncated_word_index) + 1) keeps the shape correct even if
# the word index has gaps.
embedding_matrix = np.zeros((vocab_size, glove_embedding_dimension))
# In the training data, id vocab_size-1 is the shared bucket that every rare
# word was capped to, so it must not take on the vector of the single word
# that happens to own that index; it stays all-zeros.
rare_word_index = vocab_size - 1
for word, i in truncated_word_index.items():
    if i == rare_word_index:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the GloVe index keep their all-zeros row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

4999


In [35]:
#third model with glove 50 dimensional word embeddings only
# Each sample is a sequence of seq_len integer word ids.
input_tensor = Input(shape=(seq_len,), dtype='int32', name='main_input')

# Frozen embedding initialised from the GloVe matrix. Its dimensions use
# vocab_size / glove_embedding_dimension / seq_len instead of the literals
# 5000 / 50 / 500, so the layer cannot drift out of sync with the data
# preparation or the matrix it is loaded from.
glove_embedding_layer = Embedding(vocab_size, glove_embedding_dimension, input_length=seq_len,
                                  name='glove_embedding', weights=[embedding_matrix],
                                  trainable=False)(input_tensor)

# Three parallel convolution branches over the GloVe embeddings, with kernel
# widths 2, 3 and 4; each is pooled and flattened before merging.
convs = []
for num in range(2, 5):
    x = Convolution1D(64, num, padding='same', activation="relu")(glove_embedding_layer)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

# Merge the conv branches, then also feed the flattened GloVe embeddings
# directly to the dense head as a shortcut.
conv_out = Concatenate(name='concatenated_convs')(convs)
shortcut = Flatten(name='shortcut_main_embedding')(glove_embedding_layer)
dense_in = Concatenate(name='concat_main_embedding_plus_convs')([conv_out, shortcut])

# Dense head with dropout on both sides of the hidden layer.
nex = Dropout(0.2)(dense_in)
nex = Dense(100, activation="relu", name='dense_consolidator')(nex)
nex = Dropout(0.2)(nex)
# Single sigmoid unit for the binary (good/negative review) label.
full_out = Dense(1, activation='sigmoid', name='final_output')(nex)

glove_embedding_only_50_model = Model(input_tensor, full_out)
glove_embedding_only_50_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 500)           0                                            
____________________________________________________________________________________________________
glove_embedding (Embedding)      (None, 500, 50)       250000      main_input[0][0]                 
____________________________________________________________________________________________________
conv1d_31 (Conv1D)               (None, 500, 64)       6464        glove_embedding[0][0]            
____________________________________________________________________________________________________
conv1d_32 (Conv1D)               (None, 500, 64)       9664        glove_embedding[0][0]            
___________________________________________________________________________________________

In [36]:
# Configure training: binary cross-entropy loss, Adam with its default
# arguments, and accuracy reported as a metric.
glove_embedding_only_50_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [37]:
# Train for 2 epochs in batches of 64. The test split is passed as
# validation_data, so the reported validation metrics are test-set metrics.
glove_embedding_only_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0ae4be71d0>

In [38]:
# Call fit again on the same model object for 8 more epochs (same data,
# batch size and validation data as the previous cell).
# NOTE(review): presumably this continues from the weights left by the earlier
# fit call rather than restarting -- confirm.
glove_embedding_only_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=8, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0ae4be72b0>

In [39]:
#fourth model combining our local embedding along with the glove embedding
# Each sample is a sequence of seq_len integer word ids. seq_len and vocab_size
# replace the literals 500 and 5000 used previously, so this model stays
# consistent with the data preparation cells.
input_tensor = Input(shape=(seq_len,), dtype='int32', name='main_input')

# Trainable 50-d embedding learned from scratch, with dropout.
embedding_layer = Embedding(vocab_size, 50, input_length=seq_len, name='main_embedding')(input_tensor)
embedding_layer = Dropout(0.2)(embedding_layer)

# Frozen embedding initialised from the GloVe matrix, fed by the same input.
glove_embedding_layer = Embedding(vocab_size, glove_embedding_dimension, input_length=seq_len,
                                  name='glove_embedding', weights=[embedding_matrix],
                                  trainable=False)(input_tensor)

# Six conv branches in total: kernel widths 2, 3 and 4 over the trainable
# embedding, then the same three widths over the GloVe embedding.
convs = []
for num in range(2, 5):
    x = Convolution1D(64, num, padding='same', activation="relu")(embedding_layer)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

for num in range(2, 5):
    x = Convolution1D(64, num, padding='same', activation="relu")(glove_embedding_layer)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

# Merge all conv branches. The shortcut to the dense head carries only the
# trainable (post-dropout) embedding, not the GloVe one.
conv_out = Concatenate(name='concatenated_convs')(convs)
shortcut = Flatten(name='shortcut_main_embedding')(embedding_layer)
dense_in = Concatenate(name='concat_main_embedding_plus_convs')([conv_out, shortcut])

# Dense head with dropout on both sides of the hidden layer.
nex = Dropout(0.2)(dense_in)
nex = Dense(100, activation="relu", name='dense_consolidator')(nex)
nex = Dropout(0.2)(nex)
# Single sigmoid unit for the binary (good/negative review) label.
full_out = Dense(1, activation='sigmoid', name='final_output')(nex)

combined_50_50_model = Model(input_tensor, full_out)
combined_50_50_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 500)           0                                            
____________________________________________________________________________________________________
main_embedding (Embedding)       (None, 500, 50)       250000      main_input[0][0]                 
____________________________________________________________________________________________________
dropout_27 (Dropout)             (None, 500, 50)       0           main_embedding[0][0]             
____________________________________________________________________________________________________
glove_embedding (Embedding)      (None, 500, 50)       250000      main_input[0][0]                 
___________________________________________________________________________________________

In [40]:
# Configure training: binary cross-entropy loss, Adam with its default
# arguments, and accuracy reported as a metric.
combined_50_50_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [41]:
# Train for 2 epochs in batches of 64. The test split is passed as
# validation_data, so the reported validation metrics are test-set metrics.
combined_50_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0ae4592400>

In [42]:
# Call fit again on the same model object for 8 more epochs (same data,
# batch size and validation data as the previous cell).
# NOTE(review): presumably this continues from the weights left by the earlier
# fit call rather than restarting -- confirm.
combined_50_50_model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=8, batch_size=64, verbose=1)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0ae42ebb00>