In [14]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D, GlobalMaxPooling1D
from keras.preprocessing import sequence, text

import numpy as np
import os
import json


# Workaround for the ValueError raised when loading the pickle file

Since the current version of numpy (1.16.4) sets the `allow_pickle` parameter of `np.load` to `False` by default, we need to override this parameter to be able to load the dataset into memory.
We should, of course, restore the original `np.load` afterwards. See how this is done in the next cell:


In [6]:
# Keep a handle on the original np.load so it can be put back afterwards
np_load_old = np.load

# Temporarily replace np.load with a wrapper that forces allow_pickle=True
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

try:
    # imdb.load_data goes through the patched np.load, so allow_pickle is
    # implicitly True for this call.
    # NOTE(review): vocabulary_size is assigned in the "set parameters" cell
    # further down; that cell has to run before this one on a fresh kernel.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
finally:
    # Always restore np.load, even when load_data raises. Otherwise the patch
    # would leak into the rest of the session, and re-running this cell would
    # stack a second wrapper that passes allow_pickle twice (TypeError).
    np.load = np_load_old


In [21]:
# Hyper-parameters shared by every cell of the notebook

# -- data --
vocabulary_size = 5000   # num_words passed to imdb.load_data / Embedding input size
max_len = 1000           # length every review is padded or truncated to

# -- network --
embedding_dims = 25      # width of the embedding learned from scratch
filters = 16             # number of filters in each Conv1D layer
kernel_size = 3          # window size of each Conv1D layer
hidden_dims = 250        # units in the fully connected layer

# -- training --
batch_size = 32
epochs = 10

In [22]:
# Transform the dataset
# The reviews returned by imdb.load_data are already sequences of word ids,
# so no tokenizer is needed here: both splits only have to be brought to a
# common length of max_len.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_len)
    for split in (X_train, X_test)
)

In [23]:
# Build a CNN classifier that learns its own word embedding with keras

model = Sequential([
    # Map every vocabulary index to an embedding_dims-dimensional vector
    Embedding(vocabulary_size, embedding_dims, input_length=max_len),
    Dropout(0.3),

    # First convolution: learns word-group filters spanning kernel_size tokens
    Conv1D(filters, kernel_size, padding='valid', activation='relu'),
    # Down-sample with max pooling (keras default pool size)
    MaxPooling1D(),

    # Second convolution, then flatten for the fully connected part
    Conv1D(filters, kernel_size, padding='valid', activation='relu'),
    Flatten(),
    Dense(hidden_dims, activation='relu'),
    Dropout(0.3),

    # Output layer: a single sigmoid unit (positive or negative review)
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [24]:
# Fit the model; the test split is used as validation data after each epoch
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x6576aa890>

In [34]:
# Prepare the pre-trained GloVe embeddings used by the next model

def load_glove_embeddings(src_path, filename='glove.6B.100d.txt'):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-empty line is split on whitespace: the first field is taken as
    the word, the remaining fields as its coefficients.

    Args:
        src_path: Directory that contains the GloVe file.
        filename: Name of the file inside ``src_path``. Defaults to
            ``'glove.6B.100d.txt'``.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    glove_path = os.path.join(src_path, filename)
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding, which fails on non-ASCII tokens on some systems.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a stray newline at end of file)
                # instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

embeddings_index = load_glove_embeddings('../../../data/non_versioned')

# One row per token id; rows stay all-zero for ids without a GloVe vector.
# The 100 columns match the 100-dimensional GloVe file loaded above.
embeddings_matrix = np.zeros((vocabulary_size, 100))

# imdb.load_data() encodes a word as its rank in imdb.get_word_index() plus
# `index_from` (keras default: 3); the ids below that offset are reserved
# (padding, start-of-sequence, out-of-vocabulary).
imdb_index_offset = 3
for word, rank in imdb.get_word_index().items():
    index = rank + imdb_index_offset
    if index >= vocabulary_size:
        # Outside the truncated vocabulary. The word index is not ordered by
        # rank, so keep scanning instead of stopping at the first miss.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector



In [41]:
# Create the model on top of the pre-trained GloVe vectors
glove_cnn_layers = [
    # Embedding initialised from embeddings_matrix and frozen during training
    Embedding(vocabulary_size, 100, input_length=max_len,
              weights=[embeddings_matrix], trainable=False),
    Dropout(0.4),
    # Convolution over windows of kernel_size tokens
    Conv1D(filters, kernel_size, padding='valid', activation='relu'),
    # Max pooling between the two convolutions
    MaxPooling1D(),
    Conv1D(filters, kernel_size, padding='valid', activation='relu'),
    Flatten(),
    Dense(hidden_dims, activation='relu'),
    Dropout(0.4),
    # Output layer: single sigmoid unit (positive or negative review)
    Dense(1, activation='sigmoid'),
]

model = Sequential()
for layer in glove_cnn_layers:
    model.add(layer)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [42]:
# Fit the GloVe-based model; the test split doubles as validation data
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x650479590>