In [1]:
from theano.sandbox import cuda
cuda.use('gpu3')

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline

import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [3]:
# Directory that trained weights are written to later in the notebook;
# create it (and any missing parents) up front.
model_path = './models/'
%mkdir -p $model_path

## Setup data

In [5]:
from keras.datasets import imdb
word_index = imdb.get_word_index()

In [6]:
# Vocabulary words ordered by the integer id that word_index assigns them
# (lowest id first).
idx_arr = sorted(word_index, key=word_index.get)

In [8]:
# Reverse lookup table: integer id -> word.
idx2word = dict((word_id, word) for word, word_id in word_index.iteritems())

In [9]:
# Fetch the full IMDB pickle. The first argument is the local cache file
# name; keep it in sync with the file name in the origin URL.
path = get_file(
    'imdb_full.pkl',
    origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
    md5_hash='d091312047c43cf9e4e39fef92437263')

A local file was found, but it seems to be incomplete or outdated.
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl

In [10]:
# NOTE(review): unpickling can execute arbitrary code, so only load files
# whose origin you trust.
# Use a context manager so the file handle is closed once loading finishes.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [11]:
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [12]:
idx2word[23002]

'breadth'

In [13]:
' '.join(idx2word[o] for o in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [14]:
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [15]:
# Truncate the vocab down to 5000
vocab_size = 5000

In [16]:
# Every id at or above vocab_size - 1 collapses onto the single id
# vocab_size - 1; smaller ids are kept as they are.
trn = [np.array([min(word_id, vocab_size - 1) for word_id in review])
       for review in x_train]
test = [np.array([min(word_id, vocab_size - 1) for word_id in review])
        for review in x_test]

Look at the distribution of lengths of sequences.

In [17]:
# Number of word ids in each training review, then (longest, shortest, mean).
lens = np.array([len(review) for review in trn])
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

Pad or truncate every review to a fixed length. The length is set to 500 tokens, roughly twice the mean review length.

In [18]:
# Fixed number of word ids kept per review.
seq_len = 500

# Bring every review to exactly seq_len entries, using 0 as the fill value
# for reviews that are shorter than that.
# NOTE(review): which end gets padded/truncated depends on pad_sequences'
# defaults -- confirm against the Keras version in use.
trn = sequence.pad_sequences(trn,  maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [19]:
trn.shape

(25000, 500)

## Create simple models

### Single hidden layer NN

In [20]:
# Baseline network: embed each word id, flatten the whole sequence into one
# vector, then a single hidden dense layer feeding a sigmoid output.
model = Sequential([
    # One 32-dimensional vector per word id, for seq_len ids per review.
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid')])

In [21]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [22]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 16000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           1600100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [23]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb6a4567b90>

### Single conv layer with max pooling

In [24]:
# Same idea as the dense baseline, but with a 1D convolution + max pooling
# between the embedding and the dense head.
conv1 = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    # A review is a 1D sequence of word vectors, so convolve along that axis:
    # 64 filters, each looking at 5 consecutive words; 'same' border mode
    # keeps the sequence length unchanged.
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid')])

In [25]:
conv1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 500, 32)       160000      embedding_input_2[0][0]          
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 500, 32)       0           embedding_2[0][0]                
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 500, 64)       10304       dropout_2[0][0]                  
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 500, 64)       0           convolution1d_1[0][0]            
___________________________________________________________________________________________

In [26]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [27]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb69e74b250>

In [28]:
conv1.save_weights(model_path + 'conv1.h5')

In [29]:
conv1.load_weights(model_path + 'conv1.h5')

## Pretrained vectors

Below we replicate the previous CNN, using pre-trained embeddings.

In [30]:
def get_glove_dataset(dataset):
    """
    Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    dataset -- name of the GloVe variant, e.g. '6B.50d'. Names that are
               not listed in the md5 table below are downloaded without
               a checksum check.

    Returns whatever get_file returns for the untarred download.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Create the cache directory with plain Python instead of the %mkdir
    # magic: the magic only exists under IPython and breaks on paths that
    # contain spaces.
    if not os.path.isdir(glove_path):
        os.makedirs(glove_path)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [31]:
def load_vectors(loc):
    """
    Load a pre-processed word-vector set stored under the prefix `loc`.

    Reads three files: `loc + '.dat'` (via load_array), `loc + '_words.pkl'`
    and `loc + '_idx.pkl'` (both via pickle).

    Returns a 3-tuple (array, words object, index object) in that order.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files whose origin you trust.
    """
    vectors = load_array(loc + '.dat')
    # Context managers make sure both pickle files are closed after reading.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_to_idx = pickle.load(idx_file)
    return (vectors, words, word_to_idx)

In [39]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

Untaring file...


Since GloVe and IMDB assign different ids to the same word, we define a function that builds an embedding matrix indexed by the IMDB word ids and filled with the matching GloVe vectors (where they exist).

In [40]:
def create_emb():
    """
    Build the (vocab_size, n_fact) embedding matrix for the IMDB vocabulary.

    For every id i from 1 upwards, row i receives the GloVe vector of the
    IMDB word with that id when the word looks like a plain token and is
    present in the GloVe index; otherwise the row is drawn from a normal
    distribution. The last row (the shared "rare word" id) is always
    random. The loop never writes row 0. The whole matrix is divided by 3
    before it is returned.

    Reads the notebook globals vecs, wordidx, idx2word and vocab_size.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1, len(emb)):
        word = idx2word[i]
        # Passing the regex does not guarantee that GloVe knows the word, so
        # also test membership; otherwise wordidx[word] raises KeyError
        # instead of falling back to the random initialisation below.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [41]:
emb = create_emb()

Pass the embedding matrix to the `Embedding` constructor and mark the layer as non-trainable.

In [42]:
# Same architecture as the single-conv model, but the embedding layer is
# 50-dimensional, starts from the pre-built matrix `emb`, and is frozen.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2, 
              weights=[emb], trainable=False),
    Dropout(0.25),
    # 64 filters, each looking at 5 consecutive words.
    Convolution1D(64, 5, border_mode='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid')])

In [43]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [44]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb6936bad50>

The class notes claim that this beats the previous model, but I'm not seeing that here. Next, let's see how fine-tuning the embedding goes.

In [45]:
# Unfreeze the embedding layer. A change to `trainable` is only picked up
# when the model is compiled, so recompile (reusing the existing optimizer
# instance) before training again.
model.layers[0].trainable = True
model.compile(loss='binary_crossentropy', optimizer=model.optimizer, metrics=['accuracy'])

In [46]:
# Assigning a plain float to `optimizer.lr` only rebinds the attribute; a
# training function that has already been built keeps reading the original
# backend variable. Update that variable's value in place instead.
from keras import backend as K
K.set_value(model.optimizer.lr, 1e-4)

In [47]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fb68bfb85d0>

In [48]:
model.save_weights(model_path + 'glove50.h5')

## Multi-size CNN

An implementation of a "multi-size" CNN.

In [49]:
from keras.layers import Merge

Use the functional API to create multiple conv layers of different sizes, and then concatenate them.

In [50]:
# The sub-graph is fed the embedding output, i.e. seq_len timesteps of
# 50-dimensional word vectors, so declare the input with that shape
# (not vocab_size timesteps).
graph_in = Input((seq_len, 50))
convs = []
# One conv -> max-pool -> flatten branch per filter width (3, 4 and 5 words).
for fsz in range(3, 6):
    x = Convolution1D(64, fsz, border_mode='same', activation="relu")(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
# Concatenate the flattened branches into a single feature vector.
out = Merge(mode="concat")(convs)
graph = Model(graph_in, out)

In [51]:
emb = create_emb()

Replace the conv/max-pool layer in our original CNN with the concatenated conv layers.

In [52]:
# Same stack as the GloVe CNN, with the multi-size conv graph taking the
# place of the single conv/max-pool/flatten section. Layers are appended
# one at a time, in the same order as before.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2, weights=[emb]))
model.add(Dropout(0.2))
model.add(graph)
model.add(Dropout(0.5))
model.add(Dense(100, activation="relu"))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [53]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [54]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb68353c550>

## LSTM

Coming up in the next lesson.

In [55]:
# Recurrent model: embedding -> single LSTM layer -> sigmoid output.
model = Sequential([
    # NOTE(review): mask_zero=True presumably makes id 0 (the padding value
    # used earlier) be ignored by the LSTM -- confirm against the Keras docs.
    Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True,
              W_regularizer=l2(1e-6), dropout=0.2),
    LSTM(100, consume_less='gpu'),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 500, 32)       160000      embedding_input_6[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           53200       embedding_6[0][0]                
____________________________________________________________________________________________________
dense_11 (Dense)                 (None, 1)             101         lstm_1[0][0]                     
Total params: 213301
____________________________________________________________________________________________________


In [56]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=5, batch_size=64)

/home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/tmp3ihEsS/mod.cpp:788:1: internal compiler error: Segmentation fault
 }
 ^
Please submit a full bug report,
with preprocessed source if appropriate.
See <file:///usr/share/doc/gcc-5/README.Bugs> for instructions.



00001	#include <Python.h>
00002	#include <iostream>
00003	#include "theano_mod_helper.h"
00004	#include <math.h>
00005	#include <numpy/arrayobject.h>
00006	#include <numpy/arrayscalars.h>
00007	#include <vector>
00008	#include <algorithm>
00009	//////////////////////
00010	////  Support Code
00011	//////////////////////
00012	
00013	#define THEANO_MACRO_MOD(x,y) (x % y)
00014	
00015	    namespace {
00016	    struct __struct_compiled_op_109774dadea06f52449f19787893597b {
00017	        PyObject* __ERROR;
00018	
00019	        PyObject* storage_V3;
00020	PyObject* storage_V5;
00021	PyObject* storage_V7;
00022	PyObject* storage_V9;
00023	PyObject* storage_V1;
00024	        
00025	
00026	        __struct_compiled_op_109774dadea06f52449f19787893597b() {
00027	            // This is only somewhat safe because we:
00028	            //  1) Are not a virtual class
00029	            //  2) Do not use any virtual classes in the members
00030	            //  3) Deal with mostly POD and pointers
0003

Exception: ('The following error happened while compiling the node', Elemwise{Composite{Switch(EQ(i0, i1), ((i2 * i0) // (i3 * i0)), i0)}}[(0, 0)](Elemwise{Composite{Switch(EQ(i0, i1), i2, i0)}}[(0, 0)].0, TensorConstant{-1}, TensorConstant{100}, TensorConstant{-100}), '\n', 'Compilation failed (return status=1): /home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/tmp3ihEsS/mod.cpp:788:1: internal compiler error: Segmentation fault.  }.  ^. Please submit a full bug report,. with preprocessed source if appropriate.. See <file:///usr/share/doc/gcc-5/README.Bugs> for instructions.. ', '[Elemwise{Composite{Switch(EQ(i0, i1), ((i2 * i0) // (i3 * i0)), i0)}}[(0, 0)](<TensorType(int64, scalar)>, TensorConstant{-1}, TensorConstant{100}, TensorConstant{-100})]')