In [None]:
'''This example demonstrates the use of Convolution1D for text classification.
Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on Intel i5 2.4Ghz CPU.
10s/epoch on Tesla K40 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# Hyper-parameters shared by the cells below.

# -- data / vocabulary
max_features = 5000    # vocabulary size: num_words for imdb.load_data and input size of the Embedding
maxlen = 400           # target sequence length handed to pad_sequences and the Embedding

# -- model architecture
embedding_dims = 50    # width of each word vector
hidden_dims = 250      # units in the dense hidden layer
filters = 250          # number of Conv1D filters
kernel_size = 3        # Conv1D window, in tokens

# -- training
epochs = 2
batch_size = 32

In [None]:
print('Loading data...')
# Offset that load_data adds to every raw word rank. Ids below it are used
# for the special tokens registered further down. It is defined once so the
# dataset and the vocabulary rebuilt below cannot drift apart.
index_from = 3
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features,
                                                      index_from=index_from)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Rebuild the word <-> id tables with the same offset, then register the
# special tokens.
word_to_id = {word: rank + index_from
              for word, rank in imdb.get_word_index().items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {token_id: word for word, token_id in word_to_id.items()}

# Sanity check: decode the first training review back into text.
# (Loop variable is deliberately not called `id`, which would shadow the builtin.)
print(' '.join(id_to_word[token_id] for token_id in x_train[0]))

print('Pad sequences (samples x time)')
# Bring every review to the common length `maxlen`.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
print('Build model...')
model = Sequential([
    # Embedding: maps each vocabulary index to a vector of
    # embedding_dims dimensions.
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Dropout(0.2),

    # Conv1D: learns `filters` word-group filters, each spanning
    # `kernel_size` consecutive tokens.
    Conv1D(filters, kernel_size,
           strides=1,
           padding='valid',
           activation='relu'),
    # Max pooling over the whole sequence.
    GlobalMaxPooling1D(),

    # Vanilla hidden layer.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),

    # Single-unit output layer, squashed with a sigmoid.
    Dense(1),
    Activation('sigmoid'),
])

# To (re)generate the architecture diagram model.png, uncomment:
# from keras.utils import plot_model
# plot_model(model, to_file='model.png')

<img src='model.png' alt='Diagram of the CNN model architecture'>

In [None]:
# Binary classification set-up: cross-entropy loss, Adam optimizer,
# accuracy reported during training.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train on the training split; the test split is passed as validation data.
model.fit(x=x_train,
          y=y_train,
          validation_data=(x_test, y_test),
          epochs=epochs,
          batch_size=batch_size)

# Persist the trained weights to disk.
model.save_weights('cnn.h5')

In [None]:
import os

from sklearn.decomposition import PCA
import pandas as pd

# Prefer the longer-trained checkpoint when it is available; otherwise fall
# back to 'cnn.h5', the file the training cell of this notebook writes, so
# the notebook still runs top-to-bottom on a fresh checkout.
weights_path = 'cnn10epoch.h5' if os.path.exists('cnn10epoch.h5') else 'cnn.h5'
model.load_weights(weights_path)

# The Embedding layer is the first layer added to the model, so its matrix
# is the first weight array returned by get_weights().
vectors = model.get_weights()[0]

# Project the embedding vectors onto two whitened principal components.
pca = PCA(n_components=2, whiten=True)
vals = pca.fit_transform(vectors)

# One row per vocabulary id: 2-D coordinates plus the word as a label.
# Ids with no entry in id_to_word are labelled '<NA>'. The loop variable is
# not named `id`, which would shadow the builtin (and leak out of a list
# comprehension under Python 2).
df = pd.DataFrame({'x1': vals[:, 0], 'x2': vals[:, 1],
                   'words': [id_to_word.get(token_id, '<NA>')
                             for token_id in range(max_features)]})

In [None]:
import plotly.graph_objs as go
from plotly import offline as off

# Enable plotly's offline notebook rendering.
off.init_notebook_mode(connected=True)

# Figure spec: a single text-mode trace that draws each word at its
# 2-D PCA coordinates, plus axis labels and a title.
fig = dict(
    data=[
        dict(x=df['x1'], y=df['x2'], text=df['words'], mode='text'),
    ],
    layout=dict(
        xaxis=dict(title='Component 1'),
        yaxis=dict(title="Component 2"),
        title='CNN word embeddings',
    ),
)
off.iplot(fig, filename='cufflinks/multiple-scatter')