In [0]:
!git status -s

In [0]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape, Flatten, Merge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.convolutional import Convolution2D, Convolution1D, MaxPooling2D, MaxPooling1D
from keras.datasets import imdb, reuters

In [0]:
def import_path(fullpath):
    """
    Import a Python source file given its full path and return the module.

    The directory containing the file is placed at the front of ``sys.path``
    for the duration of the import, so the requested file wins over any
    same-named module elsewhere on the path. ``sys.path`` is restored
    afterwards even when the import raises.

    If a module with that base name was already loaded, it is reloaded so
    that edits made on disk since the previous import are picked up. A first
    import is not reloaded, which avoids executing the module body twice.

    Args:
        fullpath: Path to the ``.py`` file to import.

    Returns:
        The imported (and, if previously loaded, reloaded) module object.

    Raises:
        ImportError: If no module with the file's base name can be found.
        Exception: Anything raised while executing the module body
            propagates to the caller.
    """
    import os, sys

    # ``reload`` is a builtin on Python 2 only; fall back to importlib on 3.x.
    try:
        reload_module = reload
    except NameError:
        from importlib import reload as reload_module

    path, filename = os.path.split(fullpath)
    module_name, _ext = os.path.splitext(filename)

    was_loaded = module_name in sys.modules
    sys.path.insert(0, path)
    try:
        module = __import__(module_name)
        if was_loaded:
            reload_module(module)  # Might be out of date
    finally:
        # Remove exactly the entry added above. The imported module may have
        # changed sys.path itself, so do not assume a fixed position.
        try:
            sys.path.remove(path)
        except ValueError:
            pass
    return module

In [0]:
#imdb_load = import_path('/data/imports/imdb_load.py')
# Import the project-local loader module straight from its file path, then
# bind its `load_character_encoded_data` callable to a short top-level name.
imdb_load = import_path('/data/notebooks/imports/imdb_load_dataframe.py')
load_character_encoded_data = getattr(imdb_load, 'load_character_encoded_data')

In [0]:
'''
    Train a CNN on the IMDB sentiment classification task.

    GPU command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_lstm.py
'''

In [0]:
# ---- hyperparameters --------------------------------------------------------

# training schedule
nb_epoch = 10
batch_size = 128

# input encoding
max_features = 20000
# Sequence length of every encoded text; the 4D reshape further down uses the
# same 1014. NOTE(review): the data comes from `load_character_encoded_data`,
# so this is presumably a count of characters rather than words -- confirm.
maxlen = 1014
# Size of the last axis of the encoded input (see the 4D reshape).
embedding_size = 67

# network dimensions
nb_classes = 1
nb_feature_maps = 32
fully_connected_size = 512

# convolution kernel shape (rows x cols) for the 2D model
filter_size_row = 1
filter_size_col = 3

In [0]:
# Fetch both splits from the loader; each split unpacks into (inputs, labels).
print("Loading data...")
train_split, test_split = load_character_encoded_data()
X_train, Y_train = train_split
X_test, Y_test = test_split

In [0]:
# Report the shape of every loaded array, inputs first and then labels.
for shape_label, array in (
    ('X_train shape:', X_train),
    ('X_test shape:', X_test),
    ('Y_train shape:', Y_train),
    ('Y_test shape:', Y_test),
):
    print(shape_label, array.shape)

In [0]:
import pickle

In [0]:
# Serialize the training inputs to a pickle file on disk.
with open('/data/pickles/imdb_x_train.pkl', 'wb') as pickle_file:
    pickle.dump(X_train, pickle_file)

In [0]:
# Serialize the test inputs to a pickle file on disk.
with open('/data/pickles/imdb_x_test.pkl', 'wb') as pickle_file:
    pickle.dump(X_test, pickle_file)

In [0]:
# Serialize the training labels to a pickle file on disk.
with open('/data/pickles/imdb_y_train.pkl', 'wb') as pickle_file:
    pickle.dump(Y_train, pickle_file)

In [0]:
# Serialize the test labels to a pickle file on disk.
with open('/data/pickles/imdb_y_test.pkl', 'wb') as pickle_file:
    pickle.dump(Y_test, pickle_file)

In [0]:
# Read the training inputs back from the pickle file. Only X_train is
# reloaded here; X_test, Y_train and Y_test still come from the loader cell.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('/data/pickles/imdb_x_train.pkl', 'rb') as pickle_file:
    X_train = pickle.load(pickle_file)

In [0]:
# Build float32 4D copies of the inputs with an explicit singleton axis:
# (samples, 1, maxlen, embedding_size). These feed the Convolution2D model.
# The sequence length comes from `maxlen` rather than a repeated literal, so
# the reshape stays in step with the hyperparameter cell.
X_train_4d = X_train.reshape(X_train.shape[0], 1, maxlen, embedding_size).astype("float32")
X_test_4d = X_test.reshape(X_test.shape[0], 1, maxlen, embedding_size).astype("float32")
print(X_train_4d.shape)

In [0]:
#print('Build model...')
#
## initialize the neural net and reshape the data
#model = Sequential()
#
#model.add(Convolution2D(32, 1, 3, 3, border_mode='full'))
#model.add(Activation('relu'))
#model.add(Convolution2D(32, 32, 3, 3))
#model.add(Activation('relu'))
#model.add(MaxPooling2D(poolsize=(2, 2)))
#model.add(Dropout(0.25))
#
#model.add(Flatten())
#model.add(Dense(8*1*1014*70, 128))
#model.add(Activation('relu'))
#model.add(Dropout(0.5))
#
#model.add(Dense(128, nb_classes))
#model.add(Activation('sigmoid'))

In [0]:
# Build a Sequential model: six Convolution1D layers (three of them followed
# by max-pooling), a Flatten, then three Dense layers ending in a sigmoid.
#
# Layer widths of the dense head: flattened input size, two hidden widths,
# and a single output unit.
# NOTE(review): 8704 has to match the flattened output of the convolution
# stack below. It looks like 34 steps * 256 filters for a 1014-step input
# with unpadded convolutions -- TODO confirm against the Keras version in use.
fully_connected = [8704,1024,1024,1]

print('Build model...')
model = Sequential()

# NOTE(review): the positional arguments appear to follow the legacy
# Convolution1D(input_dim, nb_filter, filter_length) signature -- TODO confirm.
# NOTE(review): no Activation layer follows any convolution in this cell;
# verify whether the layers apply a nonlinearity by default or whether the
# activations were left out by mistake.
print("Going into first layer")
model.add(Convolution1D(embedding_size,256,7))
model.add(MaxPooling1D(pool_length=3))

print("Going into second layer")
model.add(Convolution1D(256,256,7))
model.add(MaxPooling1D(pool_length=3))

print("Going into third layer")
model.add(Convolution1D(256,256,3))

print("Going into fourth layer")
model.add(Convolution1D(256,256,3))

print("Going into fifth layer")
model.add(Convolution1D(256,256,3))

print("Going into sixth layer")
model.add(Convolution1D(256,256,3))
model.add(MaxPooling1D(pool_length=3))

# Collapse the convolution output into one vector per sample for the dense head.
model.add(Flatten())

print("Going into fully Connected layer")
# Fully connected layers. Dropout is applied before the ReLU in both hidden
# blocks, and the two dropout rates differ (0.25 vs 0.5).
model.add(Dense(fully_connected[0], fully_connected[1]))
model.add(Dropout(0.25))
model.add(Activation('relu'))

model.add(Dense(fully_connected[1], fully_connected[2]))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# Single sigmoid output unit.
model.add(Dense(fully_connected[2], fully_connected[3]))
model.add(Activation('sigmoid'))

In [0]:
# Percentage of the training labels that are positive (assumes labels are
# 0/1 -- confirm against the loader). The float literal forces true division:
# this notebook does not import `division` from __future__, so integer labels
# would otherwise be floor-divided on Python 2. The dataset is IMDB reviews,
# so the message says "reviews" rather than "tweets".
print( "Training proportion of positive reviews: {}%".format(100.0 * Y_train.sum() / len(Y_train)) )

In [0]:
# Compile with binary cross-entropy and RMSprop. Other optimizers or
# optimizer configurations can be swapped in here.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    class_mode="binary",
)

In [0]:
# Fit on the training split and validate on the test split after each epoch.
print("Train...")
model.fit(
    X_train, Y_train,
    validation_data=(X_test, Y_test),
    nb_epoch=nb_epoch,
    batch_size=batch_size,
    show_accuracy=True,
    verbose=1,
)

In [0]:
# Evaluate on the test split; with show_accuracy=True the result unpacks
# into (score, accuracy).
print("Score...")
evaluation = model.evaluate(
    X_test, Y_test,
    show_accuracy=True,
    batch_size=batch_size,
)
score, acc = evaluation

print('Test score:', score)
print('Test accuracy:', acc)

In [0]:
# Build the Convolution2D variant of the classifier. It is trained on the 4D
# arrays (samples, 1, maxlen, embedding_size) produced by the reshape cell.
model = Sequential()
#model.add(Embedding(max_features, embedding_size)) # embed into dense 3D float tensor (samples, maxlen, 256)
#model.add(Reshape(1, maxlen, embedding_size)) # reshape into 4D tensor (samples, 1, maxlen, 256)

# Number of channels in the input arrays (the singleton axis added by the 4D
# reshape). This used to be passed as `nb_classes`, which only worked because
# both values happen to be 1.
input_channels = 1

# convolution stack
# NOTE(review): positional arguments appear to follow the legacy
# Convolution2D(nb_filter, stack_size, nb_row, nb_col) signature -- confirm.
model.add(Convolution2D(nb_feature_maps, input_channels, filter_size_row, filter_size_col, border_mode='full'))
model.add(Activation('relu'))

# convolution stack with regularization
model.add(Convolution2D(nb_feature_maps, nb_feature_maps, filter_size_row, filter_size_col, border_mode='full'))
model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2))) # first 2x2 pooling
model.add(Dropout(0.25))

# convolution stack with regularization
model.add(Convolution2D(nb_feature_maps, nb_feature_maps, filter_size_row, filter_size_col))
model.add(Activation('relu'))
model.add(MaxPooling2D(poolsize=(2, 2))) # second 2x2 pooling
model.add(Dropout(0.25))

# fully-connected layer
model.add(Flatten())
# Each `// 2` mirrors one of the two 2x2 poolings. Floor division keeps the
# size an int under both Python 2 and true division (the original `/` would
# produce a float there).
# NOTE(review): the formula ignores the width added by border_mode='full' and
# removed by the unpadded convolution. It seems to land on the right size for
# the current settings only after flooring -- recheck if embedding_size or the
# filter sizes change.
flattened_size = nb_feature_maps * (maxlen // 2 // 2) * (embedding_size // 2 // 2)
model.add(Dense(flattened_size, fully_connected_size))
model.add(Activation("relu"))
model.add(Dropout(0.50))

# output classifier
model.add(Dense(fully_connected_size, nb_classes))
model.add(Activation("sigmoid"))

In [0]:
# Compile with binary cross-entropy and RMSprop. Other optimizers or
# optimizer configurations can be swapped in here.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    class_mode="binary",
)

In [0]:
# Fit on the 4D training arrays and validate on the 4D test arrays each
# epoch. This call uses a literal batch size of 32, not `batch_size`.
model.fit(
    X_train_4d, Y_train,
    validation_data=(X_test_4d, Y_test),
    nb_epoch=nb_epoch,
    batch_size=32,
    show_accuracy=True,
    verbose=1,
)

In [0]:
# Evaluate on the 4D test arrays; with show_accuracy=True the result unpacks
# into (score, accuracy).
print("Score...")
evaluation = model.evaluate(
    X_test_4d, Y_test,
    show_accuracy=True,
    batch_size=batch_size,
)
score, acc = evaluation

print('Test score:', score)
print('Test accuracy:', acc)