In [2]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

#os.environ['KERAS_BACKEND']='tensorflow'
os.environ['KERAS_BACKEND']='theano'
os.environ['THEANO_FLAGS']='device=cuda0,floatX=float32,gpuarray.preallocate=0.3'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations

In [2]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
tf.device('/gpu:0')
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

data = pd.read_csv('labeledTrainData-sample1000.tsv', sep='\t')
counts = data['sentiment'].value_counts()
data_train = data[data['sentiment'].isin(counts[counts >= 20].index)]
print(data_train)
print(data_train.shape)

texts = []
labels = []

for idx, row in data_train.iterrows():
    text = BeautifulSoup(data_train.review[idx])
    texts.append(clean_str(text.get_text().encode('ascii','ignore')))
    labels.append(data_train.sentiment[idx])
    

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

           id  sentiment                                             review
0      5814_8          1  With all this stuff going down at the moment w...
1      2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2      7759_3          0  The film starts with a manager (Nicholas Bell)...
3      3630_4          0  It must be assumed that those who praised this...
4      9495_8          1  Superbly trashy and wondrously unpretentious 8...
5      8196_8          1  I dont know why people think this is such a ba...
6      7166_2          0  This movie could have been very good, but come...
7     10633_1          0  I watched this video at a friend's house. I'm ...
8       319_1          0  A friend of mine bought this film for £1, and ...
9     8713_10          1  <br /><br />This movie is full of references. ...
10     2486_3          0  What happens when an army of wetbacks, towelhe...
11    6811_10          1  Although I generally do not like remakes belie...
12    11744_



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Found 18949 unique tokens.
('Shape of data tensor:', (999, 1000))
('Shape of label tensor:', (999, 2))


In [3]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Traing and validation set number of positive and negative reviews')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

GLOVE_DIR = "glove/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.840B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Traing and validation set number of positive and negative reviews
[404. 396.]
[113.  86.]
Total 2196016 word vectors.


In [4]:
# Attention GRU network		  
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))
        
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [6]:
from keras.callbacks import ModelCheckpoint

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)



sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_gru = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer()(l_gru)
preds = Dense(2, activation='softmax')(l_att)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - attention GRU network")
model.summary()
filepath="weights-imdb-bigruatt-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=50, callbacks=callbacks_list)


model fitting - attention GRU network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1000, 300)     5685000     input_2[0][0]                    
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 1000, 600)     1081800     embedding_2[0][0]                
____________________________________________________________________________________________________
attlayer_2 (AttLayer)            (None, 600)           600         bidirectional_2[0][0]            
_____________________________________________________

<keras.callbacks.History at 0x7f09ed88f0d0>