In [1]:
import os
# GPU selection. These variables are set before keras/tensorflow are imported
# below -- presumably so they take effect before the backend initialises CUDA
# (TODO confirm against the deployment environment).
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"         # index of the GPU to expose; per the original note this can be changed to any of 0-3

import pickle
import keras
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda
from keras.layers.embeddings import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Input, Dense, Reshape, LSTM, GRU, Bidirectional, TimeDistributed
from keras.layers.merge import concatenate, add
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras import initializers
from keras.engine.topology import Layer
from keras import backend as K
import tensorflow as tf
import numpy as np
from util.util_functions import getWordIdx
from sklearn.metrics import classification_report, roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Loading the processed data

In [2]:
# Load the pre-processed corpora, vocabulary, embedding matrix and labels from
# the pickle files under pickle_data/.
#
# Every file is opened in a `with` block so its handle is closed as soon as the
# object has been read (previously only the last handle was closed).
#
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project's own preprocessing step.
with open('pickle_data/train_copus_pad.pickle', 'rb') as file:
    train_copus_padded = pickle.load(file)

with open('pickle_data/test_copus_pad.pickle', 'rb') as file:
    test_copus_padded = pickle.load(file)

with open('pickle_data/vocab_train.pickle', 'rb') as file:
    vocab_to_int_train = pickle.load(file)

with open('pickle_data/embedding_matrix', 'rb') as file:
    embedding_matrix = pickle.load(file)

with open('pickle_data/train_label.pickle', 'rb') as file:
    train_label = pickle.load(file)

with open('pickle_data/test_label.pickle', 'rb') as file:
    test_label = pickle.load(file)

In [3]:
# Derive the size constants used by the model cells below.
# vocab_size: number of entries in vocab_to_int_train.
vocab_size = len(vocab_to_int_train)
# Axis 1 and axis 2 of the padded training corpus: sentences per review and
# words per sentence.
MAX_SENTS, MAX_SENT_LENGTH = train_copus_padded.shape[1:3]

# Summarise what was loaded.
print('train test data shape:', train_copus_padded.shape, test_copus_padded.shape)
print('embedding_matrix shape:', embedding_matrix.shape)
print('vocabulary size:', vocab_size)
print('max sent number in a review:', MAX_SENTS, '\nmax words in a sentence:', MAX_SENT_LENGTH)

train test data shape: (25000, 36, 224) (25000, 36, 224)
embedding_matrix shape: (97162, 300)
vocabulary size: 97162
max sent number in a review: 36 
max words in a sentence: 224


# Building the Keras model

In [4]:
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3-D tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes

        u_t = tanh(x_t . W + b)
        a_t = exp(u_t . u) / (sum_t exp(u_t . u) + epsilon)
        out = sum_t a_t * x_t

    and returns ``out`` with shape ``(batch, features)``.

    Args:
        attention_dim: width of the hidden projection ``W`` / ``b`` / ``u``.
        **kwargs: forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, attention_dim, **kwargs):
        # Run the base constructor first: it assigns its own defaults to the
        # layer's bookkeeping attributes (including `supports_masking`), so
        # anything set before this call would be overwritten.
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the trainable weights W, b and u for a 3-D input shape."""
        assert len(input_shape) == 3
        # add_weight registers each variable with the layer, so there is no
        # need to assign `trainable_weights` by hand.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return no mask: the time axis is summed away in ``call``.

        The incoming mask has one entry per time step, which no longer matches
        the ``(batch, features)`` output, so it must not be propagated.
        """
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: tensor of shape (batch_size, seq_len, features).
            mask: optional mask multiplied into the unnormalised weights
                (assumed to be (batch_size, seq_len) -- TODO confirm).

        Returns:
            Tensor of shape (batch_size, features).
        """
        # uit = tanh(xW + b)           -> (batch_size, seq_len, attention_dim)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # one score per time step      -> (batch_size, seq_len)
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # Normalise the weights; epsilon guards against a zero denominator
        # when every step is masked out.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        """Output keeps the batch and feature axes and drops the time axis."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = {'attention_dim': self.attention_dim}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [5]:
# hyper-parameters

# Layer widths: the GRU, the attention projection and the (currently unused)
# dense layer all share the same size.
gru_dim = atten_dim = dense_dim = 50
dropout_rate = 0.3

# Training schedule.
batch_size, epoch_num = 100, 10

# True  -> train against one-hot labels with a 2-way softmax output.
# False -> train against the raw labels with a single sigmoid output.
categorical_label = True

if categorical_label:
    # Only the training labels are one-hot encoded; evaluation compares the
    # predicted class index against the raw test labels.
    train_label_cat = np_utils.to_categorical(train_label)

In [6]:
# define some Keras layers

# Word-embedding lookup initialised from embedding_matrix and kept frozen
# (trainable=False).
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length=MAX_SENT_LENGTH,
                            weights=[embedding_matrix], trainable=False)

# 1-D convolution with 100 filters of width 3.
# Uses the Keras 2 argument names; the Keras 1 spellings (nb_filter,
# filter_length, border_mode, subsample_length) are deprecated and emit a
# UserWarning.
cnn_layer1 = Convolution1D(filters=100,
                           kernel_size=3,
                           padding='same',
                           activation='tanh',
                           strides=1)

# Bidirectional GRU that returns the full sequence of hidden states
# (return_sequences=True), so its output is still 3-D.
rnn_layer = Bidirectional(GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True))
# Alternative (disabled): unidirectional GRU returning only the last state.
# rnn_layer = GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=False)

# Max over the time axis.
max_pooling_layer = GlobalMaxPooling1D()

  if __name__ == '__main__':


In [7]:
# build sentence encoder model
# Maps one padded sentence (a vector of MAX_SENT_LENGTH word indices) to a
# single attention-pooled vector. Shapes below follow the printed summary.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')

sent_embedding = embedding_layer(sentence_input)  # input shape: (None, MAX_SENT_LENGTH); output shape: (None, MAX_SENT_LENGTH, embedding dim)

sent_rnn = rnn_layer(sent_embedding) # output shape: (None, MAX_SENT_LENGTH, gru_dim*2) -- full sequence, since return_sequences=True

att_out = AttLayer(atten_dim)(sent_rnn)  # output shape: (None, gru_dim*2) -- time axis pooled away
# att_out = Dropout(dropout_rate)(att_out)

# Wrapped as its own Model so it can be applied to every sentence of a review.
sentEncoder = Model(sentence_input, att_out)
sentEncoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 224, 300)          29148600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 224, 100)          105300    
_________________________________________________________________
att_layer_1 (AttLayer)       (None, 100)               5100      
Total params: 29,259,000
Trainable params: 110,400
Non-trainable params: 29,148,600
_________________________________________________________________


In [8]:
# build document encoder model
# A review is a (MAX_SENTS, MAX_SENT_LENGTH) grid of word indices. Each row is
# encoded by sentEncoder, then a 1-D convolution and a global max-pool over the
# sentence axis produce one vector per review, which feeds the classifier.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)   # out shape: (None, MAX_SENTS, gru_dim*2)

cnn_out = cnn_layer1(review_encoder) # (batch_size, MAX_SENTS, number of conv filters)
cnn_out = max_pooling_layer(cnn_out)  # output shape: (batch_size, number of conv filters)


# Optional hidden dense layer (disabled):
# dense = Dense(dense_dim, activation='tanh')(cnn_out)
# dense = Dropout(dropout_rate)(dense)

# The output head and loss must match the label encoding chosen by
# categorical_label.
if categorical_label:
    preds = Dense(2, activation='softmax')(cnn_out) # categorical output
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
else:
    preds = Dense(1, activation='sigmoid')(cnn_out) # single probability output
    model = Model(review_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 36, 224)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 36, 100)           29259000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 36, 100)           30100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 29,289,302
Trainable params: 140,702
Non-trainable params: 29,148,600
_________________________________________________________________


In [None]:
print('Training...')
# The label encoding is fixed for the whole run, so choose it once up front.
fit_labels = train_label_cat if categorical_label else train_label

for epoch in range(1, epoch_num + 1):
    print('Training for epoch {}/{}'.format(epoch, epoch_num))
    # Train one epoch at a time so the test set can be scored after each one.
    model.fit(train_copus_padded, fit_labels, batch_size=batch_size, epochs=1)

    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)

    # Turn probabilities into class labels.
    if pred_test_prob.shape[-1] <= 1:
        # Single sigmoid unit: threshold at 0.5 and flatten to one label per row.
        pred_test = (pred_test_prob > 0.5).astype('int32')
        pred_test = pred_test.reshape(len(pred_test))
    else:
        # One column per class: take the most probable class.
        pred_test = np.argmax(pred_test_prob, axis=-1)

    n_correct = np.sum(pred_test == test_label)
    acc = n_correct / float(len(test_label))

    print("Accuracy: %.4f" % acc)
    print(classification_report(test_label, pred_test, digits=4, labels=[0, 1]))

Training...
Training for epoch 1/10
Epoch 1/1
Evaluating...
Accuracy: 0.8693
             precision    recall  f1-score   support

          0     0.8457    0.9035    0.8736     12500
          1     0.8964    0.8351    0.8647     12500

avg / total     0.8711    0.8693    0.8692     25000

Training for epoch 2/10
Epoch 1/1
Evaluating...
Accuracy: 0.8820
             precision    recall  f1-score   support

          0     0.9141    0.8434    0.8773     12500
          1     0.8546    0.9207    0.8864     12500

avg / total     0.8843    0.8820    0.8819     25000

Training for epoch 3/10
Epoch 1/1
Evaluating...
Accuracy: 0.8805
             precision    recall  f1-score   support

          0     0.8394    0.9411    0.8873     12500
          1     0.9330    0.8199    0.8728     12500

avg / total     0.8862    0.8805    0.8801     25000

Training for epoch 4/10
Epoch 1/1
Evaluating...
Accuracy: 0.8959
             precision    recall  f1-score   support

          0     0.8806    0.9

In [None]:
print('Training...')
# This cell does not rebuild `model`; it runs epoch_num further epochs on
# whatever state the model is already in.
targets = train_label_cat if categorical_label else train_label

for epoch_no in range(1, epoch_num + 1):
    print('Training for epoch {}/{}'.format(epoch_no, epoch_num))
    # A single epoch per fit() call, followed by a full test-set evaluation.
    model.fit(train_copus_padded, targets, batch_size=batch_size, epochs=1)

    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)

    # Convert the predicted probabilities to hard class labels.
    multi_column = pred_test_prob.shape[-1] > 1
    if multi_column:
        pred_test = np.argmax(pred_test_prob, axis=-1)
    else:
        # Sigmoid output: threshold at 0.5, then flatten to one label per row.
        pred_test = (pred_test_prob > 0.5).astype('int32')
        pred_test = pred_test.reshape(len(pred_test))

    hits = np.sum(pred_test == test_label)
    acc = hits / float(len(test_label))

    print("Accuracy: %.4f" % acc)
    print(classification_report(test_label, pred_test, digits=4, labels=[0, 1]))

Training...
Training for epoch 1/10
Epoch 1/1
Evaluating...
Accuracy: 0.8996
             precision    recall  f1-score   support

          0     0.8868    0.9161    0.9012     12500
          1     0.9132    0.8831    0.8979     12500

avg / total     0.9000    0.8996    0.8996     25000

Training for epoch 2/10
Epoch 1/1
 3900/25000 [===>..........................] - ETA: 4:56 - loss: 0.0826 - acc: 0.9741