In [1]:
import os
# GPU selection; set here, ahead of the Keras/TensorFlow imports below.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"         # index of the GPU to use; can be changed to any of 0-3

import pickle
import keras
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda
from keras.layers.embeddings import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Input, Dense, Reshape, LSTM, GRU, Bidirectional, TimeDistributed
from keras.layers.merge import concatenate, add
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras import initializers
from keras.engine.topology import Layer
from keras import backend as K
import tensorflow as tf
import numpy as np
from util.util_functions import getWordIdx
from sklearn.metrics import classification_report, roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# loading processed data

In [2]:
# Load the padded train/test corpora, the training vocabulary, the embedding
# matrix and the labels from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# these files must come from a trusted preprocessing run.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed even when
    unpickling raises.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_copus_padded = _load_pickle('pickle_data/train_copus_pad.pickle')
test_copus_padded = _load_pickle('pickle_data/test_copus_pad.pickle')
vocab_to_int_train = _load_pickle('pickle_data/vocab_train.pickle')
embedding_matrix = _load_pickle('pickle_data/embedding_matrix')
train_label = _load_pickle('pickle_data/train_label.pickle')
test_label = _load_pickle('pickle_data/test_label.pickle')

In [3]:
# Derive the corpus dimensions first, then report everything in one go.
# Number of entries in the training vocabulary.
vocab_size = len(vocab_to_int_train)
# Axis 1 of the padded corpus indexes sentences within a review,
# axis 2 indexes words within a sentence.
MAX_SENTS = train_copus_padded.shape[1]
MAX_SENT_LENGTH = train_copus_padded.shape[2]

print('train test data shape:', train_copus_padded.shape, test_copus_padded.shape)
print('embedding_matrix shape:', embedding_matrix.shape)
print('vocabulary size:', vocab_size)
print('max sent number in a review:', MAX_SENTS, '\nmax words in a sentence:', MAX_SENT_LENGTH)

train test data shape: (25000, 36, 224) (25000, 36, 224)
embedding_matrix shape: (97162, 300)
vocabulary size: 97162
max sent number in a review: 36 
max words in a sentence: 224


### sentiment word filter construction

In [4]:
# Load the sentiment lexicon extracted from SentiWordNet.
# The `with` block closes the file even if unpickling fails.
# NOTE(review): only unpickle files that come from a trusted source.
with open('pickle_data/senti_lexicon.pickle', 'rb') as lexicon_file:
    senti_lexicon = pickle.load(lexicon_file)

In [5]:
# Map the sentiment words to integer ids through the training vocabulary.
# Each word is looked up exactly once; ids equal to 1 are dropped
# (presumably 1 is what getWordIdx returns for out-of-vocabulary words -- TODO confirm).
senti2int = [
    word_idx
    for word_idx in (getWordIdx(word, vocab_to_int_train) for word in senti_lexicon)
    if word_idx != 1
]

In [6]:
# Build the frozen Conv1D weights from the sentiment-word embeddings.
def Find_Filter_Weight(senti2int, embeddings=None):
    """Turn sentiment-word embeddings into weights for a width-1 Conv1D layer.

    Args:
        senti2int: sequence of integer row indices into the embedding matrix,
            one per sentiment word.
        embeddings: optional 2-D array-like of shape (vocab, embed_dim). When
            omitted, the notebook-level `embedding_matrix` is used.

    Returns:
        A two-element list `[kernel, bias]`. `kernel` has shape
        (1, embed_dim, len(senti2int)) with `kernel[0, :, k]` equal to the
        embedding row `senti2int[k]`; `bias` is a zero vector of length
        len(senti2int).
    """
    matrix = np.asarray(embedding_matrix if embeddings is None else embeddings)
    indices = np.asarray(senti2int, dtype=np.intp)
    # A single fancy-indexing gather replaces the per-word np.concatenate loop
    # (which re-copied the growing array on every step) and also yields a
    # well-formed (1, embed_dim, 0) kernel when no sentiment word is given.
    kernel = np.ascontiguousarray(matrix[indices].T[np.newaxis, :, :])
    bias = np.zeros(len(indices))
    return [kernel, bias]

In [7]:
# Build the [kernel, bias] pair used to initialise the frozen sentiment Conv1D layer.
# The trailing expression displays the kernel shape:
# (1, embedding dimension, number of sentiment words kept in senti2int).
CNN_weights = Find_Filter_Weight(senti2int)
CNN_weights[0].shape

(1, 300, 410)

# Building the Keras model

In [8]:
class AttLayer(Layer):
    """Additive attention pooling over the time axis of a 3-D tensor.

    For an input `x` of shape (batch, timesteps, features) the layer computes

        u_t = tanh(x_t W + b)
        a_t = exp(u_t . u) / sum_t' exp(u_t' . u)

    and returns sum_t a_t * x_t, which has shape (batch, features).

    Args:
        attention_dim: size of the hidden attention projection.
        **kwargs: standard Keras `Layer` keyword arguments (e.g. `name`).
    """

    def __init__(self, attention_dim, **kwargs):
        # Run the base constructor first: it assigns its own default to
        # `supports_masking`, so the flag must be set afterwards to stick.
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W (features, attention_dim), b (attention_dim,) and u (attention_dim, 1)."""
        assert len(input_shape) == 3
        # add_weight registers each variable as a trainable weight of the layer.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # call() sums the time axis away, so a per-timestep mask no longer
        # lines up with the (batch, features) output; do not propagate it.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # x: (batch, timesteps, features)
        # uit = tanh(xW + b) -> (batch, timesteps, attention_dim)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # Score every timestep against the context vector u -> (batch, timesteps)
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Zero the scores of masked timesteps; the cast keeps the dtype at floatx.
            ait *= K.cast(mask, K.floatx())
        # Normalise the scores; epsilon keeps the denominator non-zero.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Include `attention_dim` so the layer can be rebuilt from its config."""
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config

In [9]:
# hyper-parameters
# Units per direction of the bidirectional GRU.
gru_dim = 50
# Used for the GRU's input and recurrent dropout and for the sentence-encoder Dropout layer.
dropout_rate = 0.2
# Hidden size passed to AttLayer.
atten_dim = 100
# Only needed by the commented-out Dense layer in the document encoder cell.
# dense_dim = 30

# Batch size for both model.fit and model.predict.
batch_size = 100
# Number of one-epoch fit/evaluate rounds run by a training cell.
epoch_num = 10

# True  -> one-hot training labels, 2-unit softmax output, categorical cross-entropy.
# False -> raw training labels, 1-unit sigmoid output, binary cross-entropy.
categorical_label = True

if categorical_label:
    # One-hot encode the training labels. Test labels stay as class ids because
    # evaluation compares them with the argmax of the predictions.
    train_label_cat = np_utils.to_categorical(train_label)
#     test_label_cat = np_utils.to_categorical(test_label)

In [10]:
# Define the Keras layers used by the sentence and document encoders.
# Conv1D arguments use their Keras 2 names (filters / kernel_size / padding /
# strides) instead of the deprecated Keras 1 spellings (nb_filter /
# filter_length / border_mode / subsample_length), which only work through a
# legacy shim that emits a warning.

# Frozen embedding lookup initialised from the pre-trained embedding matrix.
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length=MAX_SENT_LENGTH,
                            weights=[embedding_matrix], trainable=False)

# Trainable convolution: 50 filters spanning 3 consecutive words.
cnn_layer1 = Convolution1D(filters=50,
                           kernel_size=3,
                           padding='same',
                           activation='relu',
                           strides=1)

# Frozen width-1 convolution with one filter per sentiment word; its kernel
# columns are the sentiment-word embeddings held in CNN_weights.
cnn_layer2 = Convolution1D(filters=CNN_weights[0].shape[2],
                           kernel_size=1,
                           padding='same',
                           activation='relu',
                           weights=CNN_weights,
                           trainable=False,
                           strides=1)

# Bidirectional GRU that returns the full sequence so attention can pool over it.
rnn_layer = Bidirectional(GRU(gru_dim, activation='tanh', dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True))
# Unidirectional alternative:
# rnn_layer = GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True)

# Weight-free max-over-time pooling, shared by both convolution branches.
max_pooling_layer = GlobalMaxPooling1D()

  if __name__ == '__main__':


In [11]:
# build sentence encoder model
# Maps one sentence (a fixed-length sequence of word ids) to a single vector by
# concatenating the max-pooled outputs of the two convolution branches.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')

sent_embedding = embedding_layer(sentence_input)  # (None, MAX_SENT_LENGTH) -> (None, MAX_SENT_LENGTH, embedding dim)

sent_cnn1 = cnn_layer1(sent_embedding) # output shape: (None, MAX_SENT_LENGTH, 50)
# max-over-time pooling: keep each filter's largest response across the sentence
sent_cnn1 = max_pooling_layer(sent_cnn1)  # output shape: (None, 50)

sent_cnn2 = cnn_layer2(sent_embedding) # output shape: (None, MAX_SENT_LENGTH, number of sentiment filters)
# same max-over-time pooling for the frozen sentiment-word filters
sent_cnn2 = max_pooling_layer(sent_cnn2)  # output shape: (None, number of sentiment filters)

# Join both feature sets along the last axis, then apply dropout.
sent_cnn = concatenate([sent_cnn1, sent_cnn2])
sent_cnn = Dropout(dropout_rate)(sent_cnn)

sentEncoder = Model(sentence_input, sent_cnn)
sentEncoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 224, 300)     29148600    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 224, 50)      45050       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 224, 410)     123410      embedding_1[0][0]                
__________________________________________________________________________________________________
global_max

In [12]:
# build document encoder model
# A review is a (MAX_SENTS, MAX_SENT_LENGTH) grid of word ids: every sentence is
# encoded by sentEncoder, the sentence vectors run through the bidirectional GRU,
# and AttLayer pools the GRU outputs into one review vector for the classifier.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)   # out shape: (None, MAX_SENTS, sentence vector size)

rnn_out = rnn_layer(review_encoder) # (batch_size, MAX_SENTS, 2 * gru_dim)

att_out = AttLayer(atten_dim)(rnn_out)  # (batch_size, 2 * gru_dim)
# att_out = Dropout(dropout_rate)(att_out)

# dense = Dense(dense_dim, activation='tanh')(att_out)
# dense = Dropout(dropout_rate)(dense)

# The output head and loss follow the label encoding chosen by categorical_label.
if categorical_label:
    preds = Dense(2, activation='softmax')(att_out) # categorical output
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
else:
    preds = Dense(1, activation='sigmoid')(att_out)  # single probability output
    model = Model(review_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 36, 224)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 36, 460)           29317060  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 36, 100)           153300    
_________________________________________________________________
att_layer_1 (AttLayer)       (None, 100)               10200     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 29,480,762
Trainable params: 208,752
Non-trainable params: 29,272,010
_________________________________________________________________


In [13]:
# Train one epoch at a time and score the test set after every epoch.
print('Training...')
# The label encoding is fixed for the whole run, so pick the targets once.
fit_targets = train_label_cat if categorical_label else train_label
for epoch in range(1, epoch_num + 1):
    print('Training for epoch {}/{}'.format(epoch, epoch_num))
    model.fit(train_copus_padded, fit_targets, batch_size=batch_size, epochs=1)

    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)
    if pred_test_prob.shape[-1] > 1:
        # Softmax head: the predicted class is the most probable column.
        pred_test = pred_test_prob.argmax(axis=-1)
    else:
        # Sigmoid head: threshold at 0.5 and flatten to a 1-D label vector.
        pred_test = (pred_test_prob > 0.5).astype('int32').reshape(pred_test_prob.shape[0])

    # Fraction of test reviews whose predicted label matches the true label.
    acc = np.sum(pred_test == test_label) / float(len(test_label))

    print("Accuracy: %.4f" % acc)
    print(classification_report(test_label, pred_test, digits=5, labels=[0, 1]))

Training...
Training for epoch 1/10
Epoch 1/1
Evaluating...
Accuracy: 0.8482
             precision    recall  f1-score   support

          0    0.82247   0.88800   0.85398     12500
          1    0.87830   0.80832   0.84186     12500

avg / total    0.85038   0.84816   0.84792     25000

Training for epoch 2/10
Epoch 1/1
Evaluating...
Accuracy: 0.8698
             precision    recall  f1-score   support

          0    0.90897   0.82200   0.86330     12500
          1    0.83754   0.91768   0.87578     12500

avg / total    0.87326   0.86984   0.86954     25000

Training for epoch 3/10
Epoch 1/1
Evaluating...
Accuracy: 0.8806
             precision    recall  f1-score   support

          0    0.90343   0.85240   0.87717     12500
          1    0.86029   0.90888   0.88392     12500

avg / total    0.88186   0.88064   0.88054     25000

Training for epoch 4/10
Epoch 1/1
Evaluating...
Accuracy: 0.8773
             precision    recall  f1-score   support

          0    0.83657   0.93

In [14]:
# Same train/evaluate loop as the previous cell. `model` is not rebuilt in
# between, so this run continues training from the current weights.
print('Training...')
# The label encoding is fixed for the whole run, so pick the targets once.
fit_targets = train_label_cat if categorical_label else train_label
for epoch in range(1, epoch_num + 1):
    print('Training for epoch {}/{}'.format(epoch, epoch_num))
    model.fit(train_copus_padded, fit_targets, batch_size=batch_size, epochs=1)

    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)
    if pred_test_prob.shape[-1] > 1:
        # Softmax head: the predicted class is the most probable column.
        pred_test = pred_test_prob.argmax(axis=-1)
    else:
        # Sigmoid head: threshold at 0.5 and flatten to a 1-D label vector.
        pred_test = (pred_test_prob > 0.5).astype('int32').reshape(pred_test_prob.shape[0])

    # Fraction of test reviews whose predicted label matches the true label.
    acc = np.sum(pred_test == test_label) / float(len(test_label))

    print("Accuracy: %.4f" % acc)
    print(classification_report(test_label, pred_test, digits=5, labels=[0, 1]))

Training...
Training for epoch 1/10
Epoch 1/1
Evaluating...
Accuracy: 0.8977
             precision    recall  f1-score   support

          0    0.86671   0.94000   0.90187     12500
          1    0.93446   0.85544   0.89320     12500

avg / total    0.90058   0.89772   0.89754     25000

Training for epoch 2/10
Epoch 1/1
Evaluating...
Accuracy: 0.9062
             precision    recall  f1-score   support

          0    0.89126   0.92520   0.90791     12500
          1    0.92224   0.88712   0.90434     12500

avg / total    0.90675   0.90616   0.90613     25000

Training for epoch 3/10
Epoch 1/1
Evaluating...
Accuracy: 0.9084
             precision    recall  f1-score   support

          0    0.90690   0.91024   0.90857     12500
          1    0.90991   0.90656   0.90823     12500

avg / total    0.90841   0.90840   0.90840     25000

Training for epoch 4/10
Epoch 1/1
Evaluating...
Accuracy: 0.9024
             precision    recall  f1-score   support

          0    0.93080   0.86