In [1]:
from __future__ import division

import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from nltk import tokenize
#### additional
import pickle
from args import get_parser
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import tensorflow as tf
from tensorboardX import SummaryWriter
from sys import exit
writer = SummaryWriter()
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
class AttLayer(Layer):
    """Attention pooling that collapses the step axis of a 3-D input.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer scores
    every step with ``exp(tanh(x W + b) . u)``, zeroes the scores of masked
    steps, normalises the scores over the step axis and returns the
    score-weighted sum of ``x``, shaped ``(batch, features)``.

    Args:
        attention_dim: width of the hidden projection ``W`` used for scoring.
        **kwargs: forwarded unchanged to the ``Layer`` constructor.
    """

    def __init__(self, attention_dim, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x attention_dim), b (attention_dim,) and u (attention_dim x 1)."""
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim, )))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return no mask for the output.

        call() sums the step axis away, so a per-step input mask no longer
        lines up with the (batch, features) output and must not be passed on.
        """
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its step axis."""
        # size of x : [batch_size, steps, features]
        # size of W : [features, attention_dim]; b : [attention_dim]; u : [attention_dim, 1]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # epsilon keeps the division finite when every step of a row is masked
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        """The step axis is removed: (batch, steps, features) -> (batch, features)."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Include ``attention_dim`` so the layer can be rebuilt from its config."""
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config
def save_pickle(filename, obj, overwrite = False):
    """Pickle ``obj`` to ``filename`` with protocol 2.

    The parent directory is created first via ``make_dir``.

    Args:
        filename: destination path of the pickle file.
        obj: object to serialise.
        overwrite: when falsy and ``filename`` already exists, nothing is
            written and a notice is printed instead.
    """
    make_dir(filename)
    if os.path.isfile(filename) and not overwrite:
        print('already exists'+filename)
        return
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(filename, 'wb') as gfp:
        pickle.dump(obj, gfp, protocol=2)
            
def make_dir(filename):
    """Create the parent directory of ``filename`` if it does not exist yet.

    Args:
        filename: path of a file whose containing directory should exist.
    """
    dir_path = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs('') raises;
    # in that case the file lives in the current directory and nothing is needed.
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)
        print('make dir')
        
def load_pickle(filename):
    """Read and return the object pickled in the file at ``filename``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

Using TensorFlow backend.


### Load the saved options so that the matching saved model is loaded with them

In [2]:
# Checkpoint prefix; the pickled training state is stored next to it as
# '<prefix>.pickle'.
resume = '../../dir_HugeFiles/snap_0311/attention/full/model_e005_v--0.419'
statename = '%s.pickle' % resume
state = load_pickle(statename)
opts = state['opts']
# Point the restored options at this checkpoint and pick the GPU to run on.
opts.resume, opts.gpu = resume, 3

In [7]:
# Inspect the restored `wordlength` option; the model-building cell assigns it to MAX_SENTS.
opts.wordlength

50

In [None]:
gpu_id = opts.gpu
print('Current running on GPU number:', gpu_id)
# Expose only the requested physical GPU to this process.
gpu_options = tf.GPUOptions(visible_device_list=str(gpu_id))
# device_count caps HOW MANY devices of each type are used; it is not a device
# id. Passing gpu_id here would disable the GPU entirely when gpu_id == 0.
# Exactly one GPU is visible (see visible_device_list), so the cap is 1.
config = tf.ConfigProto(device_count = {'GPU': 1, 'CPU': 10},
                        gpu_options = gpu_options,
                        intra_op_parallelism_threads = 32,
                        inter_op_parallelism_threads = 32)
sess = tf.Session(config = config)
# Make Keras build and run its graph inside this session.
K.set_session(sess)
####
# Dimensions of the padded input tensor and the vocabulary cap.
# NOTE(review): the sentence count is read from `opts.wordlength` and the
# per-sentence token count from `opts.sentlength`; confirm the option names
# are not swapped in args.py.
MAX_SENT_LENGTH = opts.sentlength  # 100
MAX_SENTS = opts.wordlength  # 15
MAX_NB_WORDS = 20000

##### customized settings #####
p = os.path.abspath(opts.snapshots)
# store the path, but w/o prefix workspace/dir_HugeFiles
# (the first three components of the split absolute path are dropped)
tag = '/'.join(p.split(os.sep)[3:])
print('tag of tensor board: %s'%(tag))

# Each entry of `reviews` is the sequence of sentences of one recipe.
reviews, labels = load_pickle(opts.train)

# if want to use less data to train
small = opts.small
if small:
    reviews = reviews[:5000]
    labels = labels[:5000]
# One flat string per recipe, used only to fit the tokenizer vocabulary.
texts = [' '.join(recipe) for recipe in reviews]
'''
# sentences = list of string, each string contains one sentence
# texts =  flatten sentences, separate by recipes
# reviews = list of sentences
reviews = [v['directions'] for v in dic.values()]
texts = [' '.join(v['directions']) for v in dic.values()]
labels = [v['GI'] for v in dic.values()]    
'''

# Fit the vocabulary on the flattened recipes. `num_words` is the Keras 2 name
# of the deprecated `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# data[i, j, k] = id of the k-th kept word of sentence j of recipe i; 0 = padding.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j >= MAX_SENTS:
            # sentences beyond the cap are dropped
            break
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # the sentence row is full
                break
            token_id = word_index.get(word)
            # Skip unknown words and words ranked outside the vocabulary cap;
            # skipped words do not consume a slot in the row.
            if token_id is not None and token_id < MAX_NB_WORDS:
                data[i, j, k] = token_id
                k += 1

print('Total %s unique tokens.' % len(word_index))

# One-hot encode the labels.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Stratified split: 20% held out for test, then 25% of the remainder for
# validation (60/20/20 overall). The two splits use different seeds derived
# from opts.random.
X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify = labels, test_size = 0.2, random_state = opts.random)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, test_size = 0.25, random_state = 1 + opts.random)

print('Number of positive and negative reviews in traing and validation set')
# Column sums of the one-hot labels = number of samples per class.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

# delete variable to release memory
del data

# Weight applied to the positive class. The original bound the option to a
# misspelled name (`class_wights`), so `class_weights` was undefined below
# whenever opts.pweight was not -1.
class_weights = opts.pweight
# if -1, then automatically calculate the balanced weight:
# the ratio of class-0 to class-1 sample counts in the training labels
if class_weights == -1:
    class_01 = y_train.sum(axis= 0)
    class_weights = round(class_01[0]/class_01[1],1)
print('class weight is %.1f' % class_weights)

# Load the pretrained word2vec model that seeds the embedding layer.
if not opts.foodW2V:
    # Without a model path none of pretrained / embeddings_index /
    # EMBEDDING_DIM would exist and the code below would fail with an
    # unrelated NameError; fail here with an explicit message instead.
    raise ValueError('opts.foodW2V must point to a saved gensim Word2Vec model')
pretrained = Word2Vec.load(opts.foodW2V)# '../data/foodvec300.model'
embeddings_index = pretrained.wv
EMBEDDING_DIM = pretrained.vector_size
# NOTE(review): attribute name is spelled `emdding_dim`; kept unchanged in
# case saved option objects or other code read it under this name.
opts.emdding_dim = EMBEDDING_DIM
print(opts.foodW2V)

# Disabled alternative: read GloVe vectors from a text file instead.
# embeddings_index = {}
# f = open(opts.gloveW2V)
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()
# print('Total %s word vectors.' % len(embeddings_index))

# building Hierachical Attention network
# Row i holds the vector of the word with tokenizer id i. Row 0 (padding) keeps
# its random initial value because no word has id 0 in word_index.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in embeddings_index.vocab:
        embedding_vector = embeddings_index.get_vector(word)
    else:
        # words not found in embedding index will be UNK
        embedding_vector = embeddings_index.get_vector('UNK')
    embedding_matrix[i] = embedding_vector

# Embedding lookup initialised from embedding_matrix; it stays trainable and
# mask_zero=True marks token id 0 as padding for the layers that follow.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)

# Sentence encoder: token ids -> embeddings -> bidirectional GRU over the
# words -> attention pooling into one vector per sentence.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer(100)(l_lstm)
sentEncoder = Model(sentence_input, l_att)

# Document model: apply the sentence encoder to each of the MAX_SENTS rows,
# run a bidirectional GRU over the sentence vectors, pool them with attention
# and classify with a 2-way softmax.
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_att_sent = AttLayer(100)(l_lstm_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

#SGD
# Default optimizer is SGD with momentum; a truthy opts.optm replaces it.
custom_optimizer = optimizers.SGD(lr=opts.lr, momentum= 0.9)
if opts.optm:
    custom_optimizer = opts.optm
model.compile(loss='binary_crossentropy', 
              optimizer = custom_optimizer, #'rmsprop',
              metrics=['acc'])

# Restore trained weights from '<opts.resume>_model.h5' when a checkpoint
# prefix is configured.
if opts.resume:
    modelname = opts.resume+'_model.h5'
    statename = opts.resume+'.pickle'
    if os.path.isfile(modelname):
        print("=> loading checkpoint '{}'".format(opts.resume))
        model.load_weights(modelname)
    else:
        # Do not continue silently: without the weights file the model keeps
        # its initial weights and everything computed from it is meaningless.
        print("=> no checkpoint found at '{}'".format(modelname))

('Current running on GPU number:', 3)
tag of tensor board: snap_0311/attention/full




In [None]:
class AttLayer_visual(Layer):
    """Attention layer that returns the attention weights instead of the pooled input.

    The scoring is ``exp(tanh(x W + b) . u)``, masked and normalised over the
    step axis. For an input of shape ``(batch, steps, features)`` the output
    holds one weight per step, shaped ``(batch, steps, 1)``.

    build() creates its own freshly initialised ``W``, ``b`` and ``u``; to
    inspect a trained attention layer, its weights have to be copied into this
    layer after it has been built.

    Args:
        attention_dim: width of the hidden projection ``W`` used for scoring.
        **kwargs: forwarded unchanged to the ``Layer`` constructor.
    """

    def __init__(self, attention_dim, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(AttLayer_visual, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x attention_dim), b (attention_dim,) and u (attention_dim x 1)."""
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim, )))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(AttLayer_visual, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The output keeps the step axis, so the per-step mask still applies.
        return mask

    def call(self, x, mask=None):
        """Return the normalised attention weight of every step of ``x``."""
        # size of x : [batch_size, steps, features]
        # size of W : [features, attention_dim]; b : [attention_dim]; u : [attention_dim, 1]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # epsilon keeps the division finite when every step of a row is masked
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # Only the weights are returned; the weighted sum is not needed here.
        return K.expand_dims(ait)

    def compute_output_shape(self, input_shape):
        """One weight per step: (batch, steps, features) -> (batch, steps, 1)."""
        return (input_shape[0], input_shape[1], 1)

### Prepare the index-to-word dictionary

In [23]:
# Invert the tokenizer vocabulary: token id -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

### Visualize sentence importance

In [24]:
def _last_attention_layer(net):
    """Return the last AttLayer instance among the layers of ``net``."""
    return [layer for layer in net.layers if isinstance(layer, AttLayer)][-1]


# Attention-weight heads on top of the trained recurrent outputs.
sent_att_vis_layer = AttLayer_visual(100)
l_att_sent_vis = sent_att_vis_layer(l_lstm_sent)
model_vis = Model(review_input, l_att_sent_vis)
word_att_vis_layer = AttLayer_visual(100)
l_att_vis = word_att_vis_layer(l_lstm)
sentEncoder_vis = Model(sentence_input, l_att_vis)

# Each AttLayer_visual builds its own randomly initialised W, b and u, so
# without this step the visualised weights would not be those of the trained
# model. Copy the trained attention parameters into the matching visual layer
# (same weight order and shapes: W, b, u).
sent_att_vis_layer.set_weights(_last_attention_layer(model).get_weights())
word_att_vis_layer.set_weights(_last_attention_layer(sentEncoder).get_weights())

def display_sent(document_id, X, y, max_sen = 15, color = True, words = True):
    """Render one document as HTML fragments highlighted by attention weight.

    Relies on the notebook globals ``model_vis`` (sentence-level attention
    weights), ``sentEncoder_vis`` (word-level attention weights) and
    ``reverse_word_map`` (token id -> word).

    Args:
        document_id: row of ``X`` / ``y`` to render.
        X: array indexed as ``X[document][sentence][word]`` holding token ids,
            where 0 means "no token".
        y: label rows; ``y[document_id][1] == 1`` is reported as "Low GI".
        max_sen: maximum number of leading sentence rows to consider.
        color: when falsy, emit plain ``Score ...`` lines instead of HTML
            highlighting.
        words: when truthy (and ``color`` is truthy), highlight every word by
            its own weight; otherwise only the sentence score is highlighted.

    Returns:
        list of str: fragments meant to be written one after another into an
        HTML page.
    """
    write_to_list = []
    write_to_list.append('Label as %r Low GI recipe  <br />' % (y[document_id][1] == 1))
    # only takes one document
    document = X[document_id]
    d1, d2 = document.shape
    # model_vis is given a batch holding just this document, so its weights
    # are in row 0 of the prediction (indexing by document_id only worked for
    # document 0).
    ait_sent = model_vis.predict(document.reshape(1, d1, d2))[0]
    # The sentence rows of the document act as the batch for the word-level model.
    ait = sentEncoder_vis.predict(document)

    sents, scores, scores_2, token_rows = [], [], [], []
    for i in range(min(max_sen, d1)):
        sentence = [t for t in document[i] if t != 0]
        if not sentence:
            continue
        sents.append(' '.join([reverse_word_map[t] for t in sentence]))
        scores.append(ait_sent[i][0])
        # Word weights of this sentence, scaled by its length so that short and
        # long sentences are comparable. (This does not follow the paper, which
        # would use sqrt(sentence score) * word weight.)
        weight = [round(ait[i][t][0], 3) for t in range(len(sentence))]
        scores_2.append([w * len(sentence) for w in weight])
        # Keep the tokens with their scores so both stay aligned even when an
        # empty sentence row was skipped.
        token_rows.append(sentence)

    if not scores:
        # Nothing to render: the document has no non-empty sentence.
        return write_to_list

    # Min-max normalise for use as highlight opacity; when all values are
    # equal, use full opacity rather than dividing by zero.
    lo, hi = min(scores), max(scores)
    scores_norm = [(s - lo) / (hi - lo) if hi > lo else 1.0 for s in scores]
    flat_word_scores = [v for row in scores_2 for v in row]
    mini, maxi = min(flat_word_scores), max(flat_word_scores)

    for i, score in enumerate(scores):
        if not color:
            write_to_list.append('Score %.3f: %s' %(score, sents[i]))
            continue
        write_to_list.append('<font style="background: rgba(255, 255, 0, %f)">%.3f   </font>' % (scores_norm[i], score))
        if not words:
            write_to_list.append('%s  <br />'% (sents[i]))
            continue
        for j, word in enumerate(token_rows[i]):
            # Named `alpha` so the `color` argument is not overwritten while
            # later sentences still depend on it.
            alpha = (scores_2[i][j] - mini) / (maxi - mini) if maxi > mini else 1.0
            write_to_list.append('<font style="background: rgba(255, 0, 255, %f)">%s </font>' % (10*alpha, reverse_word_map[word]))
        write_to_list.append('<br />')
    return write_to_list


In [25]:
# Render the first test document and append it to the HTML report.
html_path = '../figs/mypage.html'
w = display_sent(0, X_test, y_test)
# Create the output directory first; open() fails if it does not exist.
make_dir(html_path)
# Append mode: every run adds another block to the same page.
with open(html_path, "a+") as html_file:
    for sent in w:
        html_file.write(sent)
    html_file.write("<br /><br /><br />")