In [7]:
import html
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from utils import preprocessing_helpers, dataset_helpers

In [8]:
# All dataset artifacts live in one folder; each path below is that folder
# joined with a fixed file name.
data_folder = 'data/data_sentiment140'
train_path, eval_path, test_path, vocab_file_path = (
    os.path.join(data_folder, file_name)
    for file_name in (
        'train_data.csv',
        'eval_data.csv',
        'test_data.csv',
        'vocabulary_list.txt',
    )
)
# Label strings; a label's position in this list is its class index.
label_vocab = ['negative', 'positive']

In [9]:
num_epochs = 10


def train_input_fn():
    """Input function for training.

    Builds a dataset from ``train_path`` through
    ``dataset_helpers.make_dataset_from_csv`` (shuffled, batches of 128,
    incomplete final batch dropped, ``num_epochs`` passed as the second
    argument) and pulls one batch of tensors from a one-shot iterator.

    Returns:
        A ``(features, label)`` pair where ``features`` is a dict with the
        keys ``'tokens'`` and ``'sequence_length'``.
    """
    train_dataset = dataset_helpers.make_dataset_from_csv(
        train_path,
        num_epochs,
        batch_size=128,
        shuffle=True,
        drop_remainder=True,
    )
    iterator = train_dataset.make_one_shot_iterator()
    tokens, sequence_length, label = iterator.get_next()
    features = {'tokens': tokens, 'sequence_length': sequence_length}
    return features, label

In [10]:
def eval_input_fn():
    """Input function for evaluation.

    Builds a dataset from ``eval_path`` through
    ``dataset_helpers.make_dataset_from_csv`` with ``1`` as the second
    argument (the position where the training input passes its epoch count),
    no shuffling, batches of 128 and the final partial batch kept.

    Returns:
        A ``(features, label)`` pair where ``features`` is a dict with the
        keys ``'tokens'`` and ``'sequence_length'``.
    """
    eval_dataset = dataset_helpers.make_dataset_from_csv(
        eval_path,
        1,
        batch_size=128,
        shuffle=False,
        drop_remainder=False,
    )
    iterator = eval_dataset.make_one_shot_iterator()
    tokens, sequence_length, label = iterator.get_next()
    features = {'tokens': tokens, 'sequence_length': sequence_length}
    return features, label

In [11]:
fine_tune_embedding = True
embedding_dimension = 150 # Should be the same as the pre-trained matrix

def wrap_const(*args, **kwargs):
    """Initializer callable returning the pre-trained embedding matrix.

    The matrix is read from ``embedding_matrix.npy`` inside ``data_folder``
    and cast to float32. When the caller supplies a requested shape (first
    positional argument, or a ``shape`` keyword), the last dimension of that
    shape is checked against the embedding size of the loaded matrix so that a
    wrong ``embedding_dimension`` fails here with a clear message.

    Args:
        *args: Positional arguments from the caller; the first one, if
            present, is treated as the requested variable shape.
        **kwargs: Keyword arguments from the caller; only ``shape`` is
            inspected, everything else is ignored.

    Returns:
        The embedding matrix as a float32 ``numpy.ndarray``.

    Raises:
        ValueError: If a shape was requested and its last dimension differs
            from the last dimension of the stored matrix.
    """
    embedding_matrix = np.load(os.path.join(data_folder, 'embedding_matrix.npy')).astype(np.float32)

    requested_shape = args[0] if args else kwargs.get('shape')
    if requested_shape is not None:
        try:
            requested_dims = list(requested_shape)
        except TypeError:
            # Not a shape-like argument: keep the permissive behaviour and skip the check.
            requested_dims = []
        if requested_dims and requested_dims[-1] is not None:
            if requested_dims[-1] != embedding_matrix.shape[-1]:
                raise ValueError(
                    'Requested embedding dimension {} does not match the pre-trained '
                    'matrix of shape {}'.format(requested_dims[-1], embedding_matrix.shape))

    return embedding_matrix

In [12]:
# Token strings are mapped to ids through the vocabulary file; one extra
# bucket is reserved for out-of-vocabulary tokens.
word_ids = tf.contrib.feature_column.sequence_categorical_column_with_vocabulary_file(
    key='tokens',
    vocabulary_file=vocab_file_path,
    num_oov_buckets=1,
)
# Embedding lookup initialised from the pre-trained matrix returned by wrap_const.
word_embeddings = tf.feature_column.embedding_column(
    categorical_column=word_ids,
    dimension=embedding_dimension,
    initializer=wrap_const,
    trainable=fine_tune_embedding,
)

sequence_feature_columns = [word_embeddings]
feature_columns = [tf.feature_column.numeric_column('sequence_length')]

INFO:tensorflow:vocabulary_size = 47874 in tokens is inferred from the number of elements in the vocabulary_file data/data_sentiment140/vocabulary_list.txt.


##### Model hyperparameters

In [13]:
# Hyperparameters and wiring handed to the Estimator; model_fn receives this dict as `params`.
params = {
    # Derived from the label vocabulary so the class count cannot drift from
    # the one-hot depth, which model_fn computes from len(params['label_vocab']).
    'num_classes': len(label_vocab),
    # The next entries are consumed by BiLSTMSelfAttention (not shown in this notebook).
    'num_hidden_units': 300,
    'attention_dimension': 100,
    # Also the size of the identity matrix in the attention penalty of model_fn.
    'number_attention_hop': 30,
    'num_dense_units': 500,
    # Weight of the attention penalty term in the loss; 0 disables its contribution.
    'penalty_coeff': 0,
    'num_hidden_layers': 1,
    'masking_attention': True, # Whether or not to mask and renormalize the attention weights (putting to 0 the one for PAD)
    'feature_columns': feature_columns,
    'sequence_feature_columns': sequence_feature_columns,
    'label_vocab':label_vocab,
    'visualize_attention': True, # Only for predict mode if we want to get the attention matrix in the predictions
}

##### Model definition

In [14]:
from model.bilstm_self_attention import BiLSTMSelfAttention

In [24]:
def model_fn(features, labels, mode, params):
    """Estimator model function wrapping the BiLSTM self-attention classifier.

    Args:
        features: Dict of input tensors; read through the feature columns in
            ``params['feature_columns']`` and ``params['sequence_feature_columns']``.
        labels: String label tensor, looked up in ``params['label_vocab']``.
            Not used in PREDICT mode.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Dict of hyperparameters. This function reads ``label_vocab``,
            ``feature_columns``, ``sequence_feature_columns``,
            ``visualize_attention``, ``number_attention_hop`` and
            ``penalty_coeff``; the whole dict is also passed to
            ``BiLSTMSelfAttention``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode: predictions
        in PREDICT, loss and accuracy metric in EVAL, loss and train op in TRAIN.
    """
    # Maps a label string to its position in label_vocab; unknown strings map to -1.
    label_table = tf.contrib.lookup.index_table_from_tensor(mapping=tf.constant(params['label_vocab']), default_value=-1)

    # The numeric 'sequence_length' column is flattened to a 1-D int32 vector.
    sequence_length = tf.feature_column.input_layer(features, params['feature_columns'])
    sequence_length = tf.cast(tf.reshape(sequence_length, [-1]),tf.int32)

    # Sequence lengths and padding are produced directly by the tf.data.Dataset for efficiency.
    # Therefore the sequence_length returned by sequence_input_layer is discarded
    # (it would equal the padded size for every element).
    input_embeddings, _ = tf.contrib.feature_column.sequence_input_layer(features, params['sequence_feature_columns'])

    inputs = {'input_embeddings': input_embeddings, 'sequence_length':sequence_length}
    # The model takes a dictionary with a 3D tensor (batch x max_seq_len x emb dimension)
    # and a per-example vector holding the length of each sequence.
    model_bilstm_attn = BiLSTMSelfAttention(params)
    # A is the attention matrix; it is exposed in predictions and used in the loss penalty below.
    logits, A = model_bilstm_attn(inputs)

    with tf.variable_scope('prediction'):
        softmax = tf.nn.softmax(logits)
        predictions = tf.argmax(logits, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions_dict = {'probabilities': softmax,
                            'predictions':predictions,
                            'sequence_length':sequence_length}
        # The attention matrix is only returned when explicitly requested.
        if params['visualize_attention']:
            predictions_dict['attention_matrix'] = A
        return tf.estimator.EstimatorSpec(mode, predictions=predictions_dict)

    # ---- Training or Evaluation Mode ---

    # Convert string labels to one-hot encoding; the squeeze removes the size-1 axis at position 1.
    labels = tf.squeeze(tf.one_hot(label_table.lookup(labels), len(params['label_vocab'])), axis=1)

    with tf.variable_scope('metrics'):
        accuracy = tf.metrics.accuracy(labels=tf.argmax(labels,axis=1), predictions=predictions, name='acc_op')

    metrics = {'accuracy': accuracy}
    # accuracy is a (value, update_op) pair; the update op is what gets logged.
    tf.summary.scalar('accuracy', accuracy[1])

    with tf.variable_scope('loss'):
        # Per-example penalty: sum of squared entries of (A . A^T - I), with I of size number_attention_hop.
        P = tf.reduce_sum(tf.square(tf.matmul(A,tf.transpose(A,[0,2,1])) - tf.eye(params['number_attention_hop'])),[1,2])
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
        # Batch mean of cross-entropy plus the weighted penalty.
        loss = tf.reduce_mean(cross_entropy + params['penalty_coeff']*P)

    with tf.variable_scope('summaries'):
        with tf.variable_scope('loss'):
            tf.summary.scalar('cross-entropy',tf.reduce_mean(cross_entropy))
            tf.summary.scalar('penalty', tf.reduce_mean(P))
            tf.summary.scalar('loss',loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    else:
        with tf.variable_scope('optimizer'):
            # Lazy Adam to handle sparse gradient updates (since we are not using all the words in each batch)
            optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=1e-3)
            train_op = optimizer.minimize(loss,global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [16]:
# A checkpoint is written every 200 training steps.
run_config = tf.estimator.RunConfig(save_checkpoints_steps=200)
estimator_model = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='./output_model3',
    config=run_config,
    params=params,
)
# No step limit is set here; the training input function controls how much data is served.
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=None)
# steps=None: evaluate over the whole evaluation dataset.
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None)

INFO:tensorflow:Using config: {'_model_dir': './output_model3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11fa193c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [1]:
# Launch training and evaluation with the specs configured for estimator_model.
tf.estimator.train_and_evaluate(estimator_model, train_spec, eval_spec)

##### Prediction Part

In [17]:
def predict_input_fn(data_gen):
    """Create an input function for prediction from ``data_gen``.

    Args:
        data_gen: Object yielding the raw inputs to predict on. It is handed
            to ``dataset_helpers.make_pred_dataset_from_gen`` wrapped in a
            zero-argument callable.

    Returns:
        A zero-argument function that builds the prediction dataset and
        returns a feature dict with the keys ``"tokens"`` and
        ``"sequence_length"``.
    """
    def _generator_factory():
        # The helper expects a callable returning the generator, not the generator itself.
        return data_gen

    def _predict_input_fn():
        dataset = dataset_helpers.make_pred_dataset_from_gen(_generator_factory)
        iterator = dataset.make_one_shot_iterator()
        tokens, sequence_length = iterator.get_next()
        return {"tokens": tokens, "sequence_length": sequence_length}

    return _predict_input_fn

In [18]:
# Raw sentences to classify.
texts_to_predict = [
    "I didn't enjoy it",
    'It was a wonderful experience',
]
# The prediction input function is fed an iterator over the sentences.
texts_to_predict_it = iter(texts_to_predict)
# yield_single_examples=False makes predict() yield whole batches instead of single examples.
res = estimator_model.predict(
    input_fn=predict_input_fn(texts_to_predict_it),
    yield_single_examples=False,
)

In [19]:
# Pull the first item from the prediction generator; since predict() was called with
# yield_single_examples=False, this is a dict of batched arrays rather than one example.
val = next(res)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./output_model3/model.ckpt-400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


##### Attention Visualization part

In [20]:
from utils.preprocessing_helpers import process_text_without_label
from IPython.core.display import display, HTML

In [21]:
# Index, within texts_to_predict, of the sentence whose attention is visualised.
example_index = 1
raw_text = texts_to_predict[example_index]
# The preprocessing helper is given UTF-8 bytes; its second return value is not needed here.
processed_text, _ = process_text_without_label(raw_text.encode('utf-8'))
# Decode the processed bytes and split on single spaces to recover the tokens.
token_list = processed_text.decode('utf-8').split(' ')

In [22]:
# Predicted probability of each label for the selected example.
print(list(zip(label_vocab, val['probabilities'][example_index])))

att_matrix = val['attention_matrix'][example_index]
# Sum the attention matrix over axis 0 to get one weight per column, then normalise
# by the total so the weights sum to 1. The whole normalised vector is converted to
# plain Python floats (the original only converted the scalar total).
token_weights = (np.sum(att_matrix, axis=0) / np.sum(att_matrix)).tolist()
# zip stops at the shorter sequence, so weights beyond the last token are ignored.
content = list(zip(token_list, token_weights))
# Tokens are HTML-escaped so that characters such as '<' or '&' in the text
# cannot break (or inject into) the generated markup.
html_content = ' '.join(
    f'<span style="background-color:rgba(255, 0, 0, {alpha});">{html.escape(token)}</span>'
    for token, alpha in content
)

[('negative', 0.00464689), ('positive', 0.9953531)]


In [23]:
# Render the highlighted tokens as HTML in the notebook output.
display(HTML(html_content))