### 2x BiLSTM layers with attention for classifying microblog sentiment

This network implements the LSTM model from the following paper:<br>
Baziotis, C., Pelekis, N., & Doulkeridis, C. (2017). DataStories at SemEval-2017 Task 4: Deep LSTM with attention for message-level and topic-based sentiment analysis. In Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017) (pp. 747-754).

In [1]:
import os
from gensim.models import KeyedVectors
from shutil import rmtree
import string
import tensorflow as tf
import numpy as np
import pandas as pd
seed = 123
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)



1.11.0


Using TensorFlow backend.


In [2]:
# Load the labelled messages and display how many rows each sentiment class has.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
data_path = r"C:\Dropbox\_projects\word_embedding\stocktwits\data_final.csv"
df_data = pd.read_csv(data_path)
df_data["sentiment"].value_counts()

Neutral     5836
Positive    2449
Negative    1715
Name: sentiment, dtype: int64

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df_data.head(n=5)

Unnamed: 0,id,text,tokens,sentiment,ss_pos,ss_neg,ss_overall,ss
0,107763782,"$NRG Insider ""Killinger Elizabeth R"" sold -3,9...","<SYM> insider "" killinger elizabeth r "" sold -...",Negative,1,-3,-2,Negative
1,107050038,$SGYP and in other news the Buffalo Bills ende...,<SYM> and in other news the buffalo bills ende...,Positive,2,-3,-1,Negative
2,107050388,$KMI - yep we will need the energy to heat the...,<SYM> - yep we will need the energy to heat th...,Positive,1,-1,0,Neutral
3,107055353,"$XRP.X happy new year boyz, the wall at 14670 ...","<SYM> happy new year boyz , the wall at 0 is b...",Positive,2,-1,1,Positive
4,107058260,Wabash National upgraded by ValuEngine to stro...,wabash national upgraded by valuengine to stro...,Positive,1,-1,0,Neutral


In [4]:
# Create padded sequences
# Length of the longest message, counted in space-separated tokens; every
# sequence is padded to this length.
max_len = max(len(tokens.split(" ")) for tokens in df_data["tokens"])

# filters="" disables the Tokenizer's punctuation stripping; tokens are split on single spaces.
# NOTE(review): Tokenizer lowercases by default, so "<SYM>" is indexed as "<sym>" -
# confirm the pretrained embedding vocabulary uses the same casing.
t = Tokenizer(filters="", split=" ")
t.fit_on_texts(df_data["tokens"])
# +1 because word indices start at 1; index 0 is what pad_sequences fills with.
vocab_size = 1 + len(t.word_index)

sequences = t.texts_to_sequences(df_data["tokens"].values)
# padding="post" appends the zeros after the real tokens.
padded_seq = pad_sequences(sequences, padding="post", maxlen=max_len)
df_seq = pd.DataFrame(padded_seq)

df_seq.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,1,104,85,4916,4917,364,85,159,1999,65,...,0,0,0,0,0,0,0,0,0,0
1,1,18,14,266,86,5,2302,3479,1478,150,...,0,0,0,0,0,0,0,0,0,0
2,1,21,1289,58,29,189,5,593,6,4919,...,0,0,0,0,0,0,0,0,0,0
3,1,474,72,139,4921,7,5,91,23,3,...,0,0,0,0,0,0,0,0,0,0
4,4922,1010,680,59,4923,6,3481,2,4,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# One-hot encode the classes: 0 = Neutral (and any other value), 1 = Positive, 2 = Negative.
# "Negative" is given the explicit id 2 rather than -1: to_categorical would place -1
# in the last column only through NumPy's negative-index wrap-around, which hides the
# real class id and breaks silently if num_classes ever changes.
class_ids = df_data["sentiment"].apply(
    lambda x: 1 if x == "Positive" else 2 if x == "Negative" else 0)
y_multi = to_categorical(class_ids, num_classes=3)
# Integer class ids (same coding as above), used for stratification and scoring.
y_single = np.argmax(y_multi, axis=1)

In [102]:
# Training hyper-parameters.
batch_size = 128
epochs = 2
n_folds = 10
# Approximate number of training examples per fold: (n_folds - 1) of n_folds parts.
# Pure integer arithmetic keeps this exact; the float form
# int((n_folds - 1)/n_folds*len(df_data)) depends on floating-point rounding
# before truncation.
train_size = (n_folds - 1)*len(df_data)//n_folds
# Number of optimizer steps needed for `epochs` passes over train_size examples.
steps = epochs*train_size//batch_size + 1
def train_input_fn(x_train, y_train, batch_size, seed, epochs, buffer):
    """Build the training input pipeline and return its next-batch tensors.

    Args:
        x_train: feature array; exposed to the model as features["x"].
        y_train: labels aligned with x_train along the first axis.
        batch_size: examples per batch; also passed to prefetch().
        seed: seed for the shuffle.
        epochs: number of passes over the data before the iterator is exhausted.
        buffer: shuffle buffer size.

    Returns:
        The (features, labels) tensors produced by a one-shot iterator.
    """
    features = {"x": x_train}
    pipeline = (
        tf.data.Dataset.from_tensor_slices((features, y_train))
        # Reshuffled on every pass because shuffle precedes repeat.
        .shuffle(buffer_size=buffer, seed=seed, reshuffle_each_iteration=True)
        .repeat(epochs)
        .batch(batch_size)
        .prefetch(batch_size)
    )
    return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn(x_test, y_test, batch_size):
    """Build the evaluation/prediction input pipeline (single pass, original order).

    Args:
        x_test: feature array; exposed to the model as features["x"].
        y_test: labels aligned with x_test along the first axis.
        batch_size: examples per batch.

    Returns:
        The (features, labels) tensors produced by a one-shot iterator.
    """
    features = {"x": x_test}
    batched = tf.data.Dataset.from_tensor_slices((features, y_test)).batch(batch_size)
    return batched.make_one_shot_iterator().get_next()

#### Model

In [110]:
# Bidirectional LSTM from Baziotis et al 2017
def bilstm_fn(features, labels, mode, params):
    """Estimator model_fn: frozen embeddings -> 2 x BiLSTM -> attention -> dense logits.

    All regularisation (Gaussian noise on the embeddings, dropout between the
    layers and on the LSTM cell inputs) is active in TRAIN mode only.

    Args:
        features: dict whose "x" entry holds the padded token-id sequences.
        labels: one-hot class labels (not used in PREDICT mode).
        mode: a tf.estimator.ModeKeys value.
        params: dict with "n_classes" (number of output classes) and
            "embedding_initializer" (initializer for the embedding table).

    Returns:
        A tf.estimator.EstimatorSpec for the requested mode.

    Reads the module-level globals vocab_size and emb_dim.
    """
    attn_size = 1
    n_units = 150
    cell_dropout = 0.25
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    # DropoutWrapper expects a KEEP probability, not a drop rate, and has no
    # notion of train/eval mode, so disable it explicitly outside training.
    cell_keep_prob = 1.0 - cell_dropout if is_training else 1.0

    # embedding
    input_layer = tf.contrib.layers.embed_sequence(
        features["x"], vocab_size, emb_dim, trainable=False,
        initializer=params["embedding_initializer"])
    print("Embedding shape:", input_layer.shape)

    # Gaussian noise: a regulariser, so it must not perturb eval/predict inputs.
    if is_training:
        noise = tf.random_normal(shape=tf.shape(input_layer), mean=0.0, stddev=0.2, dtype=tf.float32)
        print("Noise shape:", noise.shape)
        input_layer = tf.add(input_layer, noise)
        print("Noise added shape:", input_layer.shape)

    # tf.layers.dropout defaults to training=False (identity), so the flag is required.
    dropout1 = tf.layers.dropout(input_layer, 0.3, training=is_training)
    print("Dropout1 shape:", dropout1.shape)

    # NOTE(review): no sequence_length is passed to the RNNs, so padded positions
    # (id 0) are processed and attended to like real tokens.
    # BiLSTM 1
    with tf.variable_scope("BiLSTM1", reuse=tf.AUTO_REUSE):
        lstm_fw_cell1 = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(n_units), input_keep_prob=cell_keep_prob)
        lstm_bw_cell1 = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(n_units), input_keep_prob=cell_keep_prob)
        (outputs_fw1, outputs_bw1), final_states1 = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell1, lstm_bw_cell1, dropout1, dtype=tf.float32)
        outputs1 = tf.concat([outputs_fw1, outputs_bw1], axis=2)
        print("Outputs1 shape:", outputs1.shape)
    dropout2 = tf.layers.dropout(outputs1, 0.5, training=is_training)

    # BiLSTM 2
    with tf.variable_scope("BiLSTM2", reuse=tf.AUTO_REUSE):
        lstm_fw_cell2 = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(n_units), input_keep_prob=cell_keep_prob)
        lstm_bw_cell2 = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(n_units), input_keep_prob=cell_keep_prob)
        (outputs_fw2, outputs_bw2), final_states2 = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell2, lstm_bw_cell2, dropout2, dtype=tf.float32)
        outputs2 = tf.concat([outputs_fw2, outputs_bw2], axis=2)
        print("Outputs2 shape:", outputs2.shape)

    dropout3 = tf.layers.dropout(outputs2, 0.5, training=is_training)

    # Attention
    with tf.variable_scope("Attention", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W", [n_units*2, attn_size])
        b = tf.get_variable("b", [attn_size])
        # e: (batch, time, attn_size) - one score per timestep.
        e = tf.tanh(tf.tensordot(dropout3, W, axes=1) + b)
        # Normalise over the TIME axis. The default (last axis) has size 1 here,
        # which would make every attention weight exactly 1.0.
        a = tf.nn.softmax(e, axis=1)
        print("a shape:", a.shape)
        r = tf.multiply(a, dropout3)
        print("r shape:", r.shape)

    # NOTE(review): the weighted states are flattened rather than summed over
    # time, so the dense layer sees every (weighted) timestep - confirm intended.
    r = tf.contrib.layers.flatten(r)
    logits = tf.layers.dense(inputs=r, units=params["n_classes"])
    print("Logits shape:", logits.shape)
    print([v.name for v in tf.trainable_variables()])

    probs = tf.nn.softmax(logits)
    pred_indices = tf.argmax(probs, 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        preds = {"class":pred_indices, "probabilities":probs}
        export_outputs = {"prediction":tf.estimator.export.PredictOutput(preds)}
        return tf.estimator.EstimatorSpec(mode, predictions=preds, export_outputs=export_outputs)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels))
    # L2 penalty on every trainable variable whose name does not contain "bias".
    l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in tf.trainable_variables() if "bias" not in v.name ])
    loss = loss + 0.0001*l2_loss
    tf.summary.scalar("loss", loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        # Gradient clipping by norm (per tensor); variables without a gradient
        # are skipped because clip_by_norm cannot take None.
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_norm(grad, 5), var) for grad, var in gvs if grad is not None]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL: labels are one-hot, so compare class indices.
    label_indices = tf.argmax(labels, 1)
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(label_indices, pred_indices)}
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    

In [116]:
# Custom model
def bilstm_fn2(features, labels, mode, params):
    """Estimator model_fn: frozen embeddings -> 2 x BiLSTM -> attention -> dense logits.

    Lighter variant: the only regulariser besides the L2 penalty is dropout on
    the embeddings, applied in TRAIN mode only. Gaussian noise on the embeddings
    is deliberately left out (it seemed to decrease performance).

    Args:
        features: dict whose "x" entry holds the padded token-id sequences.
        labels: one-hot class labels (not used in PREDICT mode).
        mode: a tf.estimator.ModeKeys value.
        params: dict with "n_classes" (number of output classes) and
            "embedding_initializer" (initializer for the embedding table).

    Returns:
        A tf.estimator.EstimatorSpec for the requested mode.

    Reads the module-level globals vocab_size and emb_dim.
    """
    attn_size = 1
    n_units = 150
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # embedding
    input_layer = tf.contrib.layers.embed_sequence(
        features["x"], vocab_size, emb_dim, trainable=False,
        initializer=params["embedding_initializer"])
    print("Embedding shape:", input_layer.shape)

    # tf.layers.dropout defaults to training=False (identity), so the flag is required.
    dropout1 = tf.layers.dropout(input_layer, 0.3, training=is_training)
    print("Dropout1 shape:", dropout1.shape)

    # NOTE(review): no sequence_length is passed to the RNNs, so padded positions
    # (id 0) are processed and attended to like real tokens.
    # BiLSTM 1
    with tf.variable_scope("BiLSTM1", reuse=tf.AUTO_REUSE):
        lstm_fw_cell1 = tf.nn.rnn_cell.LSTMCell(n_units)
        lstm_bw_cell1 = tf.nn.rnn_cell.LSTMCell(n_units)
        (outputs_fw1, outputs_bw1), final_states1 = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell1, lstm_bw_cell1, dropout1, dtype=tf.float32)
        outputs1 = tf.concat([outputs_fw1, outputs_bw1], axis=2)
        print("Outputs1 shape:", outputs1.shape)

    # BiLSTM 2
    with tf.variable_scope("BiLSTM2", reuse=tf.AUTO_REUSE):
        lstm_fw_cell2 = tf.nn.rnn_cell.LSTMCell(n_units)
        lstm_bw_cell2 = tf.nn.rnn_cell.LSTMCell(n_units)
        (outputs_fw2, outputs_bw2), final_states2 = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell2, lstm_bw_cell2, outputs1, dtype=tf.float32)
        outputs2 = tf.concat([outputs_fw2, outputs_bw2], axis=2)
        print("Outputs2 shape:", outputs2.shape)

    # Attention
    with tf.variable_scope("Attention", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W", [n_units*2, attn_size])
        b = tf.get_variable("b", [attn_size])
        # e: (batch, time, attn_size) - one score per timestep.
        e = tf.tanh(tf.tensordot(outputs2, W, axes=1) + b)
        # Normalise over the TIME axis. The default (last axis) has size 1 here,
        # which would make every attention weight exactly 1.0.
        a = tf.nn.softmax(e, axis=1)
        print("a shape:", a.shape)
        r = tf.multiply(a, outputs2)
        print("r shape:", r.shape)

    # NOTE(review): the weighted states are flattened rather than summed over
    # time, so the dense layer sees every (weighted) timestep - confirm intended.
    r = tf.contrib.layers.flatten(r)
    logits = tf.layers.dense(inputs=r, units=params["n_classes"])
    print("Logits shape:", logits.shape)
    print([v.name for v in tf.trainable_variables()])

    probs = tf.nn.softmax(logits)
    pred_indices = tf.argmax(probs, 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        preds = {"class":pred_indices, "probabilities":probs}
        export_outputs = {"prediction":tf.estimator.export.PredictOutput(preds)}
        return tf.estimator.EstimatorSpec(mode, predictions=preds, export_outputs=export_outputs)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels))
    # L2 penalty on every trainable variable whose name does not contain "bias".
    l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in tf.trainable_variables() if "bias" not in v.name ])
    loss = loss + 0.0001*l2_loss
    tf.summary.scalar("loss", loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        # Gradient clipping by norm (per tensor); variables without a gradient
        # are skipped because clip_by_norm cannot take None.
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_norm(grad, 5), var) for grad, var in gvs if grad is not None]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL: labels are one-hot, so compare class indices.
    label_indices = tf.argmax(labels, 1)
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(label_indices, pred_indices)}
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    

#### Pretrained Embeddings

In [8]:
# Default embedding width; it is overwritten with en_model.vector_size once the
# pretrained vectors are loaded.
emb_dim = 300
# Estimator checkpoint directory.
# NOTE: the cross-validation loop deletes this directory before every fold.
model_dir = r"D:\word_embedding\binary\embeddings5"
# Feature-column description of the token ids under key 'x', with a 300-d embedding on top.
column = tf.feature_column.categorical_column_with_identity(key='x', num_buckets=vocab_size)
word_embedding_column = tf.feature_column.embedding_column(categorical_column=column, dimension=300)

In [9]:
# load pretrained embeddings
# Alternative embedding files (commented out); swap one in to use it instead:
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\word2vec\GoogleNews-vectors-negative300.bin',
#                                             binary=True)
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\glove\test_word2vec.txt',
#                                             binary=False)
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\fasttext\wiki.en.vec',
#                                             binary=False)

# Active choice: a text-format (binary=False) word2vec file read as UTF-8.
embedding_path = r'D:\word_embedding\balanced\vectors1_w2v.txt'
en_model = KeyedVectors.load_word2vec_format(embedding_path, binary=False, encoding="UTF-8")

In [10]:
# Build the (vocab_size, emb_dim) lookup table: row i holds the pretrained vector
# of the word whose Tokenizer index is i.
emb_dim = en_model.vector_size
# float32 to match the dtype the embedding initializer asserts; np.zeros would
# otherwise default to float64, doubling memory and relying on an implicit cast.
embedding_matrix = np.zeros((vocab_size, emb_dim), dtype=np.float32)
for word, i in t.word_index.items():
    try:
        embedding_matrix[i] = en_model.get_vector(word)
    except KeyError:
        # Word missing from the pretrained vocabulary: its row stays all zeros.
        continue

In [11]:
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    """Variable initializer that returns the pretrained embedding matrix.

    Args:
        shape: shape requested for the variable; when given it must equal
            embedding_matrix.shape.
        dtype: requested dtype; only tf.float32 is supported.
        partition_info: accepted for initializer-signature compatibility; unused.

    Returns:
        The module-level embedding_matrix as a float32 NumPy array.

    Raises:
        AssertionError: if dtype is not tf.float32.
        ValueError: if shape disagrees with embedding_matrix.shape.
    """
    assert dtype == tf.float32, "my_initializer only supports tf.float32"
    # Fail early with a readable message when the model's vocab_size/emb_dim
    # are out of sync with the matrix (e.g. cells run out of order).
    if shape is not None and tuple(shape) != embedding_matrix.shape:
        raise ValueError(
            "Requested embedding shape {} does not match embedding_matrix shape {}".format(
                tuple(shape), embedding_matrix.shape))
    # Return what the dtype contract promises (no copy if already float32).
    return np.asarray(embedding_matrix, dtype=np.float32)

# pretrained_embedding_column = tf.feature_column.embedding_column(column, 
#                                                                 dimension=300,
#                                                                 initializer=my_initializer,
#                                                                 trainable=False)

#### Cross validation

In [117]:
%%time
f1 = []
accuracy = []
skf = StratifiedKFold(n_splits=10, random_state=seed)
fold = 0
for train_idx, test_idx in skf.split(padded_seq, y_single):
    print(fold)
    fold+=1
    x_train = padded_seq[train_idx]
    x_test = padded_seq[test_idx]
    y_train = y_multi[train_idx]
    y_test = y_multi[test_idx]
    
    rmtree(model_dir, ignore_errors=True)

    params = {'n_classes':3,
              'embedding_initializer': my_initializer,
#               'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0),
              }
    classifier = tf.estimator.Estimator(model_fn=bilstm_fn2,
                                       model_dir=model_dir,params=params)
    classifier.train(input_fn=lambda :train_input_fn(x_train, y_train, batch_size, seed, epochs, train_size), steps=steps)
    pred =  classifier.predict(input_fn=lambda : eval_input_fn(x_test, y_test, batch_size))
    y_true = y_single[test_idx]
    y_pred = [p["class"] for p in pred]
    f1.append(f1_score(y_true, y_pred, average="weighted"))
    accuracy.append(accuracy_score(y_true, y_pred))
print("F1:",round(np.mean(f1), 4), 
      "Accuracy:", round(np.mean(accuracy), 4))

0
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'D:\\word_embedding\\binary\\embeddings5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001AEED9F7940>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Embedding shape: (?, 41, 300)
Dropout1 shape: (?, 41, 300)
Outputs1 shape: (?,