#### Convolutional neural networks based on:
Cliche, M. (2017). "BB_twtr at SemEval-2017 Task 4: Twitter Sentiment Analysis with CNNs and LSTMs." arXiv preprint arXiv:1704.06125.
Kim, Y. (2014). "Convolutional neural networks for sentence classification." arXiv preprint arXiv:1408.5882.

In [1]:
import os
from gensim.models import KeyedVectors
from shutil import rmtree
import string
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
seed = 123
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)



1.11.0


Using TensorFlow backend.


In [2]:
df_data = pd.read_csv(r"C:\Dropbox\_projects\word_embedding\stocktwits\data_final.csv")
df_data["sentiment"].value_counts()

Neutral     5836
Positive    2449
Negative    1715
Name: sentiment, dtype: int64

In [3]:
# Create padded sequences
max_len = max(map(lambda x: len(x.split(" ")), df_data["tokens"]))

t = Tokenizer(split=" ", filters="")
t.fit_on_texts(df_data["tokens"])
vocab_size = len(t.word_index) + 1

sequences = t.texts_to_sequences(df_data["tokens"].values)
padded_seq = pad_sequences(sequences, maxlen=max_len, padding="post")
df_seq = pd.DataFrame(padded_seq)

df_seq.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,1,104,85,4916,4917,364,85,159,1999,65,...,0,0,0,0,0,0,0,0,0,0
1,1,18,14,266,86,5,2302,3479,1478,150,...,0,0,0,0,0,0,0,0,0,0
2,1,21,1289,58,29,189,5,593,6,4919,...,0,0,0,0,0,0,0,0,0,0
3,1,474,72,139,4921,7,5,91,23,3,...,0,0,0,0,0,0,0,0,0,0
4,4922,1010,680,59,4923,6,3481,2,4,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# one hot encode the classes
y_multi = to_categorical(df_data["sentiment"].apply(lambda x: 1 if x == "Positive" else -1 if x == "Negative" else 0 ),
              num_classes=3)
y_single = np.argmax(y_multi, axis=1)

In [5]:
batch_size = 64
epochs = 2
n_folds = 10
train_size = int((n_folds - 1)/n_folds*len(df_data))
steps = epochs*train_size//batch_size + 1
def train_input_fn(x_train, y_train, batch_size, seed, epochs, buffer):
    dataset = tf.data.Dataset.from_tensor_slices(({"x":x_train}, y_train))
    dataset = dataset.shuffle(buffer_size=buffer, seed=seed, reshuffle_each_iteration=True)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn(x_test, y_test, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(({"x":x_test}, y_test))
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

#### CNN Model

In [6]:
def cnn_model_fn(features, labels, mode, params):
    input_layer = tf.contrib.layers.embed_sequence(
        features["x"], vocab_size, emb_dim, 
        initializer=params["embedding_initializer"])
    conv_layers = []
    for i in range(3, 6):
        conv_layers.append(tf.layers.conv1d(activation=tf.nn.relu,
        inputs=input_layer, filters=100, kernel_size=i, padding="same"))
    concat = tf.concat(conv_layers, 1)
    print("Concatenated shape:", concat.shape)
    training = mode == tf.estimator.ModeKeys.TRAIN
    dropout = tf.layers.dropout(inputs=concat, rate=0.2, training=training)
    pool = tf.reduce_max(input_tensor=dropout, axis=1)
    print("Pooled shape:", pool.shape)
    dense1 = tf.layers.dense(inputs=pool, units=30)
    logits = tf.layers.dense(inputs=dense1, units=params["n_classes"])
    print("Output shape:", logits.shape)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        probs = tf.nn.softmax(logits)
        pred_indices = tf.argmax(probs, 1)
        preds = {"class":pred_indices, "probabilities":probs}
        export_outputs = {"prediction":tf.estimator.export.PredictOutput(preds)}
        return tf.estimator.EstimatorSpec(mode, predictions=preds, export_outputs=export_outputs)
        
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    tf.summary.scalar("loss", loss)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    
    if mode == tf.estimator.ModeKeys.EVAL:
        probs = tf.nn.softmax(logits)
        pred_indices = tf.argmax(probs, 1)
        labels_one_hot = tf.one_hot(labels, depth=params["n_classes"], on_value=True, off_value=False, dtype=tf.bool)
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(labels, pred_indices),
            "auroc": tf.metrics.auc(labels_one_hot, probs)
        }
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    

#### Pretrained embeddings

In [7]:
emb_dim = 300
model_dir = r"D:\word_embedding\binary\embeddings3"
column = tf.feature_column.categorical_column_with_identity('x', vocab_size)
word_embedding_column = tf.feature_column.embedding_column(column, dimension=300)

In [8]:
# load pretrained embeddings
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\word2vec\GoogleNews-vectors-negative300.bin',
#                                             binary=True)
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\glove\test_word2vec.txt',
#                                             binary=False)
# en_model = KeyedVectors.load_word2vec_format(r'D:\nlp_resources\fasttext\wiki.en.vec',
#                                             binary=False)

en_model = KeyedVectors.load_word2vec_format(r'D:\word_embedding\vectors2_w2v.txt',
                                            binary=False, encoding="UTF-8")

In [9]:
emb_dim = en_model.vector_size
embedding_matrix = np.zeros((vocab_size, emb_dim))
for word, i in t.word_index.items():
    try:
        embedding_matrix[i] = en_model.get_vector(word)
    except KeyError:
        continue

In [10]:
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    assert dtype is tf.float32
    return embedding_matrix

pretrained_embedding_column = tf.feature_column.embedding_column(column, 
                                                                dimension=300,
                                                                initializer=my_initializer,
                                                                trainable=False)

#### Cross validation

In [11]:
f1 = []
accuracy = []
skf = StratifiedKFold(n_splits=10, random_state=seed)
fold = 0
for train_idx, test_idx in skf.split(padded_seq, y_single):
    print(fold)
    fold+=1
    x_train = padded_seq[train_idx]
    x_test = padded_seq[test_idx]
    y_train = y_single[train_idx]
    y_test = y_single[test_idx]
    
    rmtree(model_dir, ignore_errors=True)

    params = {'embedding_initializer': my_initializer,
#         'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0),
              'n_classes':3}
    classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                       model_dir=model_dir,
                                       params=params)
    classifier.train(input_fn=lambda :train_input_fn(x_train, y_train, batch_size, seed, epochs, train_size), steps=steps)
    pred =  classifier.predict(input_fn=lambda : eval_input_fn(x_test, y_test, batch_size))
    y_true = y_single[test_idx]
    y_pred = [p["class"] for p in pred]
    f1.append(f1_score(y_true, y_pred, average="weighted"))
    accuracy.append(accuracy_score(y_true, y_pred))
print(round(np.mean(f1), 4), round(np.mean(accuracy), 4))

0
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'D:\\word_embedding\\binary\\embeddings3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000022D5D423828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Concatenated shape: (?, 123, 100)
Pooled shape: (?, 100)
Output shape: (?, 3)
