In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import os
import time

from collections import namedtuple
from gensim.models import KeyedVectors, Word2Vec
from nltk.tokenize import word_tokenize, TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tqdm import tqdm, trange

# Switch on seaborn's default plot styling for any figures drawn later in the notebook.
sns.set()

In [2]:
# Pre-trained GloVe vectors stored in word2vec *text* format (hence binary=False).
glove_path = '../datasets/glove.840B.300d.w2vformat.txt'
word2vec = KeyedVectors.load_word2vec_format(glove_path, binary=False)

In [3]:
# Read the line-delimited JSON comment dump in chunks and stitch the chunks
# back together into a single frame.
comments_reader = pd.read_json(
    '../datasets/RC_2018-03.filtered',
    lines=True,
    chunksize=1e4,
)
df = comments_reader.read()

# Drop comments whose body is only a placeholder for deleted/removed text.
not_deleted = df.body != '[deleted]'
not_removed = df.body != '[removed]'
df = df[not_deleted & not_removed]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  obj = concat(self)


In [4]:
def get_embed_matrix(word2vec):
    """Build the embedding table used by the model.

    Args:
        word2vec: Vector store exposing ``vocab``, ``index2word``,
            ``vector_size`` and item lookup by word.

    Returns:
        Array of shape ``(len(word2vec.vocab) + 2, word2vec.vector_size)``.
        Row ``i`` holds the vector of ``word2vec.index2word[i]``; the two
        trailing rows keep their standard-normal random initialisation.
    """
    vocab_size = len(word2vec.vocab)

    # Start from random noise so the two extra rows past the vocabulary
    # are left with random values once the known vectors are copied in.
    matrix = np.random.randn(vocab_size + 2, word2vec.vector_size)

    for row in range(vocab_size):
        matrix[row] = word2vec[word2vec.index2word[row]]

    return matrix

# The padding id is the first index past the real vocabulary; it selects the
# first of the two extra rows that get_embed_matrix appends to the table.
PAD_ID = len(word2vec.vocab)
embed_matrix = get_embed_matrix(word2vec)

In [5]:
tokenizer = TweetTokenizer()


def _tokenize_comment(comment):
    """Lower-case a comment body and split it into tokens."""
    return tokenizer.tokenize(comment.lower())


def _known_token_ids(tokens):
    """Map tokens to their word2vec vocabulary indices.

    Note! Tokens that are not in the word2vec vocabulary are thrown away,
    so the result can be shorter than ``tokens``.
    """
    token_ids = []
    for token in tokens:
        if token in word2vec:
            token_ids.append(word2vec.vocab[token].index)
    return token_ids


df['body_tokens'] = df['body'].apply(_tokenize_comment)
df['body_token_ids'] = df['body_tokens'].apply(_known_token_ids)

# Split the comments by label: controversial vs. non-controversial.
cdf = df[df.controversiality > 0]
ncdf = df[df.controversiality == 0]

In [6]:
# The dataset is highly imbalanced, so draw (without replacement) exactly as
# many non-controversial comments as there are controversial ones.
ncdf_resampled = resample(
    ncdf,
    n_samples=len(cdf),
    replace=False,
    random_state=123456,
)

for report_line in (
    f"{len(cdf)} controversial examples",
    f"{len(ncdf_resampled)} uncontroversial examples",
):
    print(report_line)

52574 controversial examples
52574 uncontroversial examples


In [7]:
# Stack the two balanced classes and split them into train/test sets using
# scikit-learn's default proportions. The split is seeded (same seed as the
# resampling step) so that Restart & Run All reproduces the same partition;
# previously every run produced a different train/test split.
combined_df = pd.concat([cdf, ncdf_resampled])
train_df, test_df = train_test_split(combined_df, random_state=123456)

In [96]:
# One training batch: raw tokens (for inspection), padded token-id matrix,
# per-comment token counts and per-comment boolean labels.
Batch = namedtuple(
    'Batch',
    ['comment_tokens', 'comment_token_ids', 'comment_lens', 'comment_controversialities'],
)

def get_batch(train_df, batch_size=100, max_comment_len=100, pad_id=None, random_state=None):
    """Sample a random batch of comments and pack it into arrays.

    Args:
        train_df: DataFrame with ``body_tokens``, ``body_token_ids`` and
            ``controversiality`` columns.
        batch_size: Number of rows to sample (without replacement).
        max_comment_len: Width of the token-id matrix; longer comments are
            truncated, shorter ones are right-padded with ``pad_id``.
        pad_id: Token id used for padding. Defaults to the notebook-level
            ``PAD_ID`` so existing calls behave as before.
        random_state: Forwarded to ``DataFrame.sample``; pass a seed to get a
            reproducible batch.

    Returns:
        Batch with
          * ``comment_tokens``: the sampled ``body_tokens`` Series,
          * ``comment_token_ids``: int array ``(batch_size, max_comment_len)``,
          * ``comment_lens``: int array of the *untruncated* token-id counts
            (may exceed ``max_comment_len``),
          * ``comment_controversialities``: bool array, ``controversiality > 0``.
    """
    if pad_id is None:
        # Fall back to the module-level constant for backward compatibility.
        pad_id = PAD_ID

    sampled = train_df.sample(batch_size, random_state=random_state)

    comment_tokens = sampled.body_tokens

    # Pre-fill with the padding id, then overwrite the leading positions of
    # each row with that comment's (possibly truncated) token ids.
    comment_token_ids = np.full((len(sampled), max_comment_len), pad_id, dtype=np.int64)
    for row, token_ids in enumerate(sampled.body_token_ids):
        kept = token_ids[:max_comment_len]
        if len(kept):
            comment_token_ids[row, :len(kept)] = kept

    comment_lens = np.array(
        [len(token_ids) for token_ids in sampled.body_token_ids],
        dtype=np.int64,
    )
    comment_controversialities = np.array(sampled.controversiality > 0)

    return Batch(comment_tokens, comment_token_ids, comment_lens, comment_controversialities)

In [97]:
def create_graph(embed_matrix, max_comment_len=100):
    """Build the TF1 graph of the bidirectional-LSTM controversiality classifier.

    The graph is added to the current default graph. Inputs are placeholders
    that callers feed by tensor name: ``"word_embeddings:0"``,
    ``"comment_token_ids:0"``, ``"comment_lens:0"`` and
    ``"comment_controversialities:0"``.

    Args:
        embed_matrix: 2-D array; only its ``shape`` is used here, to size the
            ``word_embeddings`` placeholder (the values are fed at run time).
        max_comment_len: Number of token ids per padded/truncated comment.

    Returns:
        dict of graph ops/tensors:
          * ``'loss'``: mean cross-entropy over the batch,
          * ``'train'``: one Adam optimisation step on ``'loss'``,
          * ``'summary'``: merged scalar summaries (Loss, Accuracy),
          * ``'accuracy'``: fraction of correctly classified comments.
    """
    # Placeholders

    word_embeddings = tf.placeholder(tf.float32, embed_matrix.shape, name="word_embeddings")
    comment_token_ids = tf.placeholder(tf.int32, (None, max_comment_len), name="comment_token_ids")
    comment_lens = tf.placeholder(tf.int32, (None,), name="comment_lens")
    comment_controversialities = tf.placeholder(tf.int32, (None,), name="comment_controversialities")

    # Embedding lookup

    with tf.variable_scope('embed'):
        comment_vecs = tf.nn.embedding_lookup(word_embeddings, comment_token_ids)

    # RNN

    lstm_cell_size = 100

    with tf.variable_scope('lstm'):
        lstm_forward = tf.contrib.rnn.BasicLSTMCell(lstm_cell_size)
        lstm_backward = tf.contrib.rnn.BasicLSTMCell(lstm_cell_size)

        _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
            lstm_forward,
            lstm_backward,
            comment_vecs,
            # Reported lengths are untruncated, so clamp them to the window.
            sequence_length=tf.minimum(comment_lens, max_comment_len),
            dtype=tf.float32,
        )

        # Summarise each comment with the *final hidden states* of both
        # directions. Slicing the per-step outputs at index -1 is wrong here:
        # outputs past a comment's sequence_length are zeroed, so every
        # comment shorter than max_comment_len would yield an all-zero
        # vector, and for full-length comments the backward half would have
        # seen only a single token.
        rnn_out = tf.concat([fw_state.h, bw_state.h], axis=1)

    with tf.variable_scope('output'):
        logits = tf.contrib.layers.fully_connected(
            rnn_out,
            num_outputs=2,
            activation_fn=None,
        )

    # Loss optimization

    # Labels are integer class ids of shape (batch,), so the sparse variant
    # is required; the dense variant expects a per-class distribution with
    # the same shape as the logits.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=comment_controversialities,
        logits=logits,
    )
    loss = tf.reduce_mean(cross_entropy)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss)

    # Check accuracy

    correct = tf.equal(
        tf.argmax(logits, 1, output_type=tf.int32),
        comment_controversialities,
    )
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    summary_op = tf.summary.merge_all()

    return {
        'loss': loss,
        'train': train_op,
        'summary': summary_op,
        'accuracy': accuracy,
    }

In [98]:
# Clear the default graph before building, so re-running this cell starts from
# an empty graph instead of adding a second copy of the model to the old one.
tf.reset_default_graph()
# Dict of graph ops returned by create_graph (includes 'loss', 'train', 'summary').
graph_ops = create_graph(embed_matrix)

In [None]:
NUM_TRAIN_STEPS = int(1e4)
LOSS_REPORT_EVERY = 100   # refresh the displayed loss every N steps
WRITE_SUMMARIES = False   # set True to log TensorBoard summaries under ./summary/<timestamp>

with tf.Session() as sess:
    writer = None
    if WRITE_SUMMARIES:
        logdir = os.path.join('summary', str(int(time.time())))
        writer = tf.summary.FileWriter(logdir, sess.graph)

    sess.run(tf.global_variables_initializer())

    # Only fetch the merged summary when it is actually going to be written.
    fetches = [graph_ops['train'], graph_ops['loss']]
    if writer is not None:
        fetches.append(graph_ops['summary'])

    try:
        # A progress bar replaces printing one loss line per step, which
        # flooded the cell output with 10,000 lines.
        progress = trange(NUM_TRAIN_STEPS, desc='training')
        for i in progress:
            batch = get_batch(train_df)
            results = sess.run(
                fetches,
                {
                    "word_embeddings:0": embed_matrix,
                    "comment_token_ids:0": batch.comment_token_ids,
                    "comment_lens:0": batch.comment_lens,
                    "comment_controversialities:0": batch.comment_controversialities,
                }
            )
            loss = results[1]

            if writer is not None:
                writer.add_summary(results[2], i)

            if i % LOSS_REPORT_EVERY == 0:
                progress.set_postfix(loss=float(loss))
    finally:
        # Flush and release the event file even if training is interrupted.
        if writer is not None:
            writer.close()