In [1]:
import numpy as np
import random
import tensorflow as tf

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

from tqdm import trange

In [2]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [3]:
sources = {
    '../datasets/train.c': 'TRAIN_CONTROVERSIAL',
    '../datasets/train.nc': 'TRAIN_UNCONTROVERSIAL',
    '../datasets/test.c': 'TEST_CONTROVERSIAL',
    '../datasets/test.nc': 'TEST_UNCONTROVERSIAL',
}

sentences = LabeledLineSentence(sources)

In [4]:
model = Doc2Vec(min_count=1, window=10, size=200, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())



In [5]:
model.train(sentences.sentences_perm(), total_examples=len(sentences.sentences), epochs=20)

In [6]:
model.most_similar('trump')

  """Entry point for launching an IPython kernel.


[('he', 0.8239975571632385),
 ('him', 0.7968652844429016),
 ('president', 0.791419506072998),
 ('his', 0.7529799342155457),
 ('putin', 0.7302655577659607),
 ('obama', 0.7178451418876648),
 ('this', 0.7112518548965454),
 ('that', 0.7025802135467529),
 ('.', 0.6967028975486755),
 ('donald', 0.6935111284255981)]

In [7]:
model.save('twitter.d2v')

In [8]:
num_train = 26302
num_train_uc = 346313

train_arrays = np.zeros((num_train * 2, 200))
train_labels = np.zeros(num_train * 2)

for i in range(num_train):
    prefix_train_c = f"TRAIN_CONTROVERSIAL_{i}"
    train_arrays[i] = model[prefix_train_c]
    train_labels[i] = 1

for idx, i in enumerate(random.sample(range(num_train_uc), num_train)):
    prefix_train_uc = f"TRAIN_UNCONTROVERSIAL_{i}"
    ix = idx + num_train
    train_arrays[ix] = model[prefix_train_uc]
    train_labels[ix] = 0

# for i in range(12500):
#     prefix_train_pos = f"TRAIN_CONTROVERSIAL_{i}"
#     prefix_train_neg = f"TRAIN_UNCONTROVERSIAL_{i}"
#     train_arrays[i] = model[prefix_train_pos]
#     train_arrays[12500 + i] = model[prefix_train_neg]
#     train_labels[i] = 1
#     train_labels[12500 + i] = 0

In [9]:
num_test = 26156
num_test_uc = 347449

test_arrays = np.zeros((num_test * 2, 200))
test_labels = np.zeros(num_test * 2)

for i in range(num_test):
    prefix_test_c = f"TEST_CONTROVERSIAL_{i}"
    test_arrays[i] = model[prefix_test_c]
    test_labels[i] = 1

for idx, i in enumerate(random.sample(range(num_test_uc), num_test)):
    prefix_test_uc = f"TEST_UNCONTROVERSIAL_{i}"
    ix = idx + num_test
    test_arrays[ix] = model[prefix_test_uc]
    test_labels[ix] = 0

# test_arrays = np.zeros((25000, 100))
# test_labels = np.zeros(25000)

# for i in range(12500):
#     prefix_test_pos = 'TEST_CONTROVERSIAL_' + str(i)
#     prefix_test_neg = 'TEST_UNCONTROVERSIAL_' + str(i)
#     test_arrays[i] = model[prefix_test_pos]
#     test_arrays[12500 + i] = model[prefix_test_neg]
#     test_labels[i] = 1
#     test_labels[12500 + i] = 0

In [10]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
classifier.score(test_arrays, test_labels)

0.5733483713105979

In [22]:
tf.reset_default_graph()

# Placeholders

comments = tf.placeholder(
    tf.float32,
    (None, 200),
    name="comments",
)
labels = tf.placeholder(tf.int32, (None, 1), name="labels")

h1 = tf.layers.dense(comments, units=512, activation=tf.nn.relu)
h2 = tf.layers.dense(h1, units=512, activation=tf.nn.relu)
logits = tf.layers.dense(h2, units=2)

logits = tf.Print(logits, [logits])

# Loss optimization

cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
    logits=logits,
    labels=labels,
)
loss = tf.reduce_mean(cross_entropy)

optimizer = tf.train.AdamOptimizer()
train_op = optimizer.minimize(loss)

# Check accuracy

correct = tf.equal(tf.argmax(logits, 1, output_type=tf.int32), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
summary_op = tf.summary.merge_all()

In [26]:
with tf.Session() as sess:
#     logdir = os.path.join('summary', str(int(time.time())))
#     writer = tf.summary.FileWriter(logdir, sess.graph)
#     saver = tf.train.Saver()
    
    sess.run(tf.global_variables_initializer())

    for i in trange(int(1e5)):
        batch_idx = np.random.choice(len(train_arrays), 100)
        _, loss_val, accuracy_val = sess.run(
            [train_op, loss, accuracy],
            {
                comments: train_arrays[batch_idx],
                labels: train_labels[batch_idx][:, None],
            }
        )
        
#         if i % 100 == 0:
#             print(loss_val, accuracy_val)
#         writer.add_summary(summary, i)

    test_accuracy = sess.run(accuracy, {
        comments: test_arrays,
        labels: test_labels[:, None],
    })
    print(test_accuracy)

100%|██████████| 100000/100000 [05:35<00:00, 298.09it/s]


0.049046412
