In [29]:
import os
import pickle
import shutil
import sys
import time

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# The RNN implementation
import processing
import rnnlm

import nltk
nltk.download("punkt")

sys.path.append("..")
from features import common

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def train_model(corpus, training_data, model_name, model_params, max_time, batch_size,
                learning_rate, keep_prob, num_epochs):
    """Train an RNN language model and save it as ``tf_saved/rnnlm_<model_name>``.

    Args:
        corpus: not used by this function; kept so existing callers keep working.
        training_data: mapping whose "train_ids" entry is fed to
            processing.batch_generator and whose "test_ids" entry is scored
            after every epoch.
        model_name: suffix of the final saved model file.
        model_params: passed unchanged to rnnlm.RNNLM.
        max_time: forwarded to processing.batch_generator.
        batch_size: forwarded to processing.batch_generator.
        learning_rate: accepted but NOT forwarded anywhere in this function.
            NOTE(review): RunEpoch's signature is not visible here, so this value
            currently has no effect on training -- confirm how rnnlm expects it.
        keep_prob: forwarded to lm.RunEpoch.
        num_epochs: number of passes over the training ids.

    Side effects:
        Deletes the "tf_summaries" directory, creates "tf_saved" if it is missing,
        writes one checkpoint per epoch (prefix "tf_saved/rnnlm", global_step=epoch)
        plus the final model, and prints progress.
    """
    checkpoint_dir = 'tf_saved'
    trained_filename = 'tf_saved/rnnlm_%s' % model_name

    # tf.train.Saver.save() raises if the parent directory does not exist,
    # so make sure it is there before any training time is spent.
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Clear old log directory
    shutil.rmtree("tf_summaries", ignore_errors=True)

    with tf.Graph().as_default(), tf.Session() as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)

        with tf.variable_scope("model", reuse=None):
            lm = rnnlm.RNNLM(model_params)
            lm.BuildCoreGraph()
            lm.BuildTrainGraph()

        # NOTE(review): newer TensorFlow 1.x releases deprecate this call in
        # favour of tf.global_variables_initializer(); kept for compatibility
        # with the TensorFlow version this notebook was written against.
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()

        for epoch in xrange(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = processing.batch_generator(training_data["train_ids"], batch_size, max_time)
            print("[epoch %d] Starting epoch %d" % (epoch, epoch))

            # Run a training epoch.
            lm.RunEpoch(session, bi, train=True, verbose=True, keep_prob=keep_prob)

            print("[epoch %d] Completed in %s" % (epoch, rnnlm.pretty_timedelta(since=t0_epoch)))

            # Trailing comma (Python 2 print statement) suppresses the newline so
            # the ScoreDataset output continues on the same line as the epoch tag.
            print ("[epoch %d]" % epoch),
            lm.ScoreDataset(session, training_data["test_ids"], name="Test set")
            print("")

            # Save a checkpoint
            saver.save(session, 'tf_saved/rnnlm', global_step=epoch)

        # Save final model
        saver.save(session, trained_filename)

    print("Done training.")

In [3]:
import scipy.sparse

class RNNClassifierFeatureExtractor(object):
    """Feature extractor that scores text with two saved RNN language models.

    Every input text yields one row with two values: its score under the model
    saved as ``tf_saved/rnnlm_<true_model>`` and its score under the model saved
    as ``tf_saved/rnnlm_<false_model>``.
    """

    def __init__(self, corpus, model_params, true_model, false_model):
        """Remember the corpus, the model parameters and both model names."""
        self.corpus, self.model_params = corpus, model_params
        self.true_model, self.false_model = true_model, false_model

    def score_sentences(self, inputs, trained_filename):
        """Score each element of ``inputs`` with the model stored at ``trained_filename``.

        A fresh graph and session are created, the core RNNLM graph is rebuilt
        from ``self.model_params`` and its weights are restored from the file.
        For each input, the values returned by ``lm.ScoreSeq`` for every sentence
        produced by ``processing.tokenize_sentences`` are subtracted from a
        running total that starts at 0.

        Returns:
            list with one accumulated score per input, in input order.
        """
        with tf.Graph().as_default(), tf.Session() as session:
            with tf.variable_scope("model", reuse=None):
                model = rnnlm.RNNLM(self.model_params)
                model.BuildCoreGraph()

            # Restore the trained weights into the freshly built graph.
            tf.train.Saver().restore(session, trained_filename)

            scores = []
            for position, text in enumerate(inputs):
                total = 0
                for sentence in processing.tokenize_sentences(text):
                    total -= model.ScoreSeq(session, sentence, self.corpus.vocab)
                scores.append(total)
                # Progress line for every 100th input.
                if not position % 100:
                    print("  %d / %d (%f)" % (position, len(inputs), total))

        return scores

    def train(self, train_data):
        """No fitting happens here; this simply returns ``transform(train_data)``."""
        return self.transform(train_data)

    def transform(self, test_data):
        """Return a 2-column CSR matrix: (true-model score, false-model score) per row.

        ``test_data["content"]`` supplies the texts to score.
        """
        passes = (
            ("True scores...", self.true_model),
            ("False scores...", self.false_model),
        )
        columns = []
        for banner, model_name in passes:
            print(banner)
            columns.append(
                self.score_sentences(test_data["content"], "tf_saved/rnnlm_%s" % model_name))

        true_scores, false_scores = columns
        rows = [list(pair) for pair in zip(true_scores, false_scores)]
        return scipy.sparse.csr_matrix(rows)

# hasVotes-based model

In [33]:
# Load the balanced comments dataset from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
print("Loading dataset...")
with open("/usr/src/app/data/ka-comments-balanced.pickle", "rb") as f:
    comments_dataset = pickle.load(f)
print("Done.")

Loading dataset...
Done.


In [34]:
# Pick up any edits made to the local modules since they were last imported.
reload(rnnlm)
reload(processing)

# Model hyper-parameters; model_params.V is also passed to processing.Corpus.
model_params = rnnlm.RNNParams(V=100000, H=250, num_layers=2)

# Keyword arguments later unpacked into train_model(..., **train_params).
train_params = dict(
    max_time=20,
    batch_size=50,
    learning_rate=0.0025,
    keep_prob=0.5,
    num_epochs=2,
)

In [35]:
# Pick up any edits made to the local modules since they were last imported.
reload(rnnlm)
reload(processing)
# Build the corpus from the "content" column of the comments training data;
# the recorded output shows this step building the vocabulary.
corpus = processing.Corpus(comments_dataset["train_data"]["content"], model_params.V)

Building vocabulary...
Done.


In [None]:
reload(rnnlm)
reload(processing)
# Small smoke-test set: the first 5000 comment texts, with train_frac=0.8.
mini_training_data = corpus.generate_training_data(
    comments_dataset["train_data"]["content"].iloc[:5000],
    train_frac=0.8,
)

In [None]:
reload(rnnlm)
reload(processing)
# Quick end-to-end training run on the mini set; saved as tf_saved/rnnlm_test.
train_model(
    corpus=corpus,
    training_data=mini_training_data,
    model_name="test",
    model_params=model_params,
    **train_params)

In [None]:
reload(rnnlm)
reload(processing)
# Training data built only from comments whose hasVotes flag equals True.
true_training_data = corpus.generate_training_data(
    comments_dataset["train_data"].loc[
        comments_dataset["train_data"]["hasVotes"] == True, "content"],
    train_frac=0.95,
)

In [None]:
reload(rnnlm)
reload(processing)
# Language model for the hasVotes == True comments; saved as tf_saved/rnnlm_true_all.
train_model(
    corpus=corpus,
    training_data=true_training_data,
    model_name="true_all",
    model_params=model_params,
    **train_params)

In [None]:
reload(rnnlm)
reload(processing)
# Training data built only from comments whose hasVotes flag equals False.
false_training_data = corpus.generate_training_data(
    comments_dataset["train_data"].loc[
        comments_dataset["train_data"]["hasVotes"] == False, "content"],
    train_frac=0.95,
)

In [None]:
reload(rnnlm)
reload(processing)
# Language model for the hasVotes == False comments; saved as tf_saved/rnnlm_false_all.
train_model(
    corpus=corpus,
    training_data=false_training_data,
    model_name="false_all",
    model_params=model_params,
    **train_params)

In [None]:
reload(rnnlm)
reload(processing)
# Sanity check: one natural sentence and two progressively more scrambled variants,
# scored by both saved hasVotes models.
sents = [
    "the quick brown fox jumps over the lazy dog",
    "the fox quick brown jumps over the lazy dog",
    "the fox quick brown jumps dog over the lazy",
]
print("Scores for TRUE model:")
rnnlm.load_and_score(
    [s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_true_all")
print("Scores for FALSE model:")
rnnlm.load_and_score(
    [s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_false_all")

In [None]:
reload(rnnlm)
reload(processing)
reload(common)
# Extract the two RNN scores for the balanced comments dataset as feature set "all-rnn".
common.extract_features(
    "ka-comments-balanced",
    RNNClassifierFeatureExtractor(
        corpus=corpus,
        model_params=model_params,
        true_model="true_all",
        false_model="false_all"),
    "all-rnn",
    sampling=0.025)

In [30]:
# Reload the shared helpers, then evaluate the "all-rnn" feature set; the recorded
# output reports precision/recall for MultinomialNB, LinearSVC, MLP and MLP2.
reload(common)
common.test_features("all-rnn")

Loading features all-rnn.
Training models.
##        MultinomialNB         all-rnn precision: 62.9% recall: 62.9%
##            LinearSVC         all-rnn precision: 55.4% recall: 93.2%
##                  MLP         all-rnn precision: 65.1% recall: 60.1%
##                 MLP2         all-rnn precision: 66.9% recall: 53.8%


In [36]:
reload(rnnlm)
reload(processing)
reload(common)
# Score the balanced replies dataset with the models trained on comments
# ("true_all" / "false_all") and store the result as feature set "reply-rnn".
common.extract_features(
    "ka-replies-balanced",
    RNNClassifierFeatureExtractor(
        corpus=corpus,
        model_params=model_params,
        true_model="true_all",
        false_model="false_all"),
    "reply-rnn",
    sampling=0.025)

Loading ka-replies-balanced dataset.
Training feature extractor reply-rnn.
True scores...
  0 / 7705 (201.533875)
  100 / 7705 (69.320618)
  200 / 7705 (232.644455)
  300 / 7705 (1046.712555)
  400 / 7705 (914.276390)
  500 / 7705 (1117.368958)
  600 / 7705 (149.922226)
  700 / 7705 (324.027397)
  800 / 7705 (192.422653)
  900 / 7705 (339.794617)
  1000 / 7705 (379.480927)
  1100 / 7705 (658.827087)
  1200 / 7705 (230.566313)
  1300 / 7705 (207.059769)
  1400 / 7705 (472.427658)
  1500 / 7705 (60.874001)
  1600 / 7705 (514.879982)
  1700 / 7705 (392.418266)
  1800 / 7705 (471.909561)
  1900 / 7705 (273.428284)
  2000 / 7705 (450.746002)
  2100 / 7705 (660.454025)
  2200 / 7705 (120.546432)
  2300 / 7705 (911.398396)
  2400 / 7705 (84.223907)
  2500 / 7705 (141.128983)
  2600 / 7705 (380.456436)
  2700 / 7705 (296.025970)
  2800 / 7705 (427.473694)
  2900 / 7705 (323.535736)
  3000 / 7705 (228.716812)
  3100 / 7705 (127.822060)
  3200 / 7705 (137.855209)
  3300 / 7705 (888.512474)
  340

In [37]:
# Reload the shared helpers, then evaluate the "reply-rnn" feature set.
# NOTE(review): the recorded output shows MultinomialNB at 0.0% precision/recall
# for this feature set -- worth investigating before relying on it.
reload(common)
common.test_features("reply-rnn")

Loading features reply-rnn.
Training models.
##        MultinomialNB       reply-rnn precision: 0.0% recall: 0.0%
##            LinearSVC       reply-rnn precision: 49.1% recall: 62.2%
##                  MLP       reply-rnn precision: 48.2% recall: 61.6%
##                 MLP2       reply-rnn precision: 49.0% recall: 64.3%


# Rank-based model

In [4]:
# Load the balanced rank dataset from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
print("Loading dataset...")
with open("/usr/src/app/data/ka-rank-balanced.pickle", "rb") as f:
    ranks_dataset = pickle.load(f)
print("Done.")

Loading dataset...
Done.


In [11]:
# Pick up any edits made to the local modules since they were last imported.
reload(rnnlm)
reload(processing)

# Rebinds model_params / train_params (same values as in the hasVotes section)
# so the rank-based section can be run on its own.
model_params = rnnlm.RNNParams(V=100000, H=250, num_layers=2)
train_params = dict(
    max_time=20,
    batch_size=50,
    learning_rate=0.0025,
    keep_prob=0.5,
    num_epochs=2,
)

In [12]:
# Pick up any edits made to the local modules since they were last imported.
reload(rnnlm)
reload(processing)
# Build a separate corpus from the "content" column of the rank training data;
# the recorded output shows this step building the vocabulary.
ranks_corpus = processing.Corpus(ranks_dataset["train_data"]["content"], model_params.V)

Building vocabulary...
Done.


In [13]:
reload(rnnlm)
reload(processing)
# Small smoke-test set: the first 5000 rank-dataset texts, with train_frac=0.8.
# Note: this rebinds mini_training_data from the hasVotes section.
mini_training_data = ranks_corpus.generate_training_data(
    ranks_dataset["train_data"]["content"].iloc[:5000],
    train_frac=0.8,
)

Finding sentences...
Processing sentences...
Loaded 13647 sentences (242364 tokens)
Training set: 10917 sentences (192928 tokens)
Test set: 2730 sentences (49436 tokens)
Done.


In [14]:
reload(rnnlm)
reload(processing)
# Quick end-to-end training run on the mini set; saved as tf_saved/rnnlm_ranks_test.
train_model(
    corpus=ranks_corpus,
    training_data=mini_training_data,
    model_name="ranks_test",
    model_params=model_params,
    **train_params)

[epoch 1] Starting epoch 1
[batch 46]: seen 47000 words at 4667 wps, loss = 7.306
[batch 96]: seen 97000 words at 4787 wps, loss = 6.649
[batch 142]: seen 143000 words at 4715 wps, loss = 6.416
[batch 191]: seen 192000 words at 4743 wps, loss = 6.228
[epoch 1] Completed in 0:00:43
[epoch 1] [batch 5]: seen 52100 words at 5130 wps, loss = 5.614
Test set: avg. loss: 5.614  (perplexity: 274.15)

[epoch 2] Starting epoch 2
[batch 46]: seen 47000 words at 4699 wps, loss = 5.512
[batch 94]: seen 95000 words at 4704 wps, loss = 5.428
[batch 142]: seen 143000 words at 4703 wps, loss = 5.376
[batch 186]: seen 187000 words at 4615 wps, loss = 5.345
[epoch 2] Completed in 0:00:43
[epoch 2] Test set: avg. loss: 4.951  (perplexity: 141.29)

Done training.


In [15]:
reload(rnnlm)
reload(processing)
# Training data built only from rows whose topRank flag equals True.
top_training_data = ranks_corpus.generate_training_data(
    ranks_dataset["train_data"].loc[
        ranks_dataset["train_data"]["topRank"] == True, "content"],
    train_frac=0.95,
)

Finding sentences...
Processing sentences...
Loaded 497273 sentences (8.36499e+06 tokens)
Training set: 472409 sentences (7947856 tokens)
Test set: 24864 sentences (417133 tokens)
Done.


In [16]:
reload(rnnlm)
reload(processing)
# Language model for the topRank == True rows; saved as tf_saved/rnnlm_top_ranks.
train_model(
    corpus=ranks_corpus,
    training_data=top_training_data,
    model_name="top_ranks",
    model_params=model_params,
    **train_params)

[epoch 1] Starting epoch 1
[batch 42]: seen 43000 words at 4296 wps, loss = 7.316
[batch 85]: seen 86000 words at 4273 wps, loss = 6.642
[batch 132]: seen 133000 words at 4406 wps, loss = 6.372
[batch 175]: seen 176000 words at 4360 wps, loss = 6.183
[batch 220]: seen 221000 words at 4375 wps, loss = 6.026
[batch 268]: seen 269000 words at 4440 wps, loss = 5.921
[batch 314]: seen 315000 words at 4460 wps, loss = 5.823
[batch 362]: seen 363000 words at 4490 wps, loss = 5.741
[batch 407]: seen 408000 words at 4479 wps, loss = 5.674
[batch 455]: seen 456000 words at 4508 wps, loss = 5.616
[batch 502]: seen 503000 words at 4524 wps, loss = 5.562
[batch 545]: seen 546000 words at 4499 wps, loss = 5.519
[batch 585]: seen 586000 words at 4459 wps, loss = 5.482
[batch 629]: seen 630000 words at 4447 wps, loss = 5.446
[batch 672]: seen 673000 words at 4431 wps, loss = 5.415
[batch 720]: seen 721000 words at 4452 wps, loss = 5.384
[batch 766]: seen 767000 words at 4457 wps, loss = 5.357
[batch 8

In [24]:
reload(rnnlm)
reload(processing)
# Training data built only from rows whose topRank flag equals False.
bottom_training_data = ranks_corpus.generate_training_data(
    ranks_dataset["train_data"].loc[
        ranks_dataset["train_data"]["topRank"] == False, "content"],
    train_frac=0.95,
)

Finding sentences...
Processing sentences...
Loaded 414819 sentences (6.58562e+06 tokens)
Training set: 394078 sentences (6253483 tokens)
Test set: 20741 sentences (332133 tokens)
Done.


In [25]:
reload(rnnlm)
reload(processing)
# Language model for the topRank == False rows; saved as tf_saved/rnnlm_bottom_ranks.
train_model(
    corpus=ranks_corpus,
    training_data=bottom_training_data,
    model_name="bottom_ranks",
    model_params=model_params,
    **train_params)

[epoch 1] Starting epoch 1
[batch 46]: seen 47000 words at 4667 wps, loss = 7.374
[batch 93]: seen 94000 words at 4649 wps, loss = 6.684
[batch 145]: seen 146000 words at 4803 wps, loss = 6.410
[batch 194]: seen 195000 words at 4818 wps, loss = 6.205
[batch 243]: seen 244000 words at 4822 wps, loss = 6.056
[batch 295]: seen 296000 words at 4871 wps, loss = 5.936
[batch 346]: seen 347000 words at 4896 wps, loss = 5.838
[batch 392]: seen 393000 words at 4852 wps, loss = 5.765
[batch 443]: seen 444000 words at 4877 wps, loss = 5.705
[batch 491]: seen 492000 words at 4869 wps, loss = 5.650
[batch 542]: seen 543000 words at 4887 wps, loss = 5.600
[batch 592]: seen 593000 words at 4876 wps, loss = 5.555
[batch 643]: seen 644000 words at 4889 wps, loss = 5.513
[batch 694]: seen 695000 words at 4900 wps, loss = 5.476
[batch 743]: seen 744000 words at 4895 wps, loss = 5.443
[batch 793]: seen 794000 words at 4898 wps, loss = 5.413
[batch 842]: seen 843000 words at 4896 wps, loss = 5.385
[batch 8

In [26]:
reload(rnnlm)
reload(processing)
# Sanity check: one natural sentence and two progressively more scrambled variants.
# The "TRUE"/"FALSE" labels below refer to the top_ranks / bottom_ranks models.
sents = [
    "the quick brown fox jumps over the lazy dog",
    "the fox quick brown jumps over the lazy dog",
    "the fox quick brown jumps dog over the lazy",
]
print("Scores for TRUE model:")
rnnlm.load_and_score(
    [s.split() for s in sents], ranks_corpus, model_params, "tf_saved/rnnlm_top_ranks")
print("Scores for FALSE model:")
rnnlm.load_and_score(
    [s.split() for s in sents], ranks_corpus, model_params, "tf_saved/rnnlm_bottom_ranks")

Scores for TRUE model:
"the quick brown fox jumps over the lazy dog" : -73.35
"the fox quick brown jumps over the lazy dog" : -75.83
"the fox quick brown jumps dog over the lazy" : -79.17
Scores for FALSE model:
"the quick brown fox jumps over the lazy dog" : -74.41
"the fox quick brown jumps over the lazy dog" : -76.30
"the fox quick brown jumps dog over the lazy" : -77.44


In [27]:
reload(rnnlm)
reload(processing)
reload(common)
# Extract the two RNN scores for the balanced rank dataset as feature set "ranks-rnn".
common.extract_features(
    "ka-rank-balanced",
    RNNClassifierFeatureExtractor(
        corpus=ranks_corpus,
        model_params=model_params,
        true_model="top_ranks",
        false_model="bottom_ranks"),
    "ranks-rnn",
    sampling=0.025)

Loading ka-rank-balanced dataset.
Training feature extractor ranks-rnn.
True scores...
  0 / 10178 (54.553959)
  100 / 10178 (202.424813)
  200 / 10178 (542.022327)
  300 / 10178 (531.714519)
  400 / 10178 (96.577904)
  500 / 10178 (46.527985)
  600 / 10178 (64.722961)
  700 / 10178 (301.493820)
  800 / 10178 (220.761566)
  900 / 10178 (21.050035)
  1000 / 10178 (134.259415)
  1100 / 10178 (116.186839)
  1200 / 10178 (456.439852)
  1300 / 10178 (103.805710)
  1400 / 10178 (525.499256)
  1500 / 10178 (476.682327)
  1600 / 10178 (171.589852)
  1700 / 10178 (51.491280)
  1800 / 10178 (120.448235)
  1900 / 10178 (14.712406)
  2000 / 10178 (103.330330)
  2100 / 10178 (62.872917)
  2200 / 10178 (91.369926)
  2300 / 10178 (307.247263)
  2400 / 10178 (66.381248)
  2500 / 10178 (340.174171)
  2600 / 10178 (37.261574)
  2700 / 10178 (344.294365)
  2800 / 10178 (421.615618)
  2900 / 10178 (482.847153)
  3000 / 10178 (19.225313)
  3100 / 10178 (549.985173)
  3200 / 10178 (65.415085)
  3300 / 10178

In [31]:
# Reload the shared helpers, then evaluate the "ranks-rnn" feature set; the recorded
# output reports precision/recall for MultinomialNB, LinearSVC, MLP and MLP2.
reload(common)
common.test_features("ranks-rnn")

Loading features ranks-rnn.
Training models.
##        MultinomialNB       ranks-rnn precision: 49.5% recall: 25.4%
##            LinearSVC       ranks-rnn precision: 49.2% recall: 89.6%
##                  MLP       ranks-rnn precision: 48.0% recall: 30.6%
##                 MLP2       ranks-rnn precision: 47.4% recall: 27.1%
