In [1]:
import pickle
import shutil
import sys
import time

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# The RNN implementation
import processing
import rnnlm

import nltk
nltk.download("punkt")

sys.path.append("..")
from features import common

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def train_model(corpus, training_data, model_name, model_params, max_time, batch_size,
                learning_rate, keep_prob, num_epochs):    
    trained_filename = 'tf_saved/rnnlm_%s' % model_name
    
    # Will print status every this many seconds
    print_interval = 5

    # Clear old log directory
    shutil.rmtree("tf_summaries", ignore_errors=True)

    with tf.Graph().as_default(), tf.Session() as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)
  
        with tf.variable_scope("model", reuse=None):
            lm = rnnlm.RNNLM(model_params)
            lm.BuildCoreGraph()
            lm.BuildTrainGraph()
  
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()
  
        for epoch in xrange(1,num_epochs+1):
            t0_epoch = time.time()
            bi = processing.batch_generator(training_data["train_ids"], batch_size, max_time)
            print "[epoch %d] Starting epoch %d" % (epoch, epoch)

            # Run a training epoch.
            lm.RunEpoch(session, bi, train=True, verbose=True, keep_prob=keep_prob)

            print "[epoch %d] Completed in %s" % (epoch, rnnlm.pretty_timedelta(since=t0_epoch))

            print ("[epoch %d]" % epoch),
            lm.ScoreDataset(session, training_data["test_ids"], name="Test set")
            print ""

            # Save a checkpoint
            saver.save(session, 'tf_saved/rnnlm', global_step=epoch)
    
        # Save final model
        saver.save(session, trained_filename)
        
    print "Done training."

In [3]:
print "Loading datasets..."
with open("/usr/src/app/data/ka-comments-balanced.pickle", "rb") as f:
    comments_dataset = pickle.load(f)
print "Done."

Loading datasets...
Done.


In [34]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Model hyperparameters; V is also passed to processing.Corpus when the corpus is built.
model_params = rnnlm.RNNParams(V=100000, H=250, num_layers=2)

# Keyword arguments splatted into train_model(); names must match its parameters.
train_params = dict(
    max_time=20,
    batch_size=50,
    learning_rate=0.0025,
    keep_prob=0.5,
    num_epochs=2
)

In [35]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Build the corpus from the "content" column of the training split, passing
# model_params.V as the second argument (printed output: "Building vocabulary...").
corpus = processing.Corpus(
    comments_dataset["train_data"]["content"],
    model_params.V
)

Building vocabulary...
Done.


In [37]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Small smoke-test dataset: only the first 5000 comments of the training split.
mini_training_data = corpus.generate_training_data(
    comments_dataset["train_data"]["content"].iloc[:5000],
    train_frac=0.8)

Finding sentences...
Processing sentences...
Loaded 10645 sentences (172838 tokens)
Training set: 8516 sentences (137780 tokens)
Test set: 2129 sentences (35058 tokens)
Done.


In [38]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Quick end-to-end run on the small dataset; the final model is saved with the "test" suffix.
train_model(
    corpus=corpus,
    training_data=mini_training_data,
    model_name="test",
    model_params=model_params,
    **train_params
)

[epoch 1] Starting epoch 1
[batch 39]: seen 40000 words at 3924 wps, loss = 7.297
[batch 86]: seen 87000 words at 4272 wps, loss = 6.560
[batch 123]: seen 124000 words at 4080 wps, loss = 6.306
[epoch 1] Completed in 0:00:35
[epoch 1] [batch 2]: seen 30000 words at 2491 wps, loss = 6.461
Test set: avg. loss: 6.510  (perplexity: 671.49)

[epoch 2] Starting epoch 2
[batch 46]: seen 47000 words at 4693 wps, loss = 5.537
[batch 94]: seen 95000 words at 4701 wps, loss = 5.428
[batch 142]: seen 143000 words at 4718 wps, loss = 5.403
[epoch 2] Completed in 0:00:31
[epoch 2] Test set: avg. loss: 4.868  (perplexity: 130.10)

Done training.


In [39]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Training data for the "true" model: text of comments whose hasVotes flag is True.
true_training_data = corpus.generate_training_data(
    comments_dataset["train_data"].loc[
        comments_dataset["train_data"]["hasVotes"] == True, "content"],
    train_frac=0.95)

Finding sentences...
Processing sentences...
Loaded 600538 sentences (1.02437e+07 tokens)
Training set: 570511 sentences (9733612 tokens)
Test set: 30027 sentences (510114 tokens)
Done.


In [40]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Train on the hasVotes == True comments; the final model is saved with the "true_all" suffix.
train_model(
    corpus=corpus,
    training_data=true_training_data,
    model_name="true_all",
    model_params=model_params,
    **train_params
)

[epoch 1] Starting epoch 1
[batch 28]: seen 29000 words at 2892 wps, loss = 8.020
[batch 63]: seen 64000 words at 3170 wps, loss = 7.078
[batch 101]: seen 102000 words at 3371 wps, loss = 6.641
[batch 137]: seen 138000 words at 3420 wps, loss = 6.395
[batch 171]: seen 172000 words at 3402 wps, loss = 6.241
[batch 208]: seen 209000 words at 3439 wps, loss = 6.106
[batch 245]: seen 246000 words at 3468 wps, loss = 6.006
[batch 280]: seen 281000 words at 3470 wps, loss = 5.952
[batch 312]: seen 313000 words at 3439 wps, loss = 5.899
[batch 341]: seen 342000 words at 3383 wps, loss = 5.849
[batch 372]: seen 373000 words at 3354 wps, loss = 5.799
[batch 401]: seen 402000 words at 3308 wps, loss = 5.754
[batch 428]: seen 429000 words at 3257 wps, loss = 5.716
[batch 458]: seen 459000 words at 3236 wps, loss = 5.678
[batch 488]: seen 489000 words at 3220 wps, loss = 5.649
[batch 520]: seen 521000 words at 3213 wps, loss = 5.616
[batch 550]: seen 551000 words at 3194 wps, loss = 5.586
[batch 5

In [43]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Training data for the "false" model: text of comments whose hasVotes flag is False.
false_training_data = corpus.generate_training_data(
    comments_dataset["train_data"].loc[
        comments_dataset["train_data"]["hasVotes"] == False, "content"],
    train_frac=0.95)

Finding sentences...
Processing sentences...
Loaded 464182 sentences (7.00312e+06 tokens)
Training set: 440972 sentences (6652497 tokens)
Test set: 23210 sentences (350627 tokens)
Done.


In [44]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)

# Train on the hasVotes == False comments; the final model is saved with the "false_all" suffix.
train_model(
    corpus=corpus,
    training_data=false_training_data,
    model_name="false_all",
    model_params=model_params,
    **train_params
)

[epoch 1] Starting epoch 1
[batch 39]: seen 40000 words at 3990 wps, loss = 7.225
[batch 88]: seen 89000 words at 4398 wps, loss = 6.510
[batch 136]: seen 137000 words at 4514 wps, loss = 6.199
[batch 181]: seen 182000 words at 4488 wps, loss = 6.024
[batch 226]: seen 227000 words at 4476 wps, loss = 5.890
[batch 275]: seen 276000 words at 4540 wps, loss = 5.816
[batch 323]: seen 324000 words at 4572 wps, loss = 5.738
[batch 371]: seen 372000 words at 4590 wps, loss = 5.658
[batch 419]: seen 420000 words at 4608 wps, loss = 5.594
[batch 467]: seen 468000 words at 4624 wps, loss = 5.538
[batch 517]: seen 518000 words at 4652 wps, loss = 5.495
[batch 566]: seen 567000 words at 4669 wps, loss = 5.451
[batch 615]: seen 616000 words at 4682 wps, loss = 5.412
[batch 664]: seen 665000 words at 4695 wps, loss = 5.379
[batch 712]: seen 713000 words at 4702 wps, loss = 5.347
[batch 761]: seen 762000 words at 4708 wps, loss = 5.319
[batch 809]: seen 810000 words at 4710 wps, loss = 5.293
[batch 8

In [45]:
reload(rnnlm)
reload(processing)
sents = ["the quick brown fox jumps over the lazy dog",
         "the fox quick brown jumps over the lazy dog",
        "the fox quick brown jumps dog over the lazy"]
print "Scores for TRUE model:"
rnnlm.load_and_score([s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_true_all")
print "Scores for FALSE model:"
rnnlm.load_and_score([s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_false_all")

Scores for TRUE model:
"the quick brown fox jumps over the lazy dog" : -72.27
"the fox quick brown jumps over the lazy dog" : -75.11
"the fox quick brown jumps dog over the lazy" : -77.54
Scores for FALSE model:
"the quick brown fox jumps over the lazy dog" : -75.60
"the fox quick brown jumps over the lazy dog" : -77.89
"the fox quick brown jumps dog over the lazy" : -78.64


In [51]:
import scipy.sparse

class RNNClassifierFeatureExtractor(object):
    """Feature extractor that scores comments with two trained RNN language models.

    Every input comment becomes one two-column row: its score under the model
    restored from ``true_model_path`` and its score under the model restored
    from ``false_model_path``.
    """

    def __init__(self, corpus, model_params,
                 true_model_path="tf_saved/rnnlm_true_all",
                 false_model_path="tf_saved/rnnlm_false_all"):
        """Store the corpus, the model parameters and the two checkpoint paths.

        Args:
            corpus: Object whose ``vocab`` attribute is passed to ``lm.ScoreSeq``.
            model_params: Parameter object handed to ``rnnlm.RNNLM``.
            true_model_path: Checkpoint restored for the first feature column.
            false_model_path: Checkpoint restored for the second feature column.
        """
        self.corpus = corpus
        self.model_params = model_params
        self.true_model_path = true_model_path
        self.false_model_path = false_model_path

    def score_sentences(self, inputs, trained_filename):
        """Score every item of ``inputs`` with the model saved at ``trained_filename``.

        Each item is split with ``processing.tokenize_sentences`` and the values
        returned by ``lm.ScoreSeq`` are subtracted from a running total that starts
        at 0 (presumably turning negative log-probabilities into non-negative
        scores -- confirm against rnnlm.ScoreSeq).

        Args:
            inputs: Sized iterable of comment texts.
            trained_filename: Checkpoint path given to ``saver.restore``.

        Returns:
            A list with one accumulated score per input item, in input order.
        """
        with tf.Graph().as_default(), tf.Session() as session:
            with tf.variable_scope("model", reuse=None):
                lm = rnnlm.RNNLM(self.model_params)
                lm.BuildCoreGraph()

            # Load the trained model
            saver = tf.train.Saver()
            saver.restore(session, trained_filename)

            # Actually run scoring
            results = []
            for idx, s in enumerate(inputs):
                score = 0
                for sent in processing.tokenize_sentences(s):
                    score -= lm.ScoreSeq(session, sent, self.corpus.vocab)
                results.append(score)
                # Progress line for every 100th item.
                if idx % 100 == 0:
                    print("  %d / %d (%f)" % (idx, len(inputs), score))

        return results

    def train(self, train_data):
        """Return the features for ``train_data``.

        No fitting happens here: the language models are trained separately, so
        this simply delegates to ``transform``.
        """
        return self.transform(train_data)

    def transform(self, test_data):
        """Build the (n_items, 2) sparse feature matrix for ``test_data["content"]``.

        Column 0 holds the scores from ``true_model_path`` and column 1 the scores
        from ``false_model_path``. Empty input yields a matrix of shape (0, 2).
        """
        print("True scores...")
        true_scores = self.score_sentences(test_data["content"], self.true_model_path)
        print("False scores...")
        false_scores = self.score_sentences(test_data["content"], self.false_model_path)
        # column_stack keeps the two-column shape even when there are no rows,
        # whereas a csr_matrix built from an empty list of rows has shape (1, 0).
        return scipy.sparse.csr_matrix(np.column_stack([true_scores, false_scores]))

In [52]:
# Re-import the local modules so edits made on disk are picked up before use.
reload(rnnlm)
reload(processing)
reload(common)

# Run the RNN feature extractor over the "ka-comments-balanced" dataset under the
# feature name "all-rnn". sampling=0.025 is presumably the fraction of the dataset
# to process -- confirm in features/common.
common.extract_features(
    "ka-comments-balanced",
    RNNClassifierFeatureExtractor(corpus, model_params),
    "all-rnn",
    sampling=0.025
)

Loading ka-comments-balanced dataset.
Training feature extractor all-rnn.
True scores...
  0 / 12206 (275.568932)
  100 / 12206 (125.487877)
  200 / 12206 (181.134043)
  300 / 12206 (25.101330)
  400 / 12206 (160.682663)
  500 / 12206 (136.817825)
  600 / 12206 (434.871063)
  700 / 12206 (44.882080)
  800 / 12206 (70.830757)
  900 / 12206 (38.050606)
  1000 / 12206 (161.857285)
  1100 / 12206 (53.129695)
  1200 / 12206 (144.602692)
  1300 / 12206 (78.797718)
  1400 / 12206 (206.971590)
  1500 / 12206 (238.381577)
  1600 / 12206 (42.337990)
  1700 / 12206 (33.927345)
  1800 / 12206 (55.874741)
  1900 / 12206 (39.610088)
  2000 / 12206 (91.220322)
  2100 / 12206 (22.809296)
  2200 / 12206 (145.533447)
  2300 / 12206 (231.976738)
  2400 / 12206 (49.401817)
  2500 / 12206 (256.009563)
  2600 / 12206 (369.585513)
  2700 / 12206 (79.510071)
  2800 / 12206 (268.714340)
  2900 / 12206 (141.187340)
  3000 / 12206 (78.467377)
  3100 / 12206 (169.388927)
  3200 / 12206 (81.549629)
  3300 / 12206 

In [53]:
# Re-import the shared feature helpers so edits made on disk are picked up, then
# evaluate the "all-rnn" features; the captured output reports the accuracy of
# each classifier trained by common.test_features.
reload(common)
common.test_features("all-rnn")

Loading features all-rnn.
Training models.
##        MultinomialNB         all-rnn accuracy: 62.9 %
##            LinearSVC         all-rnn accuracy: 56.3 %
##                  MLP         all-rnn accuracy: 63.3 %
##                 MLP2         all-rnn accuracy: 64.9 %
