In [1]:
import pickle
import shutil
import sys
import time

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf

# The RNN implementation
import processing
import rnnlm

import nltk
nltk.download("punkt")

sys.path.append("..")
from features import common

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
def train_model(corpus, training_data, model_name, model_params, max_time, batch_size,
                learning_rate, keep_prob, num_epochs):    
    trained_filename = 'tf_saved/rnnlm_%s' % model_name
    
    # Will print status every this many seconds
    print_interval = 5

    # Clear old log directory
    shutil.rmtree("tf_summaries", ignore_errors=True)

    with tf.Graph().as_default(), tf.Session() as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)
  
        with tf.variable_scope("model", reuse=None):
            lm = rnnlm.RNNLM(model_params)
            lm.BuildCoreGraph()
            lm.BuildTrainGraph()
  
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()
  
        for epoch in xrange(1,num_epochs+1):
            t0_epoch = time.time()
            bi = processing.batch_generator(training_data["train_ids"], batch_size, max_time)
            print "[epoch %d] Starting epoch %d" % (epoch, epoch)

            # Run a training epoch.
            lm.RunEpoch(session, bi, train=True, verbose=True, keep_prob=keep_prob)

            print "[epoch %d] Completed in %s" % (epoch, rnnlm.pretty_timedelta(since=t0_epoch))

            print ("[epoch %d]" % epoch),
            lm.ScoreDataset(session, training_data["test_ids"], name="Test set")
            print ""

            # Save a checkpoint
            saver.save(session, 'tf_saved/rnnlm', global_step=epoch)
    
        # Save final model
        saver.save(session, trained_filename)
        
    print "Done training."

In [3]:
# Load the pickled comments dataset into `comments_dataset`; later cells index
# it as comments_dataset["train_data"] with "content" and "hasVotes" columns.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you
# trust. The path is an absolute location inside the container, so this cell
# is not portable as written.
print "Loading datasets..."
with open("/usr/src/app/data/ka-comments-balanced.pickle", "rb") as f:
    comments_dataset = pickle.load(f)
print "Done."

Loading datasets...
Done.


In [4]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)

# Model hyperparameters. V is later handed to processing.Corpus as its second
# argument (presumably the vocabulary size -- confirm in processing.py).
model_params = rnnlm.RNNParams(V=1000, H=100, num_layers=1)

# Keyword arguments that are splatted into train_model(...) further down.
train_params = dict(
    max_time=20,
    batch_size=50,
    learning_rate=0.25,
    keep_prob=0.5,
    num_epochs=1,
)

In [17]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
# Build the shared corpus from the "content" column of the training split.
# model_params.V is passed as the second argument (presumably the vocabulary
# size -- confirm in processing.Corpus). The same corpus object is reused for
# both the "true" and the "false" model below.
corpus = processing.Corpus(comments_dataset["train_data"]["content"], model_params.V)

Building vocabulary...
Done.


In [19]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
# Training data for the "true" model: only the rows whose hasVotes flag is
# True, restricted to their "content" column.
true_training_data = corpus.generate_training_data(
    comments_dataset["train_data"][comments_dataset["train_data"]["hasVotes"] == True]["content"])

Finding sentences...
Processing sentences...
Loaded 600538 sentences (1.02437e+07 tokens)
Training set: 480430 sentences (8195446 tokens)
Test set: 120108 sentences (2048280 tokens)
Done.


In [22]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
# Train on the hasVotes == True subset; train_model saves the result as
# tf_saved/rnnlm_true_all. train_params supplies the remaining keyword args.
train_model(corpus, true_training_data, "true_all", model_params, **train_params)

[epoch 1] Starting epoch 1
[batch 98]: seen 99000 words at 9853 wps, loss = 4.690
[batch 208]: seen 209000 words at 10379 wps, loss = 4.451
[batch 319]: seen 320000 words at 10606 wps, loss = 4.331
[batch 430]: seen 431000 words at 10706 wps, loss = 4.258
[batch 545]: seen 546000 words at 10848 wps, loss = 4.199
[batch 659]: seen 660000 words at 10928 wps, loss = 4.152
[batch 771]: seen 772000 words at 10964 wps, loss = 4.121
[batch 877]: seen 878000 words at 10915 wps, loss = 4.094
[batch 991]: seen 992000 words at 10958 wps, loss = 4.070
[batch 1093]: seen 1094000 words at 10876 wps, loss = 4.051
[batch 1192]: seen 1193000 words at 10783 wps, loss = 4.035
[batch 1300]: seen 1301000 words at 10783 wps, loss = 4.019
[batch 1408]: seen 1409000 words at 10782 wps, loss = 4.003
[batch 1522]: seen 1523000 words at 10820 wps, loss = 3.989
[batch 1623]: seen 1624000 words at 10770 wps, loss = 3.977
[batch 1737]: seen 1738000 words at 10804 wps, loss = 3.965
[batch 1850]: seen 1851000 words a

In [23]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
# Training data for the "false" model: only the rows whose hasVotes flag is
# False, restricted to their "content" column.
false_training_data = corpus.generate_training_data(
    comments_dataset["train_data"][comments_dataset["train_data"]["hasVotes"] == False]["content"])

Finding sentences...
Processing sentences...
Loaded 464182 sentences (7.00312e+06 tokens)
Training set: 371345 sentences (5602901 tokens)
Test set: 92837 sentences (1400223 tokens)
Done.


In [24]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
# Train on the hasVotes == False subset; train_model saves the result as
# tf_saved/rnnlm_false_all. train_params supplies the remaining keyword args.
train_model(corpus, false_training_data, "false_all", model_params, **train_params)

[epoch 1] Starting epoch 1
[batch 110]: seen 111000 words at 11020 wps, loss = 4.585
[batch 218]: seen 219000 words at 10893 wps, loss = 4.411
[batch 325]: seen 326000 words at 10803 wps, loss = 4.322
[batch 425]: seen 426000 words at 10579 wps, loss = 4.252
[batch 532]: seen 533000 words at 10594 wps, loss = 4.205
[batch 644]: seen 645000 words at 10685 wps, loss = 4.163
[batch 748]: seen 749000 words at 10636 wps, loss = 4.132
[batch 856]: seen 857000 words at 10656 wps, loss = 4.105
[batch 960]: seen 961000 words at 10621 wps, loss = 4.086
[batch 1064]: seen 1065000 words at 10591 wps, loss = 4.068
[batch 1171]: seen 1172000 words at 10593 wps, loss = 4.049
[batch 1253]: seen 1254000 words at 10394 wps, loss = 4.034
[batch 1341]: seen 1342000 words at 10270 wps, loss = 4.020
[batch 1437]: seen 1438000 words at 10222 wps, loss = 4.009
[batch 1542]: seen 1543000 words at 10238 wps, loss = 3.996
[batch 1641]: seen 1642000 words at 10212 wps, loss = 3.987
[batch 1736]: seen 1737000 word

In [25]:
reload(rnnlm)
reload(processing)
sents = ["the quick brown fox jumps over the lazy dog",
         "the fox quick brown jumps over the lazy dog",
        "the fox quick brown jumps dog over the lazy"]
print "Scores for TRUE model:"
rnnlm.load_and_score([s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_true_all")
print "Scores for FALSE model:"
rnnlm.load_and_score([s.split() for s in sents], corpus, model_params, "tf_saved/rnnlm_false_all")

Scores for TRUE model:
"the quick brown fox jumps over the lazy dog" : -17.79
"the fox quick brown jumps over the lazy dog" : -17.79
"the fox quick brown jumps dog over the lazy" : -18.07
Scores for FALSE model:
"the quick brown fox jumps over the lazy dog" : -21.76
"the fox quick brown jumps over the lazy dog" : -21.76
"the fox quick brown jumps dog over the lazy" : -21.76


In [28]:
import scipy.sparse

class RNNClassifierFeatureExtractor(object):
    """Feature extractor that scores each comment with two saved RNN language models.

    Every comment yields two features: its score under the "true" model and
    its score under the "false" model.

    Args:
        corpus: Object whose ``vocab`` attribute is handed to ``lm.ScoreSeq``.
        model_params: Passed unchanged to ``rnnlm.RNNLM`` when rebuilding the graph.
        true_model: Checkpoint path of the model used for the first column.
        false_model: Checkpoint path of the model used for the second column.
    """

    def __init__(self, corpus, model_params,
                 true_model="tf_saved/rnnlm_true_all",
                 false_model="tf_saved/rnnlm_false_all"):
        self.corpus = corpus
        self.model_params = model_params
        # Checkpoint paths are configurable; the defaults are the files that
        # train_model writes for the "true_all" and "false_all" models.
        self.true_model = true_model
        self.false_model = false_model

    def score_sentences(self, inputs, trained_filename):
        """Score every text in ``inputs`` with the model stored at ``trained_filename``.

        Each text is split with ``processing.tokenize_sentences``; the score of
        a text is the negated sum of ``lm.ScoreSeq`` over its sentences. A text
        that yields no sentences scores 0.

        Args:
            inputs: Sized iterable of texts (``len(inputs)`` is used for the
                progress line).
            trained_filename: Checkpoint path given to ``saver.restore``.

        Returns:
            List with one score per input, in input order.
        """
        with tf.Graph().as_default(), tf.Session() as session:
            with tf.variable_scope("model", reuse=None):
                lm = rnnlm.RNNLM(self.model_params)
                lm.BuildCoreGraph()

            # Load the trained model
            saver = tf.train.Saver()
            saver.restore(session, trained_filename)

            # Actually run scoring
            results = []
            for idx, s in enumerate(inputs):
                score = 0
                for sent in processing.tokenize_sentences(s):
                    score -= lm.ScoreSeq(session, sent, self.corpus.vocab)
                results.append(score)
                # Progress line every 1000 inputs (including the first one).
                if idx % 1000 == 0:
                    print("  %d / %d" % (idx, len(inputs)))

        return results

    def train(self, train_data):
        """Return the features for ``train_data``; nothing is fitted here."""
        return self.transform(train_data)

    def transform(self, test_data):
        """Build the feature matrix for ``test_data["content"]``.

        Returns:
            ``scipy.sparse.csr_matrix`` with one row per comment and two
            columns: (score under true_model, score under false_model).
            An empty input yields a matrix of shape (0, 2).
        """
        print("True scores...")
        true_scores = self.score_sentences(test_data["content"], self.true_model)
        print("False scores...")
        false_scores = self.score_sentences(test_data["content"], self.false_model)
        # reshape(-1, 2) keeps the two-column layout even when there are no
        # rows; csr_matrix([]) would otherwise come out with shape (1, 0).
        pairs = np.asarray(list(zip(true_scores, false_scores))).reshape(-1, 2)
        return scipy.sparse.csr_matrix(pairs)

In [29]:
# Re-import the local modules so edits made on disk are picked up.
reload(rnnlm)
reload(processing)
reload(common)
# Run the RNN feature extractor over the "ka-comments-balanced" dataset and
# register the result under the feature name "all-rnn".
# NOTE(review): sampling=0.1 presumably subsamples the dataset to 10% --
# confirm against features.common.extract_features.
common.extract_features(
    "ka-comments-balanced",
    RNNClassifierFeatureExtractor(corpus, model_params),
    "all-rnn", sampling=0.1)

Loading ka-comments-balanced dataset.
Training feature extractor all-rnn.
True scores...
  0 / 48824
  1000 / 48824
  2000 / 48824
  3000 / 48824
  4000 / 48824
  5000 / 48824
  6000 / 48824
  7000 / 48824
  8000 / 48824
  9000 / 48824
  10000 / 48824
  11000 / 48824
  12000 / 48824
  13000 / 48824
  14000 / 48824
  15000 / 48824
  16000 / 48824
  17000 / 48824
  18000 / 48824
  19000 / 48824
  20000 / 48824
  21000 / 48824
  22000 / 48824
  23000 / 48824
  24000 / 48824
  25000 / 48824
  26000 / 48824
  27000 / 48824
  28000 / 48824
  29000 / 48824
  30000 / 48824
  31000 / 48824
  32000 / 48824
  33000 / 48824
  34000 / 48824
  35000 / 48824
  36000 / 48824
  37000 / 48824
  38000 / 48824
  39000 / 48824
  40000 / 48824
  41000 / 48824
  42000 / 48824
  43000 / 48824
  44000 / 48824
  45000 / 48824
  46000 / 48824
  47000 / 48824
  48000 / 48824
False scores...
  0 / 48824
  1000 / 48824
  2000 / 48824
  3000 / 48824
  4000 / 48824
  5000 / 48824
  6000 / 48824
  7000 / 48824
  8000 

In [30]:
# Re-import the shared feature helpers so edits made on disk are picked up.
reload(common)
# Evaluate the stored "all-rnn" features; per the captured output this trains
# several classifiers and prints an accuracy for each.
common.test_features("all-rnn")

Loading features all-rnn.
Training models.
##        MultinomialNB         all-rnn accuracy: 55.7 %
##            LinearSVC         all-rnn accuracy: 50.0 %
##                  MLP         all-rnn accuracy: 59.2 %
##                 MLP2         all-rnn accuracy: 60.2 %
