In [5]:
import numpy as np
import tensorflow as tf
import pickle

In [6]:
# load existing data
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code while deserialising;
    # only point this at files produced by our own preprocessing step.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_data = _load_pickle('train_data_vec.p')
dev_data = _load_pickle('dev_data_vec.p')
test_data = _load_pickle('test_data_vec.p')

In [14]:
# Sanity check: length of the first training example's feature vector
# (TextModel hard-codes input_size = 200, so this is expected to match).
len(train_data[0]['dataVector'])

200

In [4]:
def batchGenerator(dataset, batch_size, success_ratio=0.8):
    """Yield consecutive mini-batches of ``(x, y, raw)`` from ``dataset``.

    Args:
        dataset: sliceable sequence of dicts. Each dict must provide
            ``'dataVector'`` (the feature vector) and ``'helpful'``, an
            indexable pair read as ``(helpful_votes, total_votes)``.
        batch_size: maximum number of examples per batch; the final batch
            may be shorter.
        success_ratio: an example is labelled 1 only when
            ``helpful_votes / total_votes`` is strictly greater than this.

    Yields:
        Tuples ``(x, y, raw)`` of equal-length lists: feature vectors,
        0/1 labels and the untouched ``'helpful'`` pairs.
    """
    # range() keeps this runnable on both Python 2 and Python 3.
    for start in range(0, len(dataset), batch_size):
        x = []
        y = []
        raw = []
        for data in dataset[start:start + batch_size]:
            helpful = data["helpful"]
            x.append(data['dataVector'])
            raw.append(helpful)
            # An example with no votes has no defined ratio; treat it as
            # "not helpful" rather than crashing with ZeroDivisionError.
            if helpful[1]:
                helpful_ratio = helpful[0] * 1.0 / helpful[1]
            else:
                helpful_ratio = 0.0
            y.append(1 if helpful_ratio > success_ratio else 0)
        yield (x, y, raw)

In [3]:
class TextModel(object):
    """Feed-forward binary classifier over fixed-length feature vectors.

    Graph: input (input_size) -> tanh(hidden1_size) -> tanh(hidden2_size)
    -> 1 logit, trained with sigmoid cross-entropy and plain gradient descent.

    Tensors/ops intended for callers:
        input_w_       float32 [batch, input_size] feature matrix (feed).
        target_y_      float32 [batch] 0/1 labels (feed; needed only for
                       loss_ / train_step_).
        learning_rate_ constant 0.01; can be overridden per step by feeding
                       a value for it through feed_dict.
        loss_          mean sigmoid cross-entropy over the batch.
        train_step_    one gradient-descent update minimising loss_.
        pred_proba_    sigmoid(logits_), shape [batch, 1].

    NOTE: op creation order and variable names are kept stable on purpose;
    they determine seeded initial values and checkpoint variable names.
    """
    
    def __init__(self):
        self.learning_rate_ = tf.constant(0.01, name="learning_rate")
        
        self.hidden1_size = 300
        self.hidden2_size = 200
        self.input_size = 200
        
        # Pin the feature dimension so a wrongly sized input is rejected when
        # it is fed, instead of surfacing later as a matmul shape error.
        self.input_w_ = tf.placeholder(tf.float32, [None, self.input_size], name="w")
        self.target_y_ = tf.placeholder(tf.float32, [None], name="y")
        
        # NOTE(review): zeros_initializer is called with a shape below, which
        # looks like the pre-1.0 TensorFlow API - confirm before upgrading TF.
        with tf.variable_scope("hidden1"):
            self.w1 = tf.get_variable("w1", shape=[self.input_size, self.hidden1_size], dtype=tf.float32,
                                         initializer=tf.contrib.layers.xavier_initializer())
            self.b1 = tf.get_variable("b1", dtype=tf.float32, 
                         initializer=tf.zeros_initializer([self.hidden1_size]))
            self.h1 = tf.tanh(tf.matmul(self.input_w_, self.w1) + self.b1, name="h1")
            
        with tf.variable_scope("hidden2"):
            self.w2 = tf.get_variable("w2", shape=[self.hidden1_size, self.hidden2_size], dtype=tf.float32,
                                         initializer=tf.contrib.layers.xavier_initializer())
            self.b2 = tf.get_variable("b2", dtype=tf.float32, 
                         initializer=tf.zeros_initializer([self.hidden2_size]))
            self.h2 = tf.tanh(tf.matmul(self.h1, self.w2) + self.b2, name="h2")
            
        with tf.variable_scope("output_layer"):
            self.w_out = tf.get_variable("W_out", shape=[self.hidden2_size, 1], dtype=tf.float32, 
                         initializer=tf.contrib.layers.xavier_initializer())
            self.b_out = tf.get_variable("b_out", dtype=tf.float32, 
                           initializer=tf.zeros_initializer([1]))
            # logits_ has shape [batch, 1].
            self.logits_ = tf.add(tf.matmul(self.h2, self.w_out), self.b_out, name="logits")
            
        with tf.name_scope("loss_function"):
            # Squeeze only the trailing unit axis: the result keeps the static
            # shape [batch] (matching target_y_) and a single-example batch is
            # not collapsed to a scalar.
            # NOTE(review): arguments are passed positionally as
            # (logits, targets); newer TensorFlow releases are believed to
            # require keyword arguments here - confirm when upgrading.
            self.point_loss_ = tf.nn.sigmoid_cross_entropy_with_logits(tf.squeeze(self.logits_, [1]), self.target_y_)
            self.loss_ = tf.reduce_mean(self.point_loss_)
            
        with tf.name_scope("train_ops"):
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_)
            self.train_step_ = optimizer.minimize(self.loss_)
            
        with tf.name_scope("Prediction"):
            self.pred_proba_ = tf.sigmoid(self.logits_, name="pred_proba")

In [2]:
import sklearn.metrics as metrics
def score_batch(pred_probs, targets):
    """Threshold predicted probabilities at 0.5 and score them against targets.

    Args:
        pred_probs: iterable of predicted probabilities; an entry becomes
            class 1 only when it is strictly greater than 0.5.
        targets: true 0/1 labels, in the same order as ``pred_probs``.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1", each
        computed by the corresponding sklearn.metrics function.
    """
    pred = []
    for prob in pred_probs:
        pred.append(1 if prob > 0.5 else 0)

    return {
        "accuracy": metrics.accuracy_score(targets, pred),
        "precision": metrics.precision_score(targets, pred),
        "recall": metrics.recall_score(targets, pred),
        "f1": metrics.f1_score(targets, pred),
    }

In [26]:
# baseline score - no training
def baselineScore(dataset):
    """Print the scores of an untrained TextModel on ``dataset``.

    A fresh graph and session are created, the model variables are only
    initialised (no training step is run), and the whole dataset is pushed
    through the network as a single batch labelled with success_ratio=0.8.
    For each batch the batch index and the score_batch() metrics are printed.
    """
    # batch_size == len(dataset): the generator yields one batch with everything.
    whole_set = batchGenerator(dataset, len(dataset), success_ratio=0.8)

    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        with tf.variable_scope("model", reuse=None):
            model = TextModel()

        sess.run(tf.initialize_all_variables())

        batch_no = 0
        for features, labels, _raw in whole_set:
            print("batch #%s"%batch_no)
            probabilities = sess.run(model.pred_proba_, {model.input_w_: features})
            print(score_batch(probabilities, labels))
            batch_no += 1
# Report the untrained baseline on each split, in train / dev / test order.
for _title, _split in (("train set baseline", train_data),
                       ("dev set baseline", dev_data),
                       ("test set baseline", test_data)):
    print(_title)
    baselineScore(_split)

train set baseline
batch #0
{'f1': 0.64309725281094243, 'recall': 0.94097693351424694, 'precision': 0.48846627927452019, 'accuracy': 0.48683333333333334}
dev set baseline
batch #0
{'f1': 0.43810610382201942, 'recall': 0.41423948220064727, 'precision': 0.46489104116222763, 'accuracy': 0.50749999999999995}
test set baseline
batch #0
{'f1': 0.63852961198093949, 'recall': 0.99893503727369537, 'precision': 0.46923461730865434, 'accuracy': 0.46899999999999997}


In [35]:
# run training
# Checkpoint path prefix handed to saver.save() at the end of runTraining().
trained_filename = 'tf_saved/nn_text_classifier'
# Number of training examples per gradient step in runTraining().
batch_size = 5
# Fed into lm.learning_rate_ on every training step, which overrides the
# 0.01 constant that TextModel builds into the graph.
learning_rate = 0.1
# Number of full passes over train_data.
num_epochs = 100
import time
import utils; reload(utils)

def _print_scores(session, lm, dataset, label, success_ratio=0.8):
    """Print ``label`` followed by the metrics of ``lm`` on all of ``dataset``."""
    print(label)
    # batch_size == len(dataset): the generator yields a single batch that
    # contains every example.
    for (w, y, _) in batchGenerator(dataset, len(dataset), success_ratio):
        pred_prob = session.run(lm.pred_proba_, {lm.input_w_: w})
        print(score_batch(pred_prob, y))


def runTraining(print_interval=20):
    """Train a fresh TextModel on train_data and save it to trained_filename.

    Reads the module-level settings ``batch_size``, ``learning_rate``,
    ``num_epochs`` and ``trained_filename`` and the ``train_data`` /
    ``dev_data`` splits.

    Args:
        print_interval: every ``print_interval``-th epoch the summed batch
            loss, the epoch duration and the train/dev scores are printed.
            The "Starting epoch" line is also printed for epoch 1.

    After the last epoch the train/dev scores are printed once more and the
    session is checkpointed with tf.train.Saver.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    # Create the checkpoint directory up front so that a missing directory
    # cannot make saver.save() fail after all epochs have already been run.
    checkpoint_dir = os.path.dirname(trained_filename)
    if checkpoint_dir and not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with tf.Graph().as_default(), tf.Session() as session:
        # Seed before the model is built so initial weights are repeatable.
        tf.set_random_seed(42)
        with tf.variable_scope("model", reuse=None):
            lm = TextModel()
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()

        for epoch in range(1, num_epochs + 1):
            t0_epoch = time.time()
            report = (epoch % print_interval == 0)
            if epoch == 1 or report:
                print("[epoch %d] Starting epoch %d" % (epoch, epoch))

            # Sum of the per-batch mean losses over this epoch.
            cost = 0.0
            for (w, y, _) in batchGenerator(train_data, batch_size, success_ratio=0.8):
                feed_dict = {
                    # Overrides the constant learning rate baked into the graph.
                    lm.learning_rate_: learning_rate,
                    lm.input_w_: w,
                    lm.target_y_: y
                }
                _, loss_val = session.run([lm.train_step_, lm.loss_], feed_dict)
                cost += loss_val

            if report:
                print("%s: total loss: %.03f" % ("Training", cost))
                print("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))
                _print_scores(session, lm, train_data, "train score")
                _print_scores(session, lm, dev_data, "dev score")

        # Final scores after the last epoch.
        _print_scores(session, lm, train_data, "train score")
        _print_scores(session, lm, dev_data, "dev score")
        # Save final model
        saver.save(session, trained_filename)
# Run training with the default print_interval (progress and scores every 20 epochs).
runTraining()

[epoch 1] Starting epoch 1
[epoch 20] Starting epoch 20
Training: total loss: 831.909
[epoch 20] Completed in 0:00:01
train score
{'f1': 0.58932446116013315, 'recall': 0.69097693351424694, 'precision': 0.51374527112232027, 'accuracy': 0.52683333333333338}
dev score
{'f1': 0.57015590200445432, 'recall': 0.6903991370010788, 'precision': 0.48558421851289835, 'accuracy': 0.51749999999999996}
[epoch 40] Starting epoch 40
Training: total loss: 830.038
[epoch 40] Completed in 0:00:01
train score
{'f1': 0.58044444444444432, 'recall': 0.66451831750339208, 'precision': 0.51525512887953706, 'accuracy': 0.52800000000000002}
dev score
{'f1': 0.55882352941176472, 'recall': 0.65587918015102487, 'precision': 0.48678943154523618, 'accuracy': 0.52000000000000002}
[epoch 60] Starting epoch 60
Training: total loss: 828.756
[epoch 60] Completed in 0:00:01
train score
{'f1': 0.55997494519260882, 'recall': 0.60651289009497966, 'precision': 0.52006980802792324, 'accuracy': 0.53166666666666662}
dev score
{'f1'

In [7]:
# test score
# Rebuild the model graph, load the weights saved by runTraining() and score
# the held-out test split in a single batch.
trained_filename = 'tf_saved/nn_text_classifier'
with tf.Graph().as_default(), tf.Session() as session:
    # Same "model" scope as in training so variable names match the checkpoint.
    with tf.variable_scope("model", reuse=None):
        lm = TextModel()
    # One Saver is enough, and restore() assigns every saved variable, so no
    # separate initialisation pass is needed before it.
    saver = tf.train.Saver()
    saver.restore(session, trained_filename)
    
    print("test score")
    bi = batchGenerator(test_data, len(test_data), 0.8)
    for (w,y, _) in bi:
        feed_dict = { lm.input_w_:w}
        pred_prob = session.run(lm.pred_proba_, feed_dict)
        print(score_batch(pred_prob, y))

test score
{'f1': 0.53402646502835538, 'recall': 0.6017039403620873, 'precision': 0.48003398470688191, 'accuracy': 0.50700000000000001}
