In [1]:
import numpy as np
import tensorflow as tf
import nltk
import vocabulary
import utils

In [2]:
# Load the dataset.
# Only the first value returned by utils.load_corpus is kept (as `vocab`);
# the other two are discarded. V is also reused as the model's V in model_params.
V = 10000
vocab, _, _ = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)

Loaded 57340 sentences (1.16119e+06 tokens)
Training set: 45872 sentences (924077 tokens)
Test set: 11468 sentences (237115 tokens)


In [3]:
# Model Params:
# trained_filename is the checkpoint path handed to tf.train.Saver().restore;
# model_params is unpacked into LanguageModel(**model_params).
trained_filename = 'tf_saved/rnnlm_trained'
model_params = dict(V=V, H=100, num_layers=1)

# Training parameters
# NOTE(review): none of these five names is read by any cell visible in this
# notebook (the model is only restored, never trained here) - confirm whether
# they are still needed.
max_time = 20
batch_size = 50
learning_rate = 0.5
keep_prob = 1.0
num_epochs = 5

In [4]:
def matmul3d(X, W):
    """Multiply a rank-3 tensor by a matrix along the last axis.

    X is flattened to two dimensions (its third dimension becomes the column
    count), multiplied by W with tf.matmul, and the product is reshaped to
    [X dim 0, X dim 1, W dim 1].
    """
    rows_by_features = tf.reshape(X, [-1, tf.shape(X)[2]])
    flat_product = tf.matmul(rows_by_features, W)
    # Leading two dimensions come from X, the trailing one from W.
    out_dims = [tf.shape(X)[axis] for axis in (0, 1)] + [tf.shape(W)[1]]
    return tf.reshape(flat_product, out_dims)

class LanguageModel(object):
    """LSTM language model graph: embedding -> stacked LSTM -> softmax over V words.

    Constructing an instance adds the placeholders, variables, losses and the
    training op to the current default TensorFlow graph.

    Args:
        V: vocabulary size (rows of the embedding, columns of the output layer).
        H: embedding size and number of units in every LSTM cell.
        num_layers: number of stacked LSTM cells.
    """

    def makeCell(self, H, keep_prob, num_layers=1):
        """Return a MultiRNNCell stacking `num_layers` dropout-wrapped LSTM cells.

        Args:
            H: number of units in each LSTMCell.
            keep_prob: output keep probability given to each DropoutWrapper.
            num_layers: how many cells to stack.
        """
        cell_layers = []
        for i in range(num_layers):
            cell_layers.append(tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=H, forget_bias=0.0, initializer=tf.contrib.layers.xavier_initializer(), state_is_tuple=True), output_keep_prob=keep_prob))
        return tf.nn.rnn_cell.MultiRNNCell(cell_layers, state_is_tuple=True)

    def __init__(self, V, H, num_layers=1):
        """Build the whole graph; see the class docstring for the arguments."""
        self.V = V
        self.H = H
        self.num_layers = num_layers

        with tf.name_scope("Training_Parameters"):
            # Constants acting as defaults; callers in this notebook override
            # dropout_keep_prob_ through feed_dict when extracting states.
            self.learning_rate_ = tf.constant(0.1, name="learning_rate")
            self.dropout_keep_prob_ = tf.constant(0.5, name="dropout_keep_prob")
            self.max_grad_norm_ = 5.0

        # Word ids in, word ids out; both dimensions are left unspecified.
        self.input_w_ = tf.placeholder(tf.int32, [None, None], name="w")
        self.target_y_ = tf.placeholder(tf.int32, [None, None], name="y")

        with tf.name_scope("batch_size"):
            self.batch_size_ = tf.shape(self.input_w_)[0]
        with tf.name_scope("max_time"):
            self.max_time_ = tf.shape(self.input_w_)[1]
        # max_time_ repeated once per batch row. It is not passed to
        # dynamic_rnn below, so every row is unrolled for the full input width.
        self.ns_ = tf.tile([self.max_time_], [self.batch_size_,], name="ns")

        embedding = tf.get_variable("embedding", [self.V, self.H], dtype=tf.float32
                                   ,initializer=tf.contrib.layers.xavier_initializer())
        inputs = tf.nn.dropout(tf.nn.embedding_lookup(embedding, self.input_w_), self.dropout_keep_prob_)

        lstm_cell = self.makeCell(self.H, self.dropout_keep_prob_, self.num_layers)
        self.initial_h_ = lstm_cell.zero_state(self.batch_size_, dtype=tf.float32)
        outputs, state = tf.nn.dynamic_rnn(lstm_cell, inputs, initial_state=self.initial_h_)
        self.lstm_output = outputs
        self.final_state = state

        self.output_w = tf.get_variable("output_w", [self.H, self.V], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
        self.output_b = tf.get_variable("output_b", [self.V], dtype=tf.float32, initializer=tf.constant_initializer(value=0))

        # Full-softmax logits and summed cross-entropy.
        # The loss functions are called with keyword arguments on purpose:
        # TensorFlow 1.0 made the cross-entropy arguments keyword-only and
        # swapped the positional order of `inputs` and `labels` in
        # sampled_softmax_loss, so positional calls are version dependent.
        self.logits_ = matmul3d(self.lstm_output, self.output_w)+self.output_b
        self.loss_ = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_, labels=self.target_y_, name="full_loss"))

        # train op: sampled softmax (100 sampled classes) on the flattened
        # LSTM outputs, clipped by global norm, plain gradient descent.
        self.train_loss_ = tf.reduce_sum(tf.nn.sampled_softmax_loss(weights=tf.transpose(self.output_w), biases=self.output_b, inputs=tf.reshape(self.lstm_output, [-1, self.H]), labels=tf.reshape(self.target_y_, [-1, 1]), num_sampled=100, num_classes=self.V, num_true=1))
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.train_loss_, tvars),self.max_grad_norm_)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_)
        self.train_step_ = optimizer.apply_gradients(zip(grads, tvars))

    def makePredictionGraph(self, num_samples):
        """Add a sampling op to the graph and store it as self.pred_samples.

        Draws `num_samples` ids per position from the logits with
        tf.multinomial and reshapes them to
        [batch_size_, max_time_, num_samples]. Returns None.
        """
        self.pred_samples = tf.reshape(tf.multinomial(tf.reshape(self.logits_, [-1, self.V]), num_samples), [self.batch_size_, self.max_time_, num_samples])

In [113]:
def getFinalState(input_list):
    """Feed each review through the restored language model one token at a time.

    Args:
        input_list: iterable of reviews; each review is an iterable of
            sentences and each sentence an iterable of word tokens.

    Returns:
        A list with one 1-D numpy array per review: the first layer's final
        `h` followed by its final `c` (batch row 0), joined with np.hstack.
    """
    def _flatten(sentences):
        """Put "<s>" in front of every sentence and once more at the very end."""
        tokens = []
        for sentence in sentences:
            tokens.append("<s>")
            tokens.extend(sentence)
        tokens.append("<s>")
        return tokens

    with tf.Graph().as_default(), tf.Session() as session:
        # Rebuild the graph under the "model" scope and load the trained weights.
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)
        tf.train.Saver().restore(session, trained_filename)

        final_vectors = []
        for review in input_list:
            canonical = utils.canonicalize_words(_flatten(review), wordset=vocab.word_to_id)
            token_ids = vocab.words_to_ids(canonical)
            id_row = np.reshape(token_ids, [1,-1])

            # Start from the model's zero state for a batch of one.
            state = session.run(lm.initial_h_, {lm.input_w_: id_row[:,0:1], lm.dropout_keep_prob_: 1.0})

            for step in range(len(token_ids)):
                # One token per run; dropout is disabled via keep_prob 1.0.
                feed = {lm.input_w_: id_row[:,step:step+1], lm.dropout_keep_prob_: 1.0}
                # Carry every layer's (c, h) over from the previous step.
                for layer, (c_in, h_in) in enumerate(lm.initial_h_):
                    feed[c_in] = state[layer].c
                    feed[h_in] = state[layer].h
                state = session.run(lm.final_state, feed)

            first_layer = state[0]
            final_vectors.append(np.hstack([first_layer.h[0], first_layer.c[0]]))
        return final_vectors

In [31]:
def getFinalStateBatched(inputBatch):
    """Return the final LSTM state for a batch of reviews in a single run.

    All reviews must flatten to the same number of tokens: the model unrolls
    every row for the full input width (dynamic_rnn is built without
    sequence lengths), so padding would change the final state. Use
    getFinalState for reviews of differing length.

    Args:
        inputBatch: iterable of reviews; each review is an iterable of
            sentences and each sentence an iterable of word tokens.

    Returns:
        The value of lm.final_state for the batch, as returned by
        session.run (one LSTMStateTuple of arrays per layer).

    Raises:
        ValueError: if inputBatch is empty or the flattened reviews do not
            all have the same number of tokens.
    """
    def _flatten(sentences):
        """Put "<s>" in front of every sentence and once more at the very end."""
        final_sent = []
        for sent in sentences:
            final_sent.append("<s>")
            final_sent.extend(sent)
        final_sent.append("<s>")
        return final_sent

    # Convert to ids first so a bad batch is rejected before the (expensive)
    # graph construction and checkpoint restore.
    w = []
    for inputs in inputBatch:
        padded_ids = vocab.words_to_ids(utils.canonicalize_words(_flatten(inputs),
                                                                 wordset=vocab.word_to_id))
        w.append(padded_ids)

    if not w:
        raise ValueError("getFinalStateBatched needs at least one review")
    lengths = sorted(set(len(ids) for ids in w))
    if len(lengths) > 1:
        raise ValueError("all reviews in a batch must flatten to the same number "
                         "of tokens, got lengths %s; use getFinalState for "
                         "reviews of different lengths" % lengths)

    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)
        saver = tf.train.Saver()
        saver.restore(session, trained_filename)

        # The graph already starts from lm.initial_h_ (the zero state), so
        # there is no need to fetch it and feed it back in.
        feed_dict = {lm.input_w_: w,
                     lm.dropout_keep_prob_: 1.0}
        return session.run(lm.final_state, feed_dict)

In [32]:
# Smoke test: the same review twice, so both batch rows have equal length.
# NOTE(review): train_data is only assigned in a cell further down this
# notebook, so this cell fails on Restart & Run All.
getFinalStateBatched([train_data[6]['reviewSentTokens'], train_data[6]['reviewSentTokens']])

(LSTMStateTuple(c=array([[ -4.52896056e-04,  -6.52337298e-02,  -2.02672064e-01,
         -1.18163507e-02,  -8.18470895e-01,  -1.08129680e+00,
         -1.03290379e+00,  -2.69454509e-01,   8.94485414e-01,
          9.36745286e-01,   9.05345529e-02,   3.33277941e-01,
          3.09180766e-01,   6.19303644e-01,   1.33814716e+00,
         -3.72797623e-02,   8.51642013e-01,   1.55151427e-01,
         -1.06458470e-01,  -6.28919125e-01,   1.41737107e-02,
          1.84937522e-01,   3.78194660e-01,   5.01458228e-01,
          2.14116685e-02,  -2.14317989e+00,   7.57640973e-02,
         -9.94137049e-01,   8.07621598e-01,   1.28478572e-01,
         -3.94692197e-02,  -1.00476921e+00,  -2.81708050e+00,
          9.04995024e-01,  -7.59235799e-01,  -1.01941276e+00,
         -9.55294251e-01,   5.05580641e-02,   5.53285033e-02,
         -2.20284760e-02,  -9.21310663e-01,  -3.19792449e-01,
          9.79135633e-01,   9.15995300e-01,  -4.53105927e-01,
         -3.74995857e-01,   1.71896607e-01,   1.4741

In [115]:
# Smoke test of the token-by-token path on two reviews.
# NOTE(review): relies on train_data / 'reviewSentTokens' from cells further
# down this notebook, so this cell fails on Restart & Run All.
getFinalState([train_data[3]['reviewSentTokens'], train_data[4]['reviewSentTokens']])

[array([  8.39560954e-08,  -1.94323738e-03,  -3.79751041e-03,
         -1.45918625e-06,  -2.95379907e-01,  -9.81683334e-05,
         -7.59375170e-02,  -5.96834952e-03,   3.34183306e-01,
          6.85884833e-01,   7.00229430e-04,   3.34813818e-02,
          1.00655006e-02,   1.56047111e-02,   9.39357221e-01,
         -5.80073647e-05,   1.32441835e-03,   3.05647845e-03,
         -2.11497657e-02,  -2.08649077e-02,  -3.44171412e-02,
          3.23171262e-03,   2.12429091e-02,   1.58758402e-01,
          1.24644686e-07,  -3.24105704e-03,   9.03009786e-04,
         -3.78764421e-03,   7.46508241e-02,   2.77384606e-05,
          7.30262743e-03,  -7.56139696e-01,  -2.29340258e-05,
          5.03067786e-05,  -6.63713668e-04,  -7.02029169e-01,
         -4.70972554e-06,   1.73757728e-02,  -4.94100200e-03,
          2.13381354e-04,  -2.02868105e-05,  -6.71020627e-01,
          7.04710046e-03,   1.73677539e-03,  -7.73197826e-06,
         -4.60054241e-02,  -6.01041131e-04,   4.49516607e-04,
        

In [168]:
# Token count of one sample review.
# NOTE(review): word_tokenize and train_data come from cells further down.
len(word_tokenize(train_data[500]['reviewText']))

929

In [7]:
# Path of the gzip-compressed review file, relative to the working directory;
# passed to getData below.
data = 'reviews_Books_5.json.gz'

import gzip
def parse(path, topN=None):
    """Yield one evaluated record per line of the gzip file at `path`.

    Args:
        path: gzip file whose lines are each a Python expression.
        topN: stop after this many lines; None reads the whole file.
    """
    with gzip.open(path, 'r') as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            # NOTE(review): eval executes whatever expression the line holds.
            # Only use this on files from a trusted source.
            yield eval(raw_line)
            if topN is not None and line_no == topN:
                return
                
def extractWithFeedback(data, n, minTotalFeedbacks=10):
    """Collect up to `n` reviews that have enough text and enough feedback.

    Args:
        data: path handed to parse().
        n: stop as soon as this many reviews have been collected.
        minTotalFeedbacks: a review is kept only if its 'helpful'[1] value is
            strictly greater than this.

    Returns:
        List of the kept review dicts, in file order. Reviews whose
        'reviewText' is shorter than 10 characters are ignored entirely.
    """
    kept = []
    for review in parse(data):
        if len(review['reviewText']) >= 10:
            if review['helpful'][1] > minTotalFeedbacks:
                kept.append(review)
            if len(kept) == n:
                break
    return kept

def getData(data, totalSample, useCache=True, minTotalFeedbacks=10, split=[0.6,0.2,0.2]):
    """Return (train, dev, test) review sets, from cache files when available.

    Args:
        data: path of the review file, forwarded to extractWithFeedback.
        totalSample: number of reviews to extract before splitting.
        useCache: when true and train_data.p, dev_data.p and test_data.p all
            exist in the working directory, load and return them instead.
        minTotalFeedbacks: forwarded to extractWithFeedback.
        split: fractions for train and dev (first two entries); the test set
            is whatever remains.

    Returns:
        Tuple (train_data, dev_data, test_data). On a cache miss the three
        parts are slices of a shuffled numpy array and are also written to
        the cache files.
    """
    import pickle
    import os

    cache_files = ("train_data.p", "dev_data.p", "test_data.p")

    if useCache and all(os.path.isfile(name) for name in cache_files):
        print("using cached data")
        cached = []
        for name in cache_files:
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only safe because these files are written by this function.
            with open(name, "rb") as f:
                cached.append(pickle.load(f))
        return tuple(cached)

    # probably not needed, but shuffle the data just to be safe
    samples = np.random.permutation(extractWithFeedback(data, totalSample, minTotalFeedbacks))
    total = len(samples)
    train_end = int(split[0]*total)
    dev_end = train_end+int(split[1]*total)
    partitions = (samples[:train_end], samples[train_end:dev_end], samples[dev_end:])

    for name, part in zip(cache_files, partitions):
        with open(name, "wb") as f:
            pickle.dump(part, f)

    return partitions

In [8]:
# Load (or build and cache) the three review sets.
total_sample = 10000
train_data, dev_data, test_data = getData(data, total_sample, useCache=True)
# Per-review target: helpful[0] / helpful[1] as a float. The divisor is never
# zero for freshly extracted data because extractWithFeedback keeps only
# reviews with helpful[1] > minTotalFeedbacks; cached files are trusted as-is.
train_targets_prob = [d["helpful"][0]*1.0/d["helpful"][1] for d in train_data]
dev_targets_prob = [d["helpful"][0]*1.0/d["helpful"][1] for d in dev_data]
test_targets_prob = [d["helpful"][0]*1.0/d["helpful"][1] for d in test_data]

using cached data


In [9]:
# tokenize the datasets
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
def tokenizeAll():
    """Add sentence and word tokenizations to every review, in place.

    For each review in the module-level train_data, dev_data and test_data,
    stores sent_tokenize(reviewText) under 'reviewSentences' and the
    word_tokenize result of each of those sentences under 'reviewSentTokens'.
    """
    for split in (train_data, dev_data, test_data):
        for review in split:
            sentences = sent_tokenize(review['reviewText'])
            review['reviewSentences'] = sentences
            review['reviewSentTokens'] = [word_tokenize(sentence) for sentence in sentences]
# Mutates train_data, dev_data and test_data in place.
tokenizeAll()

In [126]:
import pickle

In [127]:
# make fix vectors from input text
def makeFixedSizeVectors():
    """Attach a language-model state vector to every review and pickle each set.

    For train, test and dev (in that order): prints a progress line, runs
    getFinalState over the reviews' 'reviewSentTokens', stores each result
    under 'dataVector' on the matching review, and dumps the whole set to
    <name>_data_vec.p in the working directory.
    """
    jobs = (
        ("making train data vector...", train_data, "train_data_vec.p"),
        ("making test data vector...", test_data, "test_data_vec.p"),
        ("making dev data vector...", dev_data, "dev_data_vec.p"),
    )
    for message, dataset, out_path in jobs:
        print(message)
        vectors = getFinalState([review['reviewSentTokens'] for review in dataset])
        for idx, vector in enumerate(vectors):
            dataset[idx]['dataVector'] = vector
        with open(out_path, "wb") as f:
            pickle.dump(dataset, f)
    
# Slow: getFinalState issues one session.run per token of every review.
makeFixedSizeVectors()

making train data vector...
making test data vector...
making dev data vector...


In [158]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

def trainModel():
    """Fit a GradientBoostingClassifier on the training reviews' state vectors.

    Features are each review's 'dataVector'; a review is labelled 1 when its
    entry in train_targets_prob is strictly above 0.8 and 0 otherwise.
    (LogisticRegression was the alternative tried here earlier.)

    Returns:
        The fitted classifier.
    """
    features = [review['dataVector'] for review in train_data]
    labels = [int(prob > 0.8) for prob in train_targets_prob]
    classifier = GradientBoostingClassifier()
    classifier.fit(features, labels)
    return classifier

# Requires 'dataVector' on every training review (set by makeFixedSizeVectors).
model = trainModel()

In [159]:
import sklearn.metrics as metrics
def scoreModel(model, score_data_list, targets):
    """Predict on `score_data_list` and compare against `targets`.

    Args:
        model: object with a predict(score_data_list) method.
        score_data_list: feature rows passed to model.predict.
        targets: true labels, passed first to each sklearn metric.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1".
    """
    predictions = model.predict(score_data_list)
    return {
        "accuracy": metrics.accuracy_score(targets, predictions),
        "precision": metrics.precision_score(targets, predictions),
        "recall": metrics.recall_score(targets, predictions),
        "f1": metrics.f1_score(targets, predictions),
    }

In [160]:
# Score the classifier on each set, binarising the targets with the same
# p > 0.8 rule used in trainModel.
print("train data")
print(scoreModel(model, [d['dataVector'] for d in train_data], [1 if p>0.8 else 0 for p in train_targets_prob]))
print("test data")
print(scoreModel(model, [d['dataVector'] for d in test_data], [1 if p>0.8 else 0 for p in test_targets_prob]))
print("dev data")
print(scoreModel(model, [d['dataVector'] for d in dev_data], [1 if p>0.8 else 0 for p in dev_targets_prob]))

train data
{'f1': 0.74376098418277681, 'recall': 0.71777476255088191, 'precision': 0.77169948942377831, 'accuracy': 0.75700000000000001}
test data
{'f1': 0.51029252437703143, 'recall': 0.50159744408945683, 'precision': 0.51929437706725468, 'accuracy': 0.54800000000000004}
dev data
{'f1': 0.47766133480419193, 'recall': 0.46709816612729232, 'precision': 0.48871331828442438, 'accuracy': 0.52649999999999997}


In [141]:
# Fraction of dev reviews labelled positive (p > 0.8): the base rate to read
# the dev metrics above against.
sum([1 if p>0.8 else 0 for p in dev_targets_prob])*1.0/len(dev_targets_prob)

0.4635