In [4]:
# Path of the gzip-compressed review dump; it is handed to getData() and read line by line by parse().
datafile = 'reviews_Books_5.json.gz'
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
# utility functions for getting data from datafile
import gzip
def parse(path, topN=None):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file to read.
        topN: Maximum number of records to yield. ``None`` (the default)
            yields every line; zero or a negative value yields nothing.

    Yields:
        The object obtained by evaluating each line as a Python expression.
    """
    # An explicit "no records" request must not fall through to "all records".
    if topN is not None and topN <= 0:
        return
    with gzip.open(path, 'r') as g:
        for line_count, line in enumerate(g, 1):
            # SECURITY: eval() executes arbitrary expressions found in the file.
            # Only use this on trusted dumps; consider ast.literal_eval or
            # json.loads if the data format allows it.
            yield eval(line)
            if topN is not None and line_count >= topN:
                break
                
def extractWithFeedback(data, n, minTotalFeedbacks=10):
    """Collect up to ``n`` reviews that received enough feedback votes.

    Reviews whose ``reviewText`` is shorter than 10 characters are ignored.
    A review is kept when its total vote count (``helpful[1]``) is strictly
    greater than ``minTotalFeedbacks``. Reading stops as soon as ``n`` reviews
    have been collected.

    Args:
        data: Path forwarded to ``parse``.
        n: Number of reviews at which collection stops.
        minTotalFeedbacks: Exclusive lower bound on the total vote count.

    Returns:
        List of the selected review records, in file order.
    """
    selected = []
    for review in parse(data):
        if len(review['reviewText']) >= 10:
            if review['helpful'][1] > minTotalFeedbacks:
                selected.append(review)
            if len(selected) == n:
                break
    return selected

def getData(data, totalSample, useCache=True, minTotalFeedbacks=10, split=[0.6,0.2,0.2]):
    """Return the (train, dev, test) splits, from cache or freshly extracted.

    When ``useCache`` is true and all three pickle files exist in the working
    directory, they are loaded and returned as-is; in that case ``data``,
    ``totalSample``, ``minTotalFeedbacks`` and ``split`` are not consulted.
    Otherwise reviews are extracted, shuffled, partitioned according to
    ``split`` and written to the three pickle files before being returned.

    Args:
        data: Path of the review dump, forwarded to ``extractWithFeedback``.
        totalSample: Number of reviews to extract.
        useCache: Whether existing pickle files may be reused.
        minTotalFeedbacks: Forwarded to ``extractWithFeedback``.
        split: Fractions for train and dev (the first two entries); the test
            split receives whatever remains.

    Returns:
        Tuple ``(train_data, dev_data, test_data)``.
    """
    import os
    import pickle

    cache_files = ("train_data.p", "dev_data.p", "test_data.p")

    if useCache and all(os.path.isfile(name) for name in cache_files):
        print("using cached data")
        cached = []
        for name in cache_files:
            with open(name, "rb") as f:
                cached.append(pickle.load(f))
        return tuple(cached)

    # probably not needed, but shuffle the data just to be safe
    samples = np.random.permutation(extractWithFeedback(data, totalSample, minTotalFeedbacks))
    sample_count = len(samples)
    train_end = int(split[0]*sample_count)
    dev_end = train_end+int(split[1]*sample_count)
    partitions = (samples[:train_end], samples[train_end:dev_end], samples[dev_end:])

    for name, part in zip(cache_files, partitions):
        with open(name, "wb") as f:
            pickle.dump(part, f)

    return partitions

In [6]:
# Number of qualifying reviews to extract when the splits have to be rebuilt.
total_sample = 10000
# With useCache=True, getData returns the pickled splits whenever all three cache
# files exist; total_sample is then not consulted.
train_data, dev_data, test_data = getData(datafile, total_sample, useCache=True)

using cached data


In [4]:
# Display one training record to see which fields a review carries.
train_data[0]

{'asin': '000649885X',
 'helpful': [6, 21],
 'overall': 1.0,
 'reviewText': "After reading The Farseer Trilogy which was very promising, I decided to take another chance with the author and try The Ship of Magic series. This first book is very silly. The concept of a liveship which speaks to it's  occupants was intriguing but it loses it's appeal as the characters were  very annoying, and the magic was very unsophisticated. I tried very hard to  find some good in this novel but it falls short of anything spectacular.  Only for the extreemely imaginative and open minded. If you have your feet  planted firmly on the ground, this fantasy novel is not for you. Goodkind  fans will find this novel a hard read. The Farseer trilogy is far better.",
 'reviewTime': '05 6, 2000',
 'reviewerID': 'A3SPHSI6Q9HO1G',
 'reviewerName': 'Amazon Customer "funnicky"',
 'summary': 'For the extremely imaginative only !',
 'unixReviewTime': 957571200}

In [5]:
# tokenize and preprocess text
import utils; reload(utils)
def preprocessAll():
    """Tokenize every review of the three global splits in place.

    For each record of ``train_data``, ``test_data`` and ``dev_data`` the
    review text is split into sentences; every sentence contributes a ``<s>``
    marker followed by its canonicalized word tokens. A single ``</s>`` marker
    is appended once per review, after the last sentence. The resulting token
    list is stored under the record's ``'procTokens'`` key.
    """
    for split in (train_data, test_data, dev_data):
        for review in split:
            tokens = []
            for sentence in sent_tokenize(review['reviewText']):
                tokens.append('<s>')
                tokens.extend(utils.canonicalize_word(word)
                              for word in word_tokenize(sentence))
            tokens.append('</s>')
            review['procTokens'] = tokens
# Adds a 'procTokens' entry to every record of the three splits (mutates them in place).
preprocessAll()

In [6]:
# save processed text
import pickle
# Write each split, now including 'procTokens', to its own pickle file so the
# tokenization step does not have to be repeated.
with open('train_data_proc.p', 'wb') as f:
    pickle.dump(train_data, f)
with open('dev_data_proc.p', 'wb') as f:
    pickle.dump(dev_data, f)
with open('test_data_proc.p', 'wb') as f:
    pickle.dump(test_data, f)

In [7]:
# load data
import pickle
# Rebind the three split variables to the contents of the processed pickle files.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('train_data_proc.p', 'rb') as f:
    train_data = pickle.load(f)
with open('dev_data_proc.p', 'rb') as f:
    dev_data = pickle.load(f)
with open('test_data_proc.p', 'rb') as f:
    test_data = pickle.load(f)

In [8]:
import collections
import itertools

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one new list."""
    merged = []
    for inner in list_of_lists:
        merged.extend(inner)
    return merged

class vocabulary:
    """Frequency-ranked word <-> id mapping built from tokenized reviews.

    Id 0 is always the unknown-word token; the remaining ids are assigned to
    the most frequent tokens in decreasing order of count.
    """
    START_TOKEN = "<s>"
    END_TOKEN = "</s>"
    UNK_TOKEN = "<unk>"

    def __init__(self, train_data, test_data, dev_data, size):
        """Count tokens over all three splits and keep the most frequent ones.

        Args:
            train_data, test_data, dev_data: Iterables of records that each
                provide a token list under the ``'procTokens'`` key.
            size: Maximum vocabulary size including the unknown token, or
                ``None`` to keep every observed token.

        Raises:
            KeyError: If the start or end token is not among the kept tokens.
        """
        # Count incrementally (train, then test, then dev) instead of first
        # materializing one concatenated token list for the whole corpus.
        self.unigram_counts = collections.Counter()
        for dataset in (train_data, test_data, dev_data):
            for record in dataset:
                self.unigram_counts.update(record['procTokens'])

        top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
        vocab = ([self.UNK_TOKEN] + [w for w, c in top_counts])
        # Assign an id to each word, by frequency
        self.id_to_word = dict(enumerate(vocab))
        # items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
        self.word_to_id = {word: idx for idx, word in self.id_to_word.items()}
        self.size = len(self.id_to_word)
        if size is not None:
            assert(self.size <= size)

        # Store special IDs
        self.START_ID = self.word_to_id[self.START_TOKEN]
        self.END_ID = self.word_to_id[self.END_TOKEN]
        self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

    def words_to_ids(self, words):
        """Map each word to its id; words outside the vocabulary map to UNK_ID."""
        return [self.word_to_id.get(w, self.UNK_ID) for w in words]

    def ids_to_words(self, ids):
        """Map each id back to its word (raises KeyError for an unknown id)."""
        return [self.id_to_word[i] for i in ids]

    def sentence_to_ids(self, words):
        """Return the ids of ``words`` wrapped in the start and end ids."""
        return [self.START_ID] + self.words_to_ids(words) + [self.END_ID]

    def ordered_words(self):
        """Return a list of words, ordered by id."""
        return self.ids_to_words(range(self.size))

In [9]:
# Vocabulary size cap (includes the <unk> entry); also passed to the model as V.
V = 10000
# The vocabulary is counted over all three splits, not only the training split.
vocab = vocabulary(train_data, test_data, dev_data, V)

In [9]:
# Number of distinct words that received an id.
len(vocab.word_to_id)

10000

In [10]:
# generate batch, and pad to same length
def batchGenerator(dataset, batch_size, vocab, success_ratio=0.8, maxtime=None):
    """Yield fixed-width batches of token ids with binary helpfulness labels.

    Within a batch every token sequence is truncated or right-padded with
    ``'</s>'`` so all rows share one length: the longest sequence of the
    batch, capped at ``maxtime`` when that is given.

    Args:
        dataset: Sliceable sequence of records providing ``'procTokens'``
            (token list) and ``'helpful'`` (``[helpful_votes, total_votes]``).
        batch_size: Number of records per batch; the last batch may be smaller.
        vocab: Object exposing ``words_to_ids(tokens)``.
        success_ratio: A record is labelled 1 when its helpful/total vote
            ratio is strictly greater than this value, else 0.
        maxtime: Optional upper bound on the sequence length.

    Yields:
        Tuple ``(x, y, raw)``: lists of id sequences, 0/1 labels and the
        original ``'helpful'`` entries, aligned by position.
    """
    for start in range(0, len(dataset), batch_size):
        batch_data = dataset[start:start + batch_size]
        maxlength = max(len(d['procTokens']) for d in batch_data)
        if maxtime is not None and maxtime < maxlength:
            maxlength = maxtime
        x = []
        y = []
        raw = []
        for data in batch_data:
            # Truncate first, then pad; the pad count is <= 0 (no padding)
            # for rows that were already long enough.
            tokens = data['procTokens'][:maxlength]
            tokens = tokens + ['</s>'] * (maxlength - len(tokens))
            x.append(vocab.words_to_ids(tokens))
            helpful_votes, total_votes = data["helpful"][0], data["helpful"][1]
            # A record without any votes cannot be "helpful"; avoid dividing by zero.
            helpful_ratio = helpful_votes * 1.0 / total_votes if total_votes else 0.0
            raw.append(data["helpful"])
            y.append(1 if helpful_ratio > success_ratio else 0)
        yield (x, y, raw)
        
# run like this (pass maxtime by keyword: the fourth positional argument is success_ratio)
#result = batchGenerator(test_data, 5, vocab, maxtime=200)

In [11]:
# Show the vocabulary view of the first test review, capped at 200 tokens.
# maxtime has to be passed by keyword: the fourth positional parameter of
# batchGenerator is success_ratio, so a bare 200 would not truncate anything.
for r in batchGenerator(test_data[0:1], 1, vocab, maxtime=200):
    print(vocab.ids_to_words(r[0][0]))

['<s>', '(', 'i', 'am', 'reviewing', 'the', 'DGDGDGDG', '<unk>', 'large', 'print', 'version', 'of', 'the', 'original', 'DGDGDGDG', 'book', 'by', '<unk>', '.', '<s>', 'illustrations', 'are', 'by', '<unk>', '<unk>', '.', '<s>', ')', 'this', 'fanciful', 'book', "'s", 'old-fashioned', 'style', 'and', 'content', 'almost', 'feels', 'as', 'if', 'it', 'were', 'written', 'at', 'the', 'turn', 'of', 'the', '19th', 'century', ',', 'and', 'the', 'james', "'", 'initial', 'misery', 'recalls', 'dickens', '.', '<s>', 'the', 'writing', "'s", 'rough', 'edges', 'make', 'it', 'seem', 'more', 'like', 'a', 'personal', 'story', ',', 'rather', 'than', 'the', 'product', 'of', 'some', 'anonymous', '<unk>', ',', 'the', 'beginning', 'of', 'the', 'book', '(', 'where', 'james', 'magically', 'escapes', 'from', 'his', '<unk>', ')', 'seems', 'contrived', ',', 'the', '<unk>', 'are', 'unbelievably', 'cruel', ',', 'and', 'the', 'writing', 'is', 'somehow', 'flat', '.', '<s>', 'however', ',', 'the', 'book', 'picks', 'up', '

In [12]:
# Model Params:
# Checkpoint path used when saving the trained model and when restoring it for prediction.
trained_filename = 'tf_saved/final_project_lstm_classifier'
# Keyword arguments for LanguageModel(...): vocabulary size, LSTM width, number of stacked layers.
model_params = dict(V=V, H=100, num_layers=2)

# Training parameters
# max_time is passed to batchGenerator as maxtime (sequence length cap).
max_time = 400
batch_size = 50
# learning_rate and keep_prob are fed into the graph by run_epoch.
learning_rate = 0.001
keep_prob = 1.0
num_epochs = 10

In [13]:
import tensorflow as tf

In [14]:
def matmul3d(X, W):
    """Right-multiply the last axis of a rank-3 tensor by a matrix.

    ``X`` is reshaped to two dimensions (keeping its third axis), multiplied
    by ``W``, and reshaped back so the first two axes of ``X`` are preserved
    and the last axis has the size of ``W``'s second axis.
    """
    x_shape = tf.shape(X)
    flattened = tf.reshape(X, [-1, x_shape[2]])
    product = tf.matmul(flattened, W)
    return tf.reshape(product, [x_shape[0], x_shape[1], tf.shape(W)[1]])

class LanguageModel(object):
    """Stacked-LSTM binary text classifier expressed as a TensorFlow graph.

    Token ids are embedded, run through a multi-layer LSTM, and the final
    recurrent state is fed to a tanh hidden layer followed by a single-logit
    output layer. Training minimizes the mean sigmoid cross-entropy with Adam.

    Tensors callers feed or fetch:
        input_w_: int32 ids, first axis batch, second axis time.
        target_y_: float32 labels, one per batch row.
        learning_rate_, dropout_keep_prob_: constants that callers may
            override through feed_dict.
        initial_h_: zero initial state of the LSTM stack.
        logits_, pred_proba_: raw output and its sigmoid.
        loss_, train_step_: mean loss and the Adam update op.
    """
    def makeCell(self, H, keep_prob, num_layers=1):
        """Return a MultiRNNCell of ``num_layers`` LSTM cells with ``H`` units,
        each wrapped with output dropout at ``keep_prob``."""
        cell_layers = []
        for i in range(num_layers):
            cell_layers.append(tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=H, forget_bias=0.0, initializer=tf.contrib.layers.xavier_initializer(), state_is_tuple=False), output_keep_prob=keep_prob))
        return tf.nn.rnn_cell.MultiRNNCell(cell_layers, state_is_tuple=False)

    def __init__(self, V, H, num_layers=1):
        """Build the whole graph.

        Args:
            V: Vocabulary size (number of rows of the embedding matrix).
            H: Embedding width and LSTM hidden size.
            num_layers: Number of stacked LSTM layers.
        """
        self.V = V
        self.H = H
        self.num_layers = num_layers
        
        with tf.name_scope("Training_Parameters"):
            # Defaults only; training code feeds the real values via feed_dict.
            self.learning_rate_ = tf.constant(0.1, name="learning_rate")
            self.dropout_keep_prob_ = tf.constant(1.0, name="dropout_keep_prob")
            # Only referenced by the commented-out gradient clipping code below.
            self.max_grad_norm_ = 5.0

        self.input_w_ = tf.placeholder(tf.int32, [None, None], name="w")
        self.target_y_ = tf.placeholder(tf.float32, [None], name="y")

        with tf.name_scope("batch_size"):
            self.batch_size_ = tf.shape(self.input_w_)[0]
        with tf.name_scope("max_time"):
            self.max_time_ = tf.shape(self.input_w_)[1]
            # One copy of max_time per batch row; not used anywhere else in this class.
            self.ns_ = tf.tile([self.max_time_], [self.batch_size_,], name="ns")

        embedding = tf.get_variable("embedding", [self.V, self.H], dtype=tf.float32
                                    ,initializer=tf.contrib.layers.xavier_initializer())
        inputs = tf.nn.embedding_lookup(embedding, self.input_w_)

        lstm_cell = self.makeCell(self.H, self.dropout_keep_prob_, self.num_layers)
        self.initial_h_ = lstm_cell.zero_state(self.batch_size_, dtype=tf.float32)
        # NOTE(review): no sequence_length is passed, so the final state is presumably
        # taken after every time step of the padded input, padding included - confirm
        # whether that is intended for the classifier below.
        outputs, state = tf.nn.dynamic_rnn(lstm_cell, inputs, initial_state=self.initial_h_)

        self.lstm_output = outputs
        self.final_state = state
        
        # build a 2-layer NN on top of final state to perform text classification
        with tf.variable_scope("classification_hidden_layer"):
            # Input width H*2*num_layers: presumably the concatenated cell and hidden
            # state of every layer (state_is_tuple=False) - TODO confirm for the TF version in use.
            self.w_c_h = tf.get_variable("w_c_h", shape=[H*2*self.num_layers, H], dtype=tf.float32,
                                         initializer=tf.contrib.layers.xavier_initializer())
            self.b_c_h = tf.get_variable("b_c_h", dtype=tf.float32, 
                         initializer=tf.zeros_initializer([H]))
            self.c_h = tf.tanh(tf.matmul(self.final_state, self.w_c_h) + self.b_c_h, name="c_h")
            
        with tf.variable_scope("output_layer"):
            self.w_out = tf.get_variable("W_out", shape=[H, 1], dtype=tf.float32, 
                         initializer=tf.contrib.layers.xavier_initializer())
            self.b_out = tf.get_variable("b_out", dtype=tf.float32, 
                           initializer=tf.zeros_initializer([1]))
            self.logits_ = tf.add(tf.matmul(self.c_h, self.w_out), self.b_out, name="logits")
            
        with tf.name_scope("loss_function"):
            # NOTE(review): squeeze is called without an explicit axis, so it drops every
            # size-1 dimension; with a single-row batch that would include the batch axis - verify.
            self.point_loss_ = tf.nn.sigmoid_cross_entropy_with_logits(tf.squeeze(self.logits_), self.target_y_)
            #self.loss_ = tf.reduce_sum(self.point_loss_)
            self.loss_ = tf.reduce_mean(self.point_loss_)
            
        # train ops
        with tf.name_scope("train_ops"):
            # The commented-out lines are an earlier SGD + gradient-clipping variant.
            #tvars = tf.trainable_variables()
            #grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss_, tvars),self.max_grad_norm_)
            #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_)
            #self.train_step_ = optimizer.apply_gradients(zip(grads, tvars))
            #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate_)
            optimizer = tf.train.AdamOptimizer(self.learning_rate_)
            self.train_step_ = optimizer.minimize(self.loss_)
            
        # prediction
        with tf.name_scope("Prediction"):
            self.pred_proba_ = tf.sigmoid(self.logits_, name="pred_proba")

In [2]:
# try to dump out some intermediate data
def dumpIntermediateOutputs(inputBatch):
    """Run a freshly initialised (untrained) model on a small token batch.

    Intended as a smoke test of the graph: a new model is built in its own
    graph and session, variables are initialised, and the batch is pushed
    through once with dropout disabled.

    Args:
        inputBatch: Iterable of token lists. All token lists must have the
            same length because they are fed as one rectangular batch.

    Returns:
        Tuple ``(pred, logits_)`` with the predicted probabilities and the
        raw logits fetched from the model.

    Raises:
        ValueError: If the batch is empty or its rows differ in length.
    """
    inputBatch = list(inputBatch)
    if not inputBatch:
        raise ValueError("inputBatch must contain at least one token list")
    if len(set(len(tokens) for tokens in inputBatch)) > 1:
        raise ValueError("all token lists in inputBatch must have the same length")

    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)

        session.run(tf.initialize_all_variables())
        w = [vocab.words_to_ids(tokens) for tokens in inputBatch]

        h = session.run(lm.initial_h_, {lm.input_w_: w})
        feed_dict = { lm.input_w_:w,
               lm.initial_h_:h,
               lm.dropout_keep_prob_: 1.0,
               lm.target_y_: [1]*len(w)}

        pred, logits_ = session.run([lm.pred_proba_, lm.logits_], feed_dict)
        return pred, logits_

In [15]:
# Smoke test with two equal-length token lists on an untrained model.
dumpIntermediateOutputs(["this is a test".split(), "this is another test".split()])



(array([[ 0.49952438],
        [ 0.50011986]], dtype=float32), array([[-0.00190239],
        [ 0.00047945]], dtype=float32))

In [16]:
import sklearn.metrics as metrics
def score_batch(pred_probs, targets):
    """Threshold probabilities at 0.5 and compute classification metrics.

    Args:
        pred_probs: Iterable of predicted probabilities; a value strictly
            greater than 0.5 becomes label 1, anything else label 0.
        targets: True labels aligned with ``pred_probs``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    predicted_labels = []
    for prob in pred_probs:
        predicted_labels.append(1 if prob > 0.5 else 0)
    return {
        "accuracy": metrics.accuracy_score(targets, predicted_labels),
        "precision": metrics.precision_score(targets, predicted_labels),
        "recall": metrics.recall_score(targets, predicted_labels),
        "f1": metrics.f1_score(targets, predicted_labels),
    }

In [17]:
# baseline score - no training
def baselineScore(dataset):
    """Print the metrics of an untrained model on ``dataset``.

    The whole dataset is requested as a single batch (batch size equals the
    dataset length), a new model is built and initialised in its own graph
    and session, and the resulting scores are printed. Nothing is returned.
    """
    batches = batchGenerator(dataset, len(dataset), vocab, 0.8, max_time)

    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)

        session.run(tf.initialize_all_variables())

        batch_no = 0
        for w, y, raw in batches:
            print("batch #%s"%batch_no)
            zero_state = session.run(lm.initial_h_, {lm.input_w_: w})
            pred_prob = session.run(lm.pred_proba_, {
                lm.input_w_: w,
                lm.initial_h_: zero_state,
                lm.dropout_keep_prob_: 1.0,
            })
            print(score_batch(pred_prob, y))
            batch_no += 1
# Untrained-model reference scores for each split; every call builds a new,
# independently initialised model.
print("train set baseline")
baselineScore(train_data)
print("dev set baseline")
baselineScore(dev_data)
print("test set baseline")
baselineScore(test_data)



train set baseline
batch #0




{'f1': 0.29594151751953618, 'recall': 0.19911804613297152, 'precision': 0.57605495583905786, 'accuracy': 0.53449999999999998}
dev set baseline
batch #0




{'f1': 0.29513343799058084, 'recall': 0.20280474649406688, 'precision': 0.5417867435158501, 'accuracy': 0.55100000000000005}
test set baseline
batch #0
{'f1': 0.24066390041493776, 'recall': 0.15441959531416399, 'precision': 0.54511278195488722, 'accuracy': 0.54249999999999998}


In [18]:
def run_epoch(lm, session, batch_iterator, train=False,
              verbose=False, tick_s=10, 
              keep_prob=1.0, learning_rate=0.1):
    """Run the model over every batch of an iterator, optionally training.

    Args:
        lm: Model object exposing ``train_step_``, ``loss_``,
            ``learning_rate_``, ``dropout_keep_prob_``, ``input_w_``,
            ``target_y_`` and ``initial_h_``.
        session: Session whose ``run(fetches, feed_dict)`` executes the graph.
        batch_iterator: Iterable of ``(w, y, raw)`` batches.
        train: When true the training op is run; otherwise a no-op is run and
            dropout is disabled regardless of ``keep_prob``.
        verbose: Print a progress line at most every ``tick_s`` seconds.
        tick_s: Minimum number of seconds between progress lines.
        keep_prob: Dropout keep probability used while training.
        learning_rate: Learning rate fed to the model.

    Returns:
        The sum of the per-batch loss values (each of which is already a
        mean over the batch).
    """
    start_time = time.time()
    tick_time = start_time
    total_cost = 0.0
    total_texts = 0
    loss = lm.loss_

    if train:
        train_op = lm.train_step_
    else:
        train_op = tf.no_op()
        keep_prob = 1.0

    for i, (w, y, _) in enumerate(batch_iterator):
        feed_dict = {
            lm.learning_rate_: learning_rate,
            lm.dropout_keep_prob_: keep_prob,
            lm.input_w_: w,
            lm.target_y_: y
        }
        # Fetch the zero state for this batch size and feed it back explicitly.
        state = session.run(lm.initial_h_, feed_dict)
        feed_dict[lm.initial_h_] = state
        _, loss_val = session.run([train_op, loss], feed_dict)

        total_cost += loss_val
        total_texts += len(w)

        if verbose and (time.time() - tick_time >= tick_s):
            # loss_val is a per-batch mean, so the running average must be
            # taken over batches, not over individual texts.
            avg_cost = total_cost / (i + 1)
            elapsed = time.time() - start_time
            avg_tps = total_texts / elapsed if elapsed > 0 else 0.0
            print("[batch %d]: seen %d texts at %d texts/s, loss = %.3f" % (i, total_texts, avg_tps, avg_cost))
            tick_time = time.time()
    return total_cost

In [20]:
# run training
import time
import utils; reload(utils)
def _scoreSplit(lm, session, dataset):
    """Print the classification metrics of ``lm`` on ``dataset`` scored as one batch."""
    if len(dataset) == 0:
        # batchGenerator would be asked for a batch size of zero.
        return
    bi = batchGenerator(dataset, len(dataset), vocab, 0.8, max_time)
    for w, y, raw in bi:
        h = session.run(lm.initial_h_, {lm.input_w_: w})
        feed_dict = {lm.input_w_: w,
                     lm.initial_h_: h,
                     lm.dropout_keep_prob_: 1.0}
        pred_prob = session.run(lm.pred_proba_, feed_dict)
        print(score_batch(pred_prob, y))


def runTraining(print_interval=5):
    """Train a new model for ``num_epochs`` epochs and save it.

    A model is built from ``model_params`` in its own graph and session.
    Each epoch trains on ``train_data`` in batches of ``batch_size`` and then
    prints the metrics on the full train and dev splits. After the last
    epoch the session is saved to ``trained_filename``.

    Args:
        print_interval: Accepted for compatibility; not used by this function.
    """
    with tf.Graph().as_default(), tf.Session() as session:
        tf.set_random_seed(42)
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)
        session.run(tf.initialize_all_variables())
        saver = tf.train.Saver()

        for epoch in range(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = batchGenerator(train_data, batch_size, vocab, 0.8, max_time)
            print("[epoch %d] Starting epoch %d" % (epoch, epoch))
            cost = run_epoch(lm, session, bi, train=True, keep_prob=keep_prob, learning_rate=learning_rate)
            print("%s: total loss: %.03f" % ("Training", cost))
            print("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))

            print("train score")
            _scoreSplit(lm, session, train_data)

            print("dev score")
            _scoreSplit(lm, session, dev_data)
        # Save final model
        saver.save(session, trained_filename)
# Trains for num_epochs epochs and writes the checkpoint to trained_filename.
runTraining()



[epoch 1] Starting epoch 1
Training: total loss: 82.795
[epoch 1] Completed in 0:06:21
train score
{'f1': 0.42285219649426531, 'recall': 0.33141112618724561, 'precision': 0.5839808726838015, 'accuracy': 0.55549999999999999}
dev score
{'f1': 0.40297498309668695, 'recall': 0.32146709816612729, 'precision': 0.53985507246376807, 'accuracy': 0.5585}
[epoch 2] Starting epoch 2
Training: total loss: 107.964
[epoch 2] Completed in 0:06:16
train score
{'f1': 0.65928659286592872, 'recall': 1.0, 'precision': 0.49174311926605507, 'accuracy': 0.49216666666666664}
dev score
{'f1': 0.63384615384615384, 'recall': 1.0, 'precision': 0.46396396396396394, 'accuracy': 0.46450000000000002}
[epoch 3] Starting epoch 3
Training: total loss: 83.899
[epoch 3] Completed in 0:02:45
train score
{'f1': 0.65891819400983453, 'recall': 1.0, 'precision': 0.49133333333333334, 'accuracy': 0.49133333333333334}
dev score
{'f1': 0.63341305090536393, 'recall': 1.0, 'precision': 0.46350000000000002, 'accuracy': 0.4635000000000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'f1': 0.0, 'recall': 0.0, 'precision': 0.0, 'accuracy': 0.50866666666666671}
dev score
{'f1': 0.0, 'recall': 0.0, 'precision': 0.0, 'accuracy': 0.53649999999999998}
[epoch 10] Starting epoch 10
Training: total loss: 83.284
[epoch 10] Completed in 0:02:45
train score
{'f1': 0.0, 'recall': 0.0, 'precision': 0.0, 'accuracy': 0.50866666666666671}
dev score
{'f1': 0.0, 'recall': 0.0, 'precision': 0.0, 'accuracy': 0.53649999999999998}


In [154]:
def predictClass(w):
    """Restore the saved model and score one batch of id sequences.

    A new graph and session are created on every call, the model is rebuilt
    from ``model_params`` and its variables are restored from
    ``trained_filename``.

    Args:
        w: Batch of token-id sequences fed to the model's input placeholder.

    Returns:
        Tuple ``(pred, logits)`` with predicted probabilities and raw logits.
    """
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.variable_scope("model", reuse=None):
            lm = LanguageModel(**model_params)
        tf.train.Saver().restore(session, trained_filename)

        zero_state = session.run(lm.initial_h_, {lm.input_w_: w})
        pred, logits = session.run(
            [lm.pred_proba_, lm.logits_],
            {lm.input_w_: w,
             lm.initial_h_: zero_state,
             lm.dropout_keep_prob_: 1.0})
    return pred, logits

In [155]:
# Spot-check ten dev reviews one at a time: print the true label, then the
# restored model's (probability, logit) pair. predictClass rebuilds and restores
# the model on every call.
bi = batchGenerator(dev_data[100:110], 1, vocab, 0.8, max_time)
for (w,y,raw) in bi:
    print(y)
    #print(raw)
    print(predictClass(w))



[1]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[0]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[1]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[0]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[1]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[0]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[1]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[0]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[1]




(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
[1]
(array([[ 0.07615406]], dtype=float32), array([[-2.49578691]], dtype=float32))
