# Neural Network Model (CNNRNN)

In [1]:
import numpy as np
import tensorflow as tf
import pickle
import gensim
import os, shutil, time
import model.CNNRNN as CNNRNN
from importlib import reload

  from ._conv import register_converters as _register_converters


In [2]:
# Load the pre-trained GoogleNews word2vec embeddings (binary word2vec format).
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

# `syn0` is a deprecated alias for the embedding matrix in newer gensim releases
# (accessing it emits a DeprecationWarning); prefer `vectors` when the installed
# version provides it and fall back to `syn0` on older versions.
word2vec_matrix = word2vec_model.vectors if hasattr(word2vec_model, 'vectors') else word2vec_model.syn0

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Add <UNK> and <PAD> vectors to word2vec matrix
# Vocabulary of the pre-trained model.
# NOTE(review): assumes the iteration order of `vocab` matches the row order of
# the embedding matrix, exactly as the original cell did -- confirm for the
# installed gensim version.
base_words = list(word2vec_model.vocab.keys())

# Always start from the first len(base_words) rows so that re-running this cell
# does not stack additional <unk>/<pad> rows onto an already-extended matrix.
base_matrix = word2vec_matrix[:len(base_words)]
embedding_dim = base_matrix.shape[1]

# <unk> gets a small random vector, <pad> the zero vector. A seeded, local
# generator makes the <unk> vector reproducible across kernel restarts without
# touching NumPy's global random state.
unk_vec = np.random.RandomState(42).uniform(-0.25, 0.25, embedding_dim)
pad_vec = np.zeros(embedding_dim)

# One vstack instead of two: the large matrix is copied only once.
word2vec_matrix = np.vstack((base_matrix, unk_vec, pad_vec))

# The two special tokens take the last two ids, in the same order as their rows.
word2vec_words = base_words + ['<unk>', '<pad>']

# Lookup tables in both directions: row id -> token and token -> row id.
id_to_word = dict(enumerate(word2vec_words))
word_to_id = {v: k for k, v in id_to_word.items()}

In [4]:
def input_generator(data, max_sent, max_word, vocab=None):
    """Convert tokenised documents into padded word-id matrices and one-hot labels.

    For every row of ``data`` the tokens in the ``'text'`` column are joined with
    spaces and split into sentences on the ``'</s>'`` marker. Each non-empty
    sentence fills one row of that document's ``(max_sent, max_word)`` matrix with
    the ids of its words; sentences beyond ``max_sent`` and words beyond
    ``max_word`` are dropped, and unused cells hold the ``'<pad>'`` id.

    Words missing from the vocabulary are mapped to the ``'<unk>'`` id. (A plain
    ``dict.get(w)`` would return ``None``, which NumPy stores as NaN in a float
    array, so unknown words would never reach the ``<unk>`` embedding.)

    Args:
        data: pandas-style table (uses ``len(data)``, ``data.index`` and
            ``data.loc``) with a ``'text'`` column holding a sequence of string
            tokens and a ``'y'`` column holding the label.
        max_sent: maximum number of sentences kept per document.
        max_word: maximum number of words kept per sentence.
        vocab: optional token -> id mapping containing ``'<unk>'`` and
            ``'<pad>'``. Defaults to the notebook-level ``word_to_id``.

    Returns:
        Tuple ``(inputs_x, inputs_y)``: ``inputs_x`` is a float array of shape
        ``(len(data), max_sent, max_word)`` holding word ids, and ``inputs_y`` is
        a float array of shape ``(len(data), 2)`` with ``[0, 1]`` where
        ``y == 1`` and ``[1, 0]`` otherwise.
    """
    if vocab is None:
        vocab = word_to_id

    # Look the special ids up instead of hard-coding the pad index, so the
    # function stays correct if the vocabulary size changes.
    unk_id = vocab['<unk>']
    pad_id = vocab['<pad>']

    num_docs = len(data)
    # Start fully padded; the dtype matches the float64 arrays returned before.
    inputs_x = np.full([num_docs, max_sent, max_word], pad_id, dtype=np.float64)
    inputs_y = np.zeros([num_docs, 2])

    for doc_pos, i in enumerate(data.index):
        inputs_y[doc_pos] = [0, 1] if data.loc[i, 'y'] == 1 else [1, 0]

        sents = ' '.join(data.loc[i, 'text']).split('</s>')
        num_sent = 0

        for s in sents:
            if num_sent >= max_sent:
                break
            if len(s) == 0:
                # Empty fragment (e.g. after a trailing '</s>'): uses no row.
                continue

            # split() already ignores leading/trailing whitespace.
            word_ids = [vocab.get(w, unk_id) for w in s.split()[:max_word]]
            if word_ids:
                inputs_x[doc_pos, num_sent, :len(word_ids)] = word_ids
            num_sent += 1

    return inputs_x, inputs_y

def batch_generator(inputs_x, inputs_y, batch_size):
    """Yield consecutive ``(x, y)`` slices of ``batch_size`` examples each.

    Only complete batches are produced: when ``len(inputs_y)`` is not a multiple
    of ``batch_size`` the trailing examples are not yielded.
    """
    full_batches = len(inputs_y) // batch_size
    for start in range(0, full_batches * batch_size, batch_size):
        stop = start + batch_size
        yield inputs_x[start:stop], inputs_y[start:stop]
        
def score_dataset(lm, session, inputs_x, inputs_y, name="Train"):
    """Run one non-training pass over a dataset, print its average loss, return it.

    Batches of 1000 examples are drawn with ``batch_generator`` and passed to
    ``run_epoch`` with ``train=False`` and a learning rate of 0. The printed line
    also shows ``exp(loss)``, labelled "perplexity".
    """
    # No parameter updates happen here, so larger batches are used for speed.
    eval_batches = batch_generator(inputs_x, inputs_y, batch_size=1000)
    avg_loss = run_epoch(lm, session, eval_batches,
                         train=False, verbose=False,
                         tick_s=3600, learning_rate=0.0)
    report = "{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(
        name, avg_loss, np.exp(avg_loss))
    print(report)
    return avg_loss

def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Format the time elapsed between two timestamps as hours, minutes, seconds.

    Args:
        fmt: %-style format string receiving ``(hours, minutes, seconds)``.
        since: start time in seconds (as returned by ``time.time()``);
            defaults to the current time.
        until: end time in seconds; defaults to the current time.

    Returns:
        ``fmt`` filled in with the hours, minutes and seconds of ``until - since``.
    """
    now = time.time()
    # Compare against None explicitly: `since or now` would silently replace a
    # legitimate timestamp of 0 with the current time.
    if since is None:
        since = now
    if until is None:
        until = now
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)

In [5]:
# Load the train / dev / test splits from their pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file once it is read."""
    with open(path, "rb") as f:
        return pickle.load(f)

train_data = _load_pickle("train.p")
dev_data = _load_pickle("dev.p")
test_data = _load_pickle("test.p")

In [6]:
# Size limits for the input matrices: at most `max_sent_length` sentences per
# document and at most `sequence_length` words per sentence.
max_sent_length, sequence_length = 200, 100

# Convert each split (train, dev, test -- in that order) into index matrices and labels.
(train_x, train_y), (dev_x, dev_y), (test_x, test_y) = (
    input_generator(split, max_sent_length, sequence_length)
    for split in (train_data, dev_data, test_data)
)

In [7]:
# Show the shape of every input/label array, one per line (train, dev, test).
print(*(arr.shape for arr in (train_x, train_y, dev_x, dev_y, test_x, test_y)), sep="\n")

(17052, 200, 100)
(17052, 2)
(2131, 200, 100)
(2131, 2)
(2133, 200, 100)
(2133, 2)


In [8]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    """Run the model over every batch once and return the mean per-batch loss.

    Args:
        lm: model object exposing ``loss_``, ``accuracy_``, ``input_x_``,
            ``input_y_``, ``learning_rate_`` and, when training, ``train_step_``.
        session: object with a ``run(fetches, feed_dict=...)`` method
            (a ``tf.Session`` in this notebook).
        batch_iterator: iterable of ``(x, y)`` batches.
        train: when True, ``lm.train_step_`` is fetched alongside the loss and
            accuracy; when False only the loss and accuracy are fetched.
        verbose: when True, print the running average loss and accuracy at most
            once every ``tick_s`` seconds.
        tick_s: minimum number of seconds between two status lines.
        learning_rate: value fed to ``lm.learning_rate_``; required.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        AssertionError: if ``learning_rate`` is None.
        ValueError: if ``batch_iterator`` yields no batches.
    """
    assert learning_rate is not None, "learning_rate must be provided"
    tick_time = time.time()  # time of the last status line
    total_cost = 0.0  # sum of per-batch losses
    total_accuracy = 0.0  # sum of per-batch accuracies
    total_batches = 0

    # Fetch the training op only when training. This avoids creating a fresh
    # `tf.no_op()` node in the default graph on every evaluation call.
    fetches = [lm.loss_, lm.accuracy_]
    if train:
        fetches.append(lm.train_step_)

    for i, (x, y) in enumerate(batch_iterator):
        feed_dict = {lm.input_x_: x,
                     lm.input_y_: y,
                     lm.learning_rate_: learning_rate}

        results = session.run(fetches, feed_dict=feed_dict)
        cost, pred_accuracy = results[0], results[1]

        total_cost += cost
        total_accuracy += pred_accuracy
        total_batches = i + 1

        # Print average loss-so-far for epoch
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_accuracy = total_accuracy / total_batches
            print("[batch {:d}]: loss = {:.3f}, accuracy = {:.3f}".format(i, avg_cost, avg_accuracy))
            tick_time = time.time()  # reset time ticker

    if total_batches == 0:
        # Happens, for example, when the dataset is smaller than one batch.
        raise ValueError("run_epoch received no batches; check that the dataset "
                         "contains at least one full batch")

    return total_cost / total_batches

In [11]:
# Training hyper-parameters
max_time = 25
batch_size = 200
learning_rate = 0.01
num_epochs = 5

# Keyword arguments for the CNNRNN constructor. The input dimensions are read
# from the training array's shape, and the vocabulary / embedding sizes from the
# (extended) word2vec matrix.
model_params = {
    'max_sent_length': train_x.shape[1],
    'batch_size': batch_size,
    'num_classes': 2,
    'sequence_length': train_x.shape[2],
    'vocab_size': word2vec_matrix.shape[0],
    'embedding_size': word2vec_matrix.shape[1],
    'filter_sizes': [1, 2, 3],
    'conv_output_len': 50,
    'num_rnn_units': 128,
}

# Where checkpoints are written: per-epoch checkpoints use the "model" prefix,
# the final model uses "model_trained".
TF_SAVEDIR = "./model/temp"
checkpoint_filename, trained_filename = (
    os.path.join(TF_SAVEDIR, stem) for stem in ("model", "model_trained")
)

In [None]:
# Pick up any edits made to the model module since it was first imported.
reload(CNNRNN)

# Will print status every this many seconds
print_interval = 5

# Build the model graph: core (forward) graph first, then the training ops.
lm = CNNRNN.CNNRNN(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Make sure the checkpoint directory exists. Existing contents are kept
# (nothing is deleted here).
os.makedirs(TF_SAVEDIR, exist_ok=True)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    # NOTE(review): the graph is already built at this point; a graph-level seed
    # normally has to be set before the random ops are created to take effect.
    # Consider setting it on lm.graph before BuildCoreGraph() -- confirm against
    # the CNNRNN implementation.
    tf.set_random_seed(42)

    # The embedding placeholder is fed with the pre-trained word2vec matrix
    # while the variables are initialised.
    session.run(initializer, feed_dict={lm.embedding_: word2vec_matrix})

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()
        bi = batch_generator(train_x, train_y, batch_size)

        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

        # Run a training epoch, reporting progress every `print_interval`
        # seconds (previously the interval was hard-coded and the setting unused).
        run_epoch(lm, session, bi, train=True, verbose=True,
                  tick_s=print_interval, learning_rate=learning_rate)

        print("[epoch {:d}] Completed in {:s}".format(epoch, pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

    # Save final model
    saver.save(session, trained_filename)

    # Score while the session is still open; it is closed when this block exits.
    score_dataset(lm, session, train_x, train_y, 'Train Set')
    score_dataset(lm, session, dev_x, dev_y, 'Dev Set')

[epoch 1] Starting epoch 1
[batch 0]: loss = 1.325, accuracy = 0.435
[batch 1]: loss = 5.842, accuracy = 0.540
[batch 2]: loss = 4.467, accuracy = 0.603
[batch 3]: loss = 4.698, accuracy = 0.544
[batch 4]: loss = 3.990, accuracy = 0.536
[batch 5]: loss = 3.604, accuracy = 0.551
[batch 6]: loss = 3.253, accuracy = 0.569
[batch 7]: loss = 2.949, accuracy = 0.569
[batch 8]: loss = 2.788, accuracy = 0.541
[batch 9]: loss = 2.597, accuracy = 0.534
[batch 10]: loss = 2.443, accuracy = 0.545
[batch 11]: loss = 2.320, accuracy = 0.556
[batch 12]: loss = 2.232, accuracy = 0.559
[batch 13]: loss = 2.137, accuracy = 0.563
[batch 14]: loss = 2.046, accuracy = 0.560
[batch 15]: loss = 1.990, accuracy = 0.550
[batch 16]: loss = 1.929, accuracy = 0.539
[batch 17]: loss = 1.862, accuracy = 0.541
[batch 18]: loss = 1.804, accuracy = 0.546
[batch 19]: loss = 1.757, accuracy = 0.552
[batch 20]: loss = 1.715, accuracy = 0.556
[batch 21]: loss = 1.679, accuracy = 0.558
[batch 22]: loss = 1.634, accuracy = 

In [None]:
# The training session was closed when its `with` block exited, so it cannot be
# reused here. Open a new session on the same graph and restore the weights
# saved at the end of training before scoring the test set.
with tf.Session(graph=lm.graph) as session:
    saver.restore(session, trained_filename)
    score_dataset(lm, session, test_x, test_y, name="Test set")