# Neural Network Model (BOW)

In [1]:
import numpy as np
import tensorflow as tf
import pickle
import gensim
import os, shutil, time, datetime
from importlib import reload
import pandas as pd
from helpers import patched_numpy_io
import model.NeuralBOW as NeuralBOW
import model.NeuralBOW_withEmbeddings as NeuralBOW_withEmbeddings

### Data Setup

In [None]:
# Load word2vec pre-trained model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
word2vec_matrix = word2vec_model.syn0

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Add <UNK> and <PAD> vectors to word2vec matrix
unk_vec = np.random.uniform(-0.25,0.25,300) 
pad_vec = np.zeros(300) 

word2vec_matrix = np.vstack((word2vec_matrix, unk_vec))
word2vec_matrix = np.vstack((word2vec_matrix, pad_vec))

word2vec_words = [k for k in word2vec_model.vocab.keys()]
word2vec_words = word2vec_words + ['<unk>', '<pad>']

id_to_word = dict(enumerate(word2vec_words))
word_to_id = {v:k for k,v in id_to_word.items()}

In [None]:
def input_generator(data, max_word):
    
    # This function takes the input data in tokens and converted into matrix with indices
    
    inputs_x = np.zeros([len(data), max_word])
    inputs_ns = np.zeros([len(data)])
    inputs_y = np.zeros([len(data)])
    num_doc = 0
    
    for i in data.index:
        inputs_y[num_doc] = 1 if data.loc[i, 'y'] == 1 else 0
        
        # loop through words up to max words (clips longer docs)
        words = data.loc[i, 'text']
        num_word = 0
        for w in words:
            if num_word < max_word:
                inputs_x[num_doc][num_word] = word_to_id.get(w)
                num_word += 1
            else:
                break
        
        inputs_ns[num_doc]=num_word
        
        num_doc += 1
    
    return inputs_x, inputs_ns, inputs_y

def batch_generator(inputs_x, inputs_ns, inputs_y, batch_size):
    num_batch = (len(inputs_y)) // batch_size
    for i in range(num_batch):
        yield inputs_x[i*batch_size:(i+1)*batch_size], inputs_ns[i*batch_size:(i+1)*batch_size], inputs_y[i*batch_size:(i+1)*batch_size]

def calc_acc(pred_y, true_y):
    return np.sum(pred_y==true_y) / len(true_y)
        
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)

In [None]:
# Load data
train_data = pickle.load(open("train.p", "rb"))
dev_data = pickle.load(open("dev.p", "rb"))
test_data = pickle.load(open("test.p", "rb"))

In [None]:
# Specify the maximum word length per doc 
max_doc_length = 3000

train_x, train_ns, train_y = input_generator(train_data, max_doc_length)
dev_x, dev_ns, dev_y = input_generator(dev_data, max_doc_length)
test_x, test_ns, test_y = input_generator(test_data, max_doc_length)

In [None]:
print(train_x.shape)
print(train_ns.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_ns.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_ns.shape)
print(test_y.shape)

### Train and Evaluate Model Without Pretrained Embeddings

In [None]:
reload(NeuralBOW)

x, ns, y = train_x, train_ns, train_y
batch_size = 196 # Note the number of examples must be evenly divisible by this number
num_epochs = 25

# Specify model hyperparameters as used by model_fn
model_params = dict(V=word2vec_matrix.shape[0], embed_dim=word2vec_matrix.shape[1], hidden_dims=[100,50],
                    num_classes=2,
                    encoder_type='bow', dropout_rate=0.25,
                    lr=0.3, optimizer='adagrad', beta=0.01)
model_fn = NeuralBOW.classifier_model_fn

total_batches = 0
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    predictions_     = estimator_spec.predictions
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer())
    
    # Train the graph
    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        total_loss = 0
        train_predict_prob = np.zeros(batch_size)
        train_predict_class = np.zeros(batch_size)
        batch_num = 0
        
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

        # Run a training epoch.
        for (bx, bns, by) in batch_generator(x, ns, y, batch_size):
            # feed NumPy arrays into the placeholder Tensors
            feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
            batch_loss, predictions, _ = sess.run([loss_, predictions_, train_op_], feed_dict=feed_dict)
        
            # Compute some statistics
            total_batches += 1
            total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
            
            if batch_num == 0:
                train_predict_prob = predictions['proba']
                train_predict_class = predictions['max']
            else:
                train_predict_prob = np.concatenate((train_predict_prob, predictions['proba']))
                train_predict_class = np.concatenate((train_predict_class, predictions['max']))
            
            batch_num += 1
        
        print("[epoch {:d}] Completed in {:s}".format(epoch, pretty_timedelta(since=t0_epoch)))
        print("Total loss for epoch: %0.2f" % total_loss)
        print("Accuracy on training data is: %0.2f" % calc_acc(train_predict_class, train_y))
    
    
    
    # Generate accuracy and predicted probabilities for the test set
    # Note that this runs the training step, but only updates after it has already made predictions
    # Doing this because re-instantiating this with PREDICT mode later generates a new graph
    
    x, ns, y = test_x, test_ns, test_y
    test_predict_prob = np.zeros(batch_size)
    test_predict_class = np.zeros(batch_size)
    batch_num = 0
    
    for (bx, bns, by) in batch_generator(x, ns, y, 237):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        batch_loss, predictions, _ = sess.run([loss_, predictions_, train_op_], feed_dict=feed_dict)
        
        if batch_num == 0:
            test_predict_prob = predictions['proba']
            test_predict_class = predictions['max']
        else:
            test_predict_prob = np.concatenate((test_predict_prob, predictions['proba']))
            test_predict_class = np.concatenate((test_predict_class, predictions['max']))
            
        batch_num += 1
    print("Accuracy on testing data is: %0.2f" % calc_acc(test_predict_class, test_y))
    

In [127]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds_df = pd.DataFrame({'bins': bins, 'log_prob': np.log(train_predict_prob)})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.061668
"(1814.6, 1837.2]",1.015342
"(1837.2, 1859.8]",1.049687
"(1859.8, 1882.4]",1.074655
"(1882.4, 1905.0]",1.050141
"(1905.0, 1927.6]",1.068226
"(1927.6, 1950.2]",1.005111
"(1950.2, 1972.8]",0.917637
"(1972.8, 1995.4]",0.821081
"(1995.4, 2018.0]",0.845767


### Create and Train Estimator

Note: consistently getting an error ''Tensor' object has no attribute 'sparse_read'. Possibly due to not writing a vocab?

In [104]:
reload(NeuralBOW)

# Specify model hyperparameters as used by model_fn
model_params = dict(V=word2vec_matrix.shape[0], embed_dim=word2vec_matrix.shape[1], hidden_dims=[25],
                    num_classes=2,
                    encoder_type='bow', dropout_rate=0.25,
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
    
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
#ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=NeuralBOW.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_20180810-0621', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbe0e7db4e0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_20180810-0621' --port 6006

Then in your browser, open: http://localhost:6006


In [105]:
# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=196, total_epochs=6, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )

# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": dev_x, "ns": dev_ns}, y=dev_y,
                    batch_size=196, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train for a few epochs, then evaluate on dev
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.


TypeError: Value passed to parameter 'indices' has DataType float64 not in list of allowed values: int32, int64

### Train and Evaluate Model with word2vec Embeddings

In [133]:
reload(NeuralBOW_withEmbeddings)

x, ns, y = train_x, train_ns, train_y
batch_size = 196 # Note the number of examples must be evenly divisible by this number
num_epochs = 25

# Specify model hyperparameters as used by model_fn
model_params = dict(V=word2vec_matrix.shape[0], embed_dim=word2vec_matrix.shape[1], hidden_dims=[100,50],
                    num_classes=2,
                    encoder_type='bow', dropout_rate=0.25,
                    lr=0.1, optimizer='adagrad', beta=0.01)
model_fn = NeuralBOW_withEmbeddings.classifier_model_fn

total_batches = 0
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    embedding_  = tf.placeholder(tf.float32, shape=(word2vec_matrix.shape[0], word2vec_matrix.shape[1])) # [V, embed_dim]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    predictions_     = estimator_spec.predictions
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer(), feed_dict={embedding_: word2vec_matrix})
    
    # Train the graph
    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        total_loss = 0
        train_predict_prob = np.zeros(batch_size)
        train_predict_class = np.zeros(batch_size)
        batch_num = 0
        
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

        # Run a training epoch.
        for (bx, bns, by) in batch_generator(x, ns, y, batch_size):
            # feed NumPy arrays into the placeholder Tensors
            feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
            batch_loss, predictions, _ = sess.run([loss_, predictions_, train_op_], feed_dict=feed_dict)
        
            # Compute some statistics
            total_batches += 1
            total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
            
            if batch_num == 0:
                train_predict_prob = predictions['proba']
                train_predict_class = predictions['max']
            else:
                train_predict_prob = np.concatenate((train_predict_prob, predictions['proba']))
                train_predict_class = np.concatenate((train_predict_class, predictions['max']))
            
            batch_num += 1
        
        print("[epoch {:d}] Completed in {:s}".format(epoch, pretty_timedelta(since=t0_epoch)))
        print("Total loss for epoch: %0.2f" % total_loss)
        print("Accuracy on training data is: %0.2f" % calc_acc(train_predict_class, train_y))
    
    
    
    # Generate accuracy and predicted probabilities for the test set
    # Note that this runs the training step, but only updates after it has already made predictions
    # Doing this because re-instantiating this with PREDICT mode later generates a new graph
    
    x, ns, y = test_x, test_ns, test_y
    test_predict_prob = np.zeros(batch_size)
    test_predict_class = np.zeros(batch_size)
    batch_num = 0
    
    for (bx, bns, by) in batch_generator(x, ns, y, 237):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        batch_loss, predictions, _ = sess.run([loss_, predictions_, train_op_], feed_dict=feed_dict)
        
        if batch_num == 0:
            test_predict_prob = predictions['proba']
            test_predict_class = predictions['max']
        else:
            test_predict_prob = np.concatenate((test_predict_prob, predictions['proba']))
            test_predict_class = np.concatenate((test_predict_class, predictions['max']))
            
        batch_num += 1
    print("Accuracy on testing data is: %0.2f" % calc_acc(test_predict_class, test_y))
    

InvalidArgumentError: You must feed a value for placeholder tensor 'Encoder/Embedding_Layer/Placeholder' with dtype float and shape [3000002,300]
	 [[Node: Encoder/Embedding_Layer/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[3000002,300], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'Encoder/Embedding_Layer/Placeholder', defined at:
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-133-4e9b41807a3c>", line 28, in <module>
    params=model_params)
  File "/home/alyssa_yerukhimov/final/model/NeuralBOW_withEmbeddings.py", line 178, in classifier_model_fn
    **params)
  File "/home/alyssa_yerukhimov/final/model/NeuralBOW_withEmbeddings.py", line 147, in BOW_encoder
    xs_ = embedding_layer(ids_, V, embed_dim)  # replace with a call to embedding_layer
  File "/home/alyssa_yerukhimov/final/model/NeuralBOW_withEmbeddings.py", line 25, in embedding_layer
    embedding_ = tf.placeholder(tf.float32, shape=(V, embed_dim))
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1808, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 4848, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/alyssa_yerukhimov/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Encoder/Embedding_Layer/Placeholder' with dtype float and shape [3000002,300]
	 [[Node: Encoder/Embedding_Layer/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[3000002,300], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [127]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds_df = pd.DataFrame({'bins': bins, 'log_prob': np.log(train_predict_prob)})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.061668
"(1814.6, 1837.2]",1.015342
"(1837.2, 1859.8]",1.049687
"(1859.8, 1882.4]",1.074655
"(1882.4, 1905.0]",1.050141
"(1905.0, 1927.6]",1.068226
"(1927.6, 1950.2]",1.005111
"(1950.2, 1972.8]",0.917637
"(1972.8, 1995.4]",0.821081
"(1995.4, 2018.0]",0.845767
