In [1]:
#DROPOUT + L2-REGULARIZATION
from __future__ import print_function
from functools import reduce
import json
import os
import re
import tarfile
import tempfile
import math

import numpy as np
np.random.seed(1337)  

import keras
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import merge, recurrent, Dense, Input, Dropout, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
def extract_tokens_from_binary_parse(parse):
    return parse.replace('(', ' ').replace(')', ' ').replace('-LRB-', '(').replace('-RRB-', ')').split()

def yield_examples(fn, skip_no_majority=True, limit=None):
  for i, line in enumerate(open(fn)):
    if limit and i > limit:
      break
    data = json.loads(line)
    label = data['gold_label']
    s1 = bytes(' '.join(extract_tokens_from_binary_parse(data['sentence1_binary_parse'])))
    s2 = bytes(' '.join(extract_tokens_from_binary_parse(data['sentence2_binary_parse'])))
    if skip_no_majority and label == '-':
      continue
    yield (label, s1, s2)

def get_data(fn, limit=None):
  raw_data = list(yield_examples(fn=fn, limit=limit))
  left = [s1 for _, s1, s2 in raw_data]
  right = [s2 for _, s1, s2 in raw_data]
  print(max(len(x.split()) for x in left))
  print(max(len(x.split()) for x in right))

  LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
  Y = np.array([LABELS[l] for l, s1, s2 in raw_data])
  Y = np_utils.to_categorical(Y, len(LABELS))

  return left, right, Y

In [3]:
training = get_data('data/snli_1.0_train.jsonl')
validation = get_data('data/snli_1.0_dev.jsonl')
test = get_data('data/snli_1.0_test.jsonl')

tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(training[0] + training[1])
VOCAB = len(tokenizer.word_counts) + 1
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}


USE_GLOVE = True
TRAIN_EMBED = False
EMBED_HIDDEN_SIZE = 300
MAX_LEN = 42

82
62
59
55
57
30


In [4]:
to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X), maxlen=MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

print('Vocab size =', VOCAB)

Vocab size = 42391


In [5]:
GLOVE_STORE = 'data/precomputed_glove.weights'
if USE_GLOVE:
  if not os.path.exists(GLOVE_STORE):
    print('Computing GloVe')
  
    embeddings_index = {}
    f = open('glove.840B.300d.txt')
    for line in f:
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs
    f.close()
    
    embedding_matrix = np.zeros((VOCAB, EMBED_HIDDEN_SIZE))
    for word, i in tokenizer.word_index.iteritems():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
      else:
        print('Missing from GloVe: {}'.format(word))
  
    np.save(open(GLOVE_STORE, 'w'), embedding_matrix)

  print('Loading GloVe')
  embedding_matrix = np.load(open(GLOVE_STORE))

  print('Total number of null word embeddings:')
  print(np.sum(np.sum(embedding_matrix, axis=1) == 0))

  embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, weights=[embedding_matrix], input_length=MAX_LEN, trainable=TRAIN_EMBED)
else:
  embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, input_length=MAX_LEN)

Loading GloVe
Total number of null word embeddings:
4043


In [6]:
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
from tensorflow.examples.tutorials.mnist import input_data


input_vec_size = 300
lstm_size = 300
time_step_size = 42



trXP, trXH, trY, teXP, teXH, teY = training[0], training[1], training[2], validation[0], validation[1], validation[2]
teXP1, teXH1, teY1 = test[0], test[1], test[2]

print(trXP.shape, trXH.shape, trY.shape, teXP.shape, teXH.shape, teY.shape)
print(teXP1.shape, teXH1.shape, teY1.shape)

(549367, 42) (549367, 42) (549367, 3) (9842, 42) (9842, 42) (9842, 3)
(9824, 42) (9824, 42) (9824, 3)


In [7]:
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def model(X_P, X_H, lstm_size, dropout):
    # X, input shape: (batch_size, time_step_size, input_vec_size)
    print("X_P.get_shape")
    print(X_P.get_shape)
    XT_P = tf.transpose(X_P, [1, 0, 2])  # permute time_step_size and batch_size
    print("XT_P.get_shape")
    print(XT_P.get_shape)
    # XT shape: (time_step_size, batch_size, input_vec_size)
    XR_P = tf.reshape(XT_P, [-1, input_vec_size])
    print("XR_P.get_shape")
    print(XR_P.get_shape)
    # XR shape: (time_step_size * batch_size, input_vec_size)
    X_split_P = tf.split(XR_P, time_step_size, 0) # split them to time_step_size (28 arrays)
    XT_H = tf.transpose(X_H, [1, 0, 2])  # permute time_step_size and batch_size
    XR_H = tf.reshape(XT_H, [-1, input_vec_size])
    X_split_H = tf.split(XR_H, time_step_size, 0) # split them to time_step_size (28 arrays)

    with tf.variable_scope('premise'):
        lstm1 = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0, state_is_tuple=True)
        cell1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=dropout)
        outputs_P, _states_P =  tf.contrib.rnn.static_rnn(cell1, X_split_P, dtype=tf.float32)
        #outputs_P_batch_norm = tf.contrib.layers.batch_norm(outputs_P, center=True, scale=True, is_training=phase, scope='bn')

    with tf.variable_scope('hypothesis'):
        lstm2 = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0,  state_is_tuple=True)
        cell2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=dropout)
        outputs_H, _states_H =  tf.contrib.rnn.static_rnn(cell2, X_split_H, dtype=tf.float32)
        #outputs_H_batch_norm = tf.contrib.layers.batch_norm(outputs_H, center=True, scale=True, is_training=phase, scope='bn')

    # Linear activation
    # Get the last output
    #l0 = tf.concat([outputs_P_batch_norm[-1], outputs_H_batch_norm[-1]] ,1 )
    l0 = tf.concat([outputs_P[-1], outputs_H[-1]] ,1 )
    l0_drop = tf.nn.dropout(l0, dropout)
    #print("l0.get_shape()", l0.get_shape())

    W_1 = tf.Variable(tf.truncated_normal([lstm_size*2, lstm_size*2], stddev=1.0 / math.sqrt(lstm_size*2)))
    B_1 = tf.Variable(tf.zeros([lstm_size*2]))
    W_2 = tf.Variable(tf.truncated_normal([lstm_size*2, lstm_size*2], stddev=1.0 / math.sqrt(lstm_size*2)))
    B_2 = tf.Variable(tf.zeros([lstm_size*2]))
    W_3 = tf.Variable(tf.truncated_normal([lstm_size*2, lstm_size*2], stddev=1.0 / math.sqrt(lstm_size*2)))
    B_3 = tf.Variable(tf.zeros([lstm_size*2]))
    W_4 = tf.Variable(tf.truncated_normal([lstm_size*2, 3], stddev=1.0 / math.sqrt(3)))
    B_4 = tf.Variable(tf.zeros([3]))

    l1 = tf.nn.relu(tf.matmul(l0_drop, W_1) + B_1)
    l1_drop = tf.nn.dropout(l1, dropout)
    #l1_batch_norm = tf.contrib.layers.batch_norm(l1_drop, center=True, scale=True, is_training=phase, scope='bn')

    l2 = tf.nn.relu(tf.matmul(l1_drop, W_2) + B_2)
    l2_drop = tf.nn.dropout(l2, dropout)
    #l2_batch_norm = tf.contrib.layers.batch_norm(l2_drop, center=True, scale=True, is_training=phase, scope='bn')

    l3 = tf.nn.relu(tf.matmul(l2_drop, W_3) + B_3)
    l3_drop = tf.nn.dropout(l3, dropout)
    #l3_batch_norm = tf.contrib.layers.batch_norm(l3_drop, center=True, scale=True, is_training=phase, scope='bn')

    l4 = tf.matmul(l3_drop, W_4) + B_4
    regularizer = tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2) + tf.nn.l2_loss(W_3)
    print("regularizer.get_shape() :", regularizer.get_shape())
    return l4, regularizer


In [8]:
XP = tf.placeholder("int32", [None, 42])
XH = tf.placeholder("int32", [None, 42])
Y = tf.placeholder("int32", [None, 3])
dropout = tf.placeholder(tf.float32)
phase = tf.placeholder(tf.bool, name='phase')
beta = 4e-6

import tensorflow as tf
inputsH = tf.nn.embedding_lookup(embedding_matrix, XP)
inputsP = tf.nn.embedding_lookup(embedding_matrix, XH)

X1 = inputsH
X_P = tf.to_float(X1, name='ToFloat')

X2 = inputsP
X_H = tf.to_float(X2, name='ToFloat')

py_x, regularizer = model(X_P, X_H, lstm_size, dropout)
#py_x = model(X_P, X_H, lstm_size, dropout)

X_P.get_shape
<bound method Tensor.get_shape of <tf.Tensor 'ToFloat:0' shape=(?, 42, 300) dtype=float32>>
XT_P.get_shape
<bound method Tensor.get_shape of <tf.Tensor 'transpose:0' shape=(42, ?, 300) dtype=float32>>
XR_P.get_shape
<bound method Tensor.get_shape of <tf.Tensor 'Reshape:0' shape=(?, 300) dtype=float32>>
regularizer.get_shape() : ()


In [9]:
with tf.name_scope('cost'):
    #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) )
    cost1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y))
    print("cost1.get_shape() :", cost1.get_shape())
    print("regularizer.get_shape() :", regularizer.get_shape())
    cost = (cost1 + beta * regularizer)
    print("cost.get_shape() :", cost.get_shape())
    
#train_op = tf.train.RMSPropOptimizer(0.002, 0.9).minimize(cost)
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
#train_op = tf.train.AdagradOptimizer(0.001).minimize(cost)
predict_op = tf.argmax(py_x, 1)

cost1.get_shape() : ()
regularizer.get_shape() : ()
cost.get_shape() : ()


In [10]:
correct_pred = tf.equal(tf.argmax(Y, 1), tf.argmax(py_x, 1)) # Count correct predictions
with tf.name_scope('accuracy'):
    acc_op = tf.reduce_mean(tf.cast(correct_pred, "float")) # Cast boolean to float to average

In [11]:
# trXP = trXP[:2000]
# trXH = trXH[:2000]
# trY = trY[:2000]
# teXP = teXP[:500]
# teXH = teXH[:500]
# teY = teY[:500]
# teXP1 = teXP1[:500]
# teXH1 = teXH1[:500]
# teY1 = teY1[:500]

In [12]:
print(trXP.shape, trXH.shape, trY.shape, teXP.shape, teXH.shape, teY.shape)
print(teXP1.shape, teXH1.shape, teY1.shape)

(549367, 42) (549367, 42) (549367, 3) (9842, 42) (9842, 42) (9842, 3)
(9824, 42) (9824, 42) (9824, 3)


In [13]:
batch_size = 128*2*2
dropout_1 = 0.8

In [14]:
def eval(XP1, XH1, Y1, batch_size1):
    accuracy = 0.0
    loss = 0.0
    iter_batch = 0.0
    for start, end in zip(range(0, len(XP1), batch_size1), range(batch_size1, len(XP1)+1, batch_size1)):
        feed_dict = {XP: XP1[start:end], XH: XH1[start:end], Y: Y1[start:end], dropout:1, phase:0}
        accuracy_batch, loss_batch = sess.run([acc_op, cost], feed_dict=feed_dict)
        accuracy = accuracy + accuracy_batch
        loss = loss + loss_batch
        iter_batch = iter_batch + 1
    accuracy = float(accuracy) / iter_batch
    loss = float(loss) / iter_batch
    return accuracy, loss

In [None]:
import time
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    time_init = time.time()
    localtime1 = time.asctime( time.localtime(time.time()))
    localtime1 = localtime1.replace(" ", "_")
    localtime1 = localtime1.replace(":", "_")
    fo = open("results_"+str(localtime1)+".csv", "a", 0)
    fo.write("epoch,train_loss,val_loss,test_loss,train_acc,val_acc,test_acc,epoch_time\n")
    for i in range(45):
        time_epoch_start = time.time()
        print("epoch" + str(i) + "\n")
        for start, end in zip(range(0, len(trXP), batch_size), range(batch_size, len(trXP)+1, batch_size)):
            #if start%32768 == 0 and start > 32767:
                #print("train", str(i), str(start), str(end))
            feed_dict = {XP: trXP[start:end], XH: trXH[start:end], Y: trY[start:end], dropout:dropout_1, phase:1}
            sess.run(train_op, feed_dict=feed_dict)
            #_, costA, costB, costC = sess.run([train_op, cost1, regularizer, cost], feed_dict=feed_dict)
            #print(costA, costB, costC)
            #print("\n")
        print("\n")
        train_end = time.time()
        accuracy_train, loss_train = eval(trXP, trXH, trY, batch_size)
        print("epoch: ", i, "training:", loss_train, accuracy_train)
        train_eval_end = time.time()
        accuracy_val, loss_val = eval(teXP, teXH, teY, batch_size)
        print("epoch: ", i, "validation:", loss_val, accuracy_val)
        val_eval_end = time.time()
        accuracy_test, loss_test = eval(teXP1, teXH1, teY1, batch_size)
        print("epoch: ", i, "test:", loss_test, accuracy_test)
        test_eval_end = time.time()
        train_time = train_end - time_epoch_start
        train_eval_time = train_eval_end - train_end
        val_eval_time = val_eval_end - train_eval_end
        test_eval_time = test_eval_end - val_eval_end
        total_time = test_eval_end - time_epoch_start
        print("epoch: ", i, train_time, train_eval_time, val_eval_time, test_eval_time, total_time)
        print("\n")
        line = str(i) + "," + str(loss_train) + "," + str(loss_val) + "," + str(loss_test) + "," + str(accuracy_train) + "," + str(accuracy_val) + "," + str(accuracy_test) + "," + str(total_time) + "\n"
        fo.write(line);
    all_epoch_end = time.time()
    all_epoch_time = all_epoch_end - time_init
    print("all time taken for epochs :", all_epoch_time)
    fo.close()

epoch0



epoch:  0 training: 0.729360590731 0.683081783465
epoch:  0 validation: 0.7262852537 0.690789473684
epoch:  0 test: 0.721689983418 0.683490953947
epoch:  0 328.715721846 113.906031132 2.01069092751 2.01269412041 446.645138025


epoch1



epoch:  1 training: 0.626383134011 0.739817149604
epoch:  1 validation: 0.634005835182 0.736636513158
epoch:  1 test: 0.627824253158 0.742084703947
epoch:  1 327.665489912 113.44345808 2.01019406319 2.01326489449 445.13240695


epoch2



epoch:  2 training: 0.569779799473 0.766942266208
epoch:  2 validation: 0.581414285459 0.766344572368
epoch:  2 test: 0.579275099855 0.768400493421
epoch:  2 327.767707109 113.47705698 2.01182389259 2.0151951313 445.271783113


epoch3



epoch:  3 training: 0.555964714305 0.776523874767
epoch:  3 validation: 0.575789112794 0.770970394737
epoch:  3 test: 0.574106828163 0.768503289474
epoch:  3 327.756392956 113.476868153 2.01390385628 2.01338601112 445.260550976


epoch4



epoch:  4 training: 0.517676199383 0