This notebook focus on the application of LSTM to embed the sentence and measure the similarity between question pairs
<br>
Reference: ***Siamese Recurrent Architectures for Learning Sentence Similarity***
<br>http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf

In [1]:
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
import re
import _pickle as pickle 
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow import nn
from sys import getsizeof
import time
from tflearn.data_utils import pad_sequences

Load data

In [2]:
w2v_model = pickle.load(open("w2v_model.dat", "rb"))

w2v_table = {}

for i in w2v_model.wv.vocab.keys():
    w2v_table[i] = w2v_model[i].tolist()

w2v_table["1992.0"] = [0]*300

In [4]:
pickle.dump(w2v_table, open("w2v_table.dat", "wb"))

In [14]:
train_q1 = np.array(pickle.load(open("../data/train_q1.dat", "rb"))[0:10000])
train_q2 = np.array(pickle.load(open("../data/train_q2.dat", "rb"))[0:10000])

# test_q1 = np.array(pickle.load(open("../data/train_q1.dat", "rb"))[0:50000])
# test_q2 = np.array(pickle.load(open("../data/train_q2.dat", "rb"))[0:50000])

target = pickle.load(open("../data/y.dat", "rb"))[0:10000]
target = np.array(target).reshape(-1,1)
target_inverse = 1-target
target = np.concatenate([target, target_inverse], 1)

LSTM

In [15]:
# model parameters
n_hidden = 64
n_input = 300
n_classes = 2
n_dense1 = 256
# n_dense2 = 256

learning_rate = 0.01
lambda_loss_amount = 0.001
batch = 512
training_iters = 20*batch
display_iter = 100

In [16]:
# reset all parameters
tf.reset_default_graph()

In [17]:
# Graph input
x1 = tf.placeholder(tf.float64, [None, None, n_input])
x2 = tf.placeholder(tf.float64, [None, None, n_input])
y_ = tf.placeholder(tf.float64, [None, n_classes])
sentence_1_length = tf.placeholder(tf.int32, [None])
sentence_2_length = tf.placeholder(tf.int32, [None])

# dense layer variable
W = tf.Variable(tf.random_normal([2*n_hidden,n_classes],dtype=tf.float64), name="W1")
b = tf.Variable(tf.random_normal([n_classes],dtype=tf.float64), name="b1")

In [18]:
def last_relevant(output, length):
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    out_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, out_size])
    relevant = tf.gather(flat, index)
    return relevant

In [19]:
# RNN embedding
with tf.variable_scope('embedding_1'):
    rnn_cell_1 = rnn.GRUCell(n_hidden)
    outputs_1, _ = nn.dynamic_rnn(rnn_cell_1, x1, sequence_length = sentence_1_length, dtype=tf.float64)

with tf.variable_scope('embedding_2'):
    rnn_cell_2 = rnn.GRUCell(n_hidden)
    outputs_2, _ = nn.dynamic_rnn(rnn_cell_2, x2, sequence_length = sentence_2_length, dtype=tf.float64)

# get similarity measure
last1 = last_relevant(outputs_1, sentence_1_length)
last2 = last_relevant(outputs_2, sentence_2_length)

# diff = tf.subtract(last1, last2)
# l1_distance = tf.abs(diff)

# summed_l1_distance = tf.reduce_sum(l1_distance, axis=1,
#                                    keep_dims=True)

# positive_class_probs = tf.exp(-summed_l1_distance)
# negative_class_probs = 1 - positive_class_probs

sentence_embedding = tf.concat([last1, last2], axis=1)
y = tf.matmul(sentence_embedding,W) + b
# y = tf.concat([negative_class_probs, positive_class_probs], 1)

In [20]:
# loss
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

# SGD
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

# Evaluate model
correct_pred = tf.equal(tf.argmax(y_,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float64))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


rum model

In [21]:
# test
batch_test_1 = train_q1[0:1000]
batch_test_len_1 = [len(x) for x in batch_test_1]
batch_test_pad_1 = pad_sequences(batch_test_1, value = 1992, dtype = "object").astype("str")
batch_test_trans_1 = [[w2v_table[w] for w in s] for s in batch_test_pad_1]

batch_test_2 = train_q2[0:1000]
batch_test_len_2 = [len(x) for x in batch_test_2]
batch_test_pad_2 = pad_sequences(batch_test_2, value = 1992, dtype = "object").astype("str")
batch_test_trans_2 = [[w2v_table[w] for w in s] for s in batch_test_pad_2]

batch_test_y = target[0:1000]

In [22]:
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    step = 1
    # batch generating
    N = target.shape[0]
    batch_index = np.arange(0,N)
    batch_start = np.append(np.arange(0,N,batch),N)
    print("------------------------------------")
    while step <= 100:
        
        # shuffle data
        np.random.shuffle(batch_index)
        
        for i in range(len(batch_start)-1):
            # Batch processing
            batch_x_1 = train_q1[batch_index[batch_start[i]:batch_start[i+1]]].tolist()
            batch_x_len_1 = [len(x) for x in batch_x_1]
            batch_x_pad_1 = pad_sequences(batch_x_1, value = 1992, dtype = "object").astype("str")
            batch_x_trans_1 = [[w2v_table[w] for w in s] for s in batch_x_pad_1]
            
            batch_x_2 = train_q2[batch_index[batch_start[i]:batch_start[i+1]]].tolist()
            batch_x_len_2 = [len(x) for x in batch_x_2]
            batch_x_pad_2 = pad_sequences(batch_x_2, value = 1992, dtype = "object").astype("str")
            batch_x_trans_2 = [[w2v_table[w] for w in s] for s in batch_x_pad_2]
            
            batch_y = target[batch_index[batch_start[i]:batch_start[i+1]],:]
            
            # train a hidden layer
            sess.run(train_step, feed_dict={x1:batch_x_trans_1,
                                            x2:batch_x_trans_2,
                                            sentence_1_length:batch_x_len_1,
                                            sentence_2_length:batch_x_len_2,
                                            y_: batch_y})
        # print loss and accuracy
        if step % 1 == 0:
            print("Train Cross-Entropy Loss", sess.run(cross_entropy, feed_dict={x1:batch_test_trans_1,
                                                x2:batch_test_trans_2,
                                                sentence_1_length:batch_test_len_1,
                                                sentence_2_length:batch_test_len_2,
                                                y_: batch_test_y}))        
            print("Train Accuracy:", sess.run(accuracy, feed_dict={x1:batch_test_trans_1,
                                                x2:batch_test_trans_2,
                                                sentence_1_length:batch_test_len_1,
                                                sentence_2_length:batch_test_len_1,
                                                y_: batch_test_y})) 
#             print(sess.run(last1,feed_dict={x1:batch_test_trans_1,
#                                                 x2:batch_test_trans_2,
#                                                 sentence_1_length:batch_test_len_1,
#                                                 sentence_2_length:batch_test_len_1,
#                                                 y_: batch_test_y})) 
            print("------------------------------------")
        # update batch
        step += 1

------------------------------------
Train Cross-Entropy Loss 0.80719291013
Train Accuracy: 0.594
------------------------------------
Train Cross-Entropy Loss 0.768096237247
Train Accuracy: 0.607
------------------------------------
Train Cross-Entropy Loss 0.738199390254
Train Accuracy: 0.621
------------------------------------
Train Cross-Entropy Loss 0.714342500671
Train Accuracy: 0.634
------------------------------------
Train Cross-Entropy Loss 0.702135510837
Train Accuracy: 0.632
------------------------------------
Train Cross-Entropy Loss 0.683664856809
Train Accuracy: 0.643
------------------------------------
Train Cross-Entropy Loss 0.674908399836
Train Accuracy: 0.647
------------------------------------
Train Cross-Entropy Loss 0.662560507469
Train Accuracy: 0.648
------------------------------------
Train Cross-Entropy Loss 0.6564954168
Train Accuracy: 0.648
------------------------------------
Train Cross-Entropy Loss 0.647746679053
Train Accuracy: 0.652
-------------