# RNN embedding
This notebook focus on the application of LSTM to embed the sentence and measure the similarity between question pairs
<br>
Reference: ***Siamese Recurrent Architectures for Learning Sentence Similarity***
<br>http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf

In [1]:
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities
import re
import pickle
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow import nn
from sys import getsizeof

In [2]:
w2v_model = pickle.load(open("w2v_model.dat", "rb"))

# Load and preprocess

In [232]:
# # clean the sentence
# def txt_clean(sentence):
#     s = re.sub("[^a-zA-Z0-9]", " ", str(sentence))
#     s_list = s.lower().split()
#     return s_list

# train = pd.read_csv("../data/train.csv")
# test = pd.read_csv("../data/test.csv")

# # transform to get split sentence
# train_q1 = train.apply(lambda x: txt_clean(x["question1"]),axis = 1).tolist()
# train_q2 = train.apply(lambda x: txt_clean(x["question2"]),axis = 1).tolist()
# test_q1 = test.apply(lambda x: txt_clean(x["question1"]),axis = 1).tolist()
# test_q2 = test.apply(lambda x: txt_clean(x["question2"]),axis = 1).tolist()
# y = train.is_duplicate.tolist()

# pickle.dump(train_q1, open("../data/train_q1.dat", "wb"))
# pickle.dump(train_q2, open("../data/train_q2.dat", "wb"))
# pickle.dump(test_q1, open("../data/test_q1.dat", "wb"))
# pickle.dump(test_q2, open("../data/test_q2.dat", "wb"))
# pickle.dump(y, open("../data/y.dat", "wb"))

In [3]:
train_q1 = pickle.load(open("../data/train_q1.dat", "rb"))[0:5000]
train_q2 = pickle.load(open("../data/train_q2.dat", "rb"))[0:5000]
# test_q1 = pickle.load(open("../data/test_q1.dat", "rb"))[0:1000]
# test_q1 = pickle.load(open("../data/test_q1.dat", "rb"))[0:1000]

In [4]:
target = pickle.load(open("../data/y.dat", "rb"))[0:5000]

target = np.array(target).reshape(-1,1)

target_inverse = 1-target

target = np.concatenate([target, target_inverse], 1)

# Build a LSTM for embedding

1.Test the word2vec model

In [87]:
# trainsform to word vectors
for w in train_q1[1]:
    print(w2v_model.wv[w][0:5])

[ 0.28979221 -0.18480104 -0.32667643 -0.17523807 -0.11447719]
[ 0.2155818   0.08308727 -0.12565269  0.11223465 -0.3529726 ]
[ 0.27783838 -0.04633152 -0.26982322  0.00828765 -0.09711678]
[ 0.10206932 -0.0277025  -0.05544248 -0.09461203 -0.0744462 ]
[ 0.2376662   0.01902849 -0.14263637 -0.19002569 -0.12894249]
[ 0.0527588   0.00629033 -0.09461353 -0.06562302 -0.04679567]
[ 0.05161272  0.0124112  -0.08588935 -0.06794626 -0.04058785]
[ 0.09765382 -0.15506659 -0.23301676 -0.07342669 -0.04687157]
[ 0.03986795  0.01472408 -0.0949952  -0.06355751 -0.04129621]
[ 0.04446325  0.00925547 -0.09746711 -0.07145177 -0.04257616]


The model is OK to transform the word to vectors

2.padding the list to the same length

In [88]:
len(train_q1)

1000

In [89]:
train_q1[0]

['what',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india']

In [90]:
train_q1[2]

['how',
 'can',
 'i',
 'increase',
 'the',
 'speed',
 'of',
 'my',
 'internet',
 'connection',
 'while',
 'using',
 'a',
 'vpn']

In [91]:
testdata = train_q1[0:10]

In [5]:
# functions for padding list
def getMaxLength(sentences):
    "setences: list of list of words"
    maxLen = -1
    for s in sentences:
        if len(s) > maxLen:
            maxLen = len(s)
    return maxLen

def padding(sentences, maxLen):
    padContent = "thisispadding"
    sentencesPad = []
    for s in sentences:
        s.extend([padContent]*(maxLen-len(s)))
        sentencesPad.append(s)
    return sentencesPad

def transToVec(sentences,maxLen):
    transSentence = []
    for s in sentences:
        sen = []
        for j in range(maxLen):
            if s[j] == "thisispadding":
                sen.append([0] * n_input)
            else:
                sen.append(w2v_model.wv[s[j]].tolist())
        transSentence.append(sen)
    return np.array(transSentence) 

In [108]:
# for loop to padd
maxLen = getMaxLength(testdata)

testdatapad = padding(testdata, maxLen)

transdata = transToVec(testdatapad,maxLen)

In [109]:
transdata.shape

(10, 17, 300)

Using traditional RNN cell first

1.Model graph

In [21]:
# reset all parameters
tf.reset_default_graph()

In [15]:
# model parameters
n_hidden = 64
n_input = 300
n_classes = 2

learning_rate = 0.01
lambda_loss_amount = 0.001
batch_size = 128
training_iters = 20*batch_size
display_iter = 100

In [22]:
# Graph input
x1 = tf.placeholder(tf.float64, [None, None, n_input])
x2 = tf.placeholder(tf.float64, [None, None, n_input])
y_ = tf.placeholder(tf.float64, [None, n_classes])

# dense layer variable
W = tf.Variable(tf.random_normal([2*n_hidden,n_classes],dtype=tf.float64))
b = tf.Variable(tf.random_normal([n_classes],dtype=tf.float64))

In [24]:
# RNN embedding
def RNN(x1, x2, weights, bias):
    
    with tf.variable_scope('embedding_1'):
        rnn_cell_1 = rnn.BasicRNNCell(n_hidden)
        outputs_1, states_1 = nn.dynamic_rnn(rnn_cell_1, x1, dtype=tf.float64)
        
    with tf.variable_scope('embedding_2'):
        rnn_cell_2 = rnn.BasicRNNCell(n_hidden)
        outputs_2, states_2 = nn.dynamic_rnn(rnn_cell_2, x2, dtype=tf.float64)
    
    # concat two embedding into one
    sentence_embedding = tf.concat([states_1, states_2], axis=1)
        
    return tf.matmul(sentence_embedding,weights) + bias

In [25]:
# prediction
y = RNN(x1, x2, W, b)

# loss
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

# SGD
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)

In [29]:
# Evaluate model
correct_pred = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float64))

In [27]:
# batch generating
N = target.shape[0]
batch_index = np.arange(0,N)
batch_start = np.append(np.arange(0,N,batch_size),N)

model test with 5000 training queries

In [31]:
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    step = 1
    while step < 10:
        
        # shuffle data
        np.random.shuffle(batch_index)
        
        for i in range(len(batch_start)-1):
            
            # Batch processing 
            batch_x_1= train_q1[batch_start[i]:batch_start[i+1]]
            maxLen_1 = getMaxLength(batch_x_1)
            batch_x_pad_1 = padding(batch_x_1, maxLen_1)
            batch_x_trans_1 = transToVec(batch_x_pad_1,maxLen_1)
            batch_x_2= train_q2[batch_start[i]:batch_start[i+1]]
            maxLen_2 = getMaxLength(batch_x_2)
            batch_x_pad_2 = padding(batch_x_2, maxLen_2)
            batch_x_trans_2 = transToVec(batch_x_pad_2,maxLen_2)
            batch_y = target[batch_start[i]:batch_start[i+1],:]

            # train a hidden layer
            sess.run(train_step, feed_dict={x1:batch_x_trans_1,
                                            x2:batch_x_trans_2,
                                            y_: batch_y})

        # print loss and accuracy
        print("Train Cross-Entropy Loss", sess.run(cross_entropy, feed_dict={x1:batch_x_trans_1,
                                            x2:batch_x_trans_2,
                                            y_: batch_y}))
        
        print("Train Accuracy:", sess.run(accuracy, feed_dict={x1:batch_x_trans_1,
                                                                x2:batch_x_trans_2,
                                                                y_: batch_y}))
        # update batch
        step += 1

Train Accuracy: 0.75
Train Accuracy: 0.75
Train Accuracy: 0.875
Train Accuracy: 1.0
Train Accuracy: 1.0
Train Accuracy: 1.0
Train Accuracy: 1.0
Train Accuracy: 1.0
Train Accuracy: 1.0
