In [1]:
import tensorflow as tf
import numpy as np
import re
import pandas as pd
import nltk
from nltk import tokenize

### Getting input data ready

In [2]:
# Read the tab-separated question-pair file into a DataFrame.
filePath = "/home/ubuntu/quora_duplicate_questions.tsv"
df = pd.read_csv(filePath, delimiter="\t")

# For both question columns: replace missing values with an empty string,
# then lower-case every question.
for questionColumn in ("question1", "question2"):
    df[questionColumn] = df[questionColumn].fillna("").apply(str.lower)

Finding unique words in dataset to create vocabulary

In [3]:
tokzr = nltk.tokenize.TreebankWordTokenizer()

# Tokenize each distinct question1 string once, then flatten the per-question
# token lists into a single list of tokens.
uniqueQuestions = df.question1.unique()
tokenizedQns = list(map(tokzr.tokenize, uniqueQuestions))
words = []
for tokWords in tokenizedQns:
    words.extend(tokWords)

# Same for the distinct question2 strings; their tokens are appended to `words`.
words2 = [
    word
    for sentence in df.question2.unique()
    for word in tokzr.tokenize(sentence)
]
words.extend(words2)

Adding PAD as a filler token for normalizing sentence length and UNK for unknown tokens

In [4]:
# Build the token -> integer id mapping.
# Ids 0 and 1 are reserved for the PAD and UNK markers, so real tokens are
# numbered from 2 upwards.
# The tokens are sorted before numbering: iterating a set directly gives no
# guaranteed order, so the ids (and any embedding matrix saved against them)
# could differ from one run to the next.
words = set(words)
vocabulary = {word: index for index, word in enumerate(sorted(words), 2)}
vocabulary['PAD'] = 0
vocabulary['UNK'] = 1
# Single formatted string so the message prints the same way whether `print`
# is a statement or a function.
print("Vocabulary Size including PAD and UNK: {0}".format(len(vocabulary)))

('Vocabulary Size including PAD and UNK: ', 122188)


Each question is represented as a list of vocabulary indices

In [5]:
def tokenizeAndIndex(sentence):
    """Tokenize `sentence` with `tokzr` and map each token to its vocabulary id.

    Tokens that are not keys of `vocabulary` are mapped to the id stored
    under 'UNK'. Returns a list with one id per token, in token order.
    """
    indices = []
    for token in tokzr.tokenize(sentence):
        if token in vocabulary:
            indices.append(vocabulary[token])
        else:
            indices.append(vocabulary['UNK'])
    return indices
# Add one column of vocabulary-id lists for each question column.
for sourceColumn, indexedColumn in (("question1", "Q1Indexed"),
                                    ("question2", "Q2Indexed")):
    df[indexedColumn] = df[sourceColumn].apply(tokenizeAndIndex)

Keep only question pairs in which both questions have at most 50 tokens, then pad shorter questions to exactly 50

In [6]:
seqLength = 50

# Keep a row only if both of its indexed questions have at most seqLength ids.
q1FitsLimit = df.Q1Indexed.apply(len) <= seqLength
q2FitsLimit = df.Q2Indexed.apply(len) <= seqLength
df = df[q1FitsLimit & q2FitsLimit]

def normalizeSequenceLength(sequence):
    """Right-pad `sequence` with the PAD id until it has `seqLength` entries.

    The list is extended in place and the same list object is returned.
    Sequences that already have `seqLength` or more entries are returned
    unchanged.
    """
    missing = seqLength - len(sequence)
    if missing > 0:
        sequence.extend([vocabulary['PAD']] * missing)
    return sequence
# Pad both indexed-question columns to the common length.
for indexedColumn in ("Q1Indexed", "Q2Indexed"):
    df[indexedColumn] = df[indexedColumn].apply(normalizeSequenceLength)

### Building the network

Creating the sentence embedding

In [7]:
vocab_size = len(vocabulary)
embedding_size = 100

# Embedding table shared by both questions: one row of embedding_size values
# per vocabulary id, initialised uniformly between -1.0 and 1.0.
W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),name="W")

# Each question is fed as a [batch, seqLength] matrix of vocabulary ids. Its
# sentence vector is the sum of the looked-up embeddings along axis 1.
# NOTE(review): PAD ids are looked up in W and summed like any other token, so
# padding contributes a trainable vector to every short question - confirm
# this is intended.
q1Input = tf.placeholder(tf.int32, [None, seqLength], name="q1Input")
q1Embeddings = tf.nn.embedding_lookup(W, q1Input)
q1Embeddings = tf.reduce_sum(q1Embeddings, 1)

# The second placeholder gets its own name. It was previously also created as
# "q1Input", which TensorFlow silently renames to "q1Input_1", so it could not
# be located by the expected name in an exported graph.
q2Input = tf.placeholder(tf.int32, [None, seqLength], name="q2Input")
q2Embeddings = tf.nn.embedding_lookup(W, q2Input)
q2Embeddings = tf.reduce_sum(q2Embeddings, 1)

# Both sentence vectors side by side: [batch, 2 * embedding_size].
sentenceEmbedding = tf.concat([q1Embeddings,q2Embeddings],axis=1,name='sentenceEmbedding')

Dense layers and output

In [8]:
# All hidden layers share the width of the concatenated sentence embedding.
hiddenUnits = embedding_size*2

# Three stacked tanh layers.
dense1 = tf.layers.dense(inputs=sentenceEmbedding,
                         units=hiddenUnits,
                         activation=tf.nn.tanh,
                         name='dense1')
dense2 = tf.layers.dense(inputs=dense1,
                         units=hiddenUnits,
                         activation=tf.nn.tanh,
                         name='dense2')
dense3 = tf.layers.dense(inputs=dense2,
                         units=hiddenUnits,
                         activation=tf.nn.tanh,
                         name='dense3')

# Two-way linear output; the predicted class is the index of the larger
# softmax probability.
logits = tf.layers.dense(inputs=dense3, units=2, name='logits')
classProbabilities = tf.nn.softmax(logits=logits, dim=-1, name='softmax')
predictions = tf.argmax(input=classProbabilities, axis=1, name='output')

Loss and gradient updates

In [9]:
num_classes = 2

# One-hot encoded target class for every example in the batch.
labels = tf.placeholder(tf.int32, [None, num_classes], name='labels')

# Softmax cross-entropy between the one-hot labels and the network logits.
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)

# Adam update step on `loss`.
train_op = tf.contrib.layers.optimize_loss(
    loss=loss,
    global_step=tf.contrib.framework.get_global_step(),
    learning_rate=0.001,
    optimizer="Adam")

# Accuracy: fraction of examples whose predicted class matches the label class.
trueClasses = tf.argmax(labels, 1)
correct_prediction = tf.equal(predictions, trueClasses)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

Split the data into training and test sets (30% of each class is held out for testing)

In [10]:
# Fixed seed so the held-out rows are the same on every run; without it each
# run draws a different test set.
splitSeed = 42

positiveSamples = df[df.is_duplicate==1]
negativeSamples = df[df.is_duplicate==0]

#Testing data: 30% of each class, sampled separately
positiveTest = positiveSamples.sample(frac=0.3, random_state=splitSeed)
negativeTest = negativeSamples.sample(frac=0.3, random_state=splitSeed)
# pd.concat instead of DataFrame.append, which is deprecated in newer pandas.
testData = pd.concat([positiveTest, negativeTest])
print("Number of test samples: {0}".format(len(testData)))
#Training data: every row whose id was not drawn into the test set
trainData = df[~df.id.isin(testData.id)]
print("Number of train samples: {0}".format(len(trainData)))

Number of test samples: 120852
Number of train samples: 281986


In [11]:
with tf.Session() as session:

    # Fetching train_op performs one optimizer step; its fetched value is what
    # the loop below accumulates as the training loss.
    fetches = {'eval_op':train_op,'accuracy':accuracy}
    # Evaluation only reads loss and accuracy, so it never updates weights.
    testFetches = {'loss':loss, 'accuracy':accuracy}

    print("Starting...")
    session.run(tf.global_variables_initializer())

    noEpisodes = 10   # full passes over the training data
    batchSize = 1000
    # Number of full batches per pass. Floor division keeps this an integer on
    # every Python version; any final partial batch is skipped. max() avoids a
    # zero batch count (and the division by zero below) on very small data.
    noEpochs = max(1, len(trainData) // batchSize)

    # Row i of this matrix is the one-hot vector for class i. Indexing it with
    # the label array builds the labels in NumPy. Calling tf.one_hot inside the
    # loop would add new ops to the graph on every batch.
    # Assumes is_duplicate holds integer class ids in [0, num_classes).
    oneHotRows = np.eye(num_classes, dtype=np.int32)

    testLabels = oneHotRows[testData.is_duplicate.values]
    testQ1Indices = np.array(list(testData.Q1Indexed.values),dtype=np.int32)
    testQ2Indices = np.array(list(testData.Q2Indexed.values),dtype=np.int32)

    # The test set is evaluated in two halves and the two results are averaged.
    lTest = len(testQ1Indices) // 2
    testFeed1 = {q1Input:testQ1Indices[:lTest],q2Input:testQ2Indices[:lTest],labels:testLabels[:lTest]}
    testFeed2 = {q1Input:testQ1Indices[lTest:],q2Input:testQ2Indices[lTest:],labels:testLabels[lTest:]}

    for episode in range(noEpisodes):
        # Fresh shuffle of the training rows for every pass.
        episodeData = trainData.iloc[np.random.permutation(len(trainData))]

        startIdx = 0
        episodeLoss = 0
        for epoch in range(noEpochs):
            batch = episodeData.iloc[startIdx:startIdx+batchSize]
            startIdx += batchSize

            oneHotLabels = oneHotRows[batch.is_duplicate.values]
            q1Indices = np.array(list(batch.Q1Indexed.values),dtype=np.int32)
            q2Indices = np.array(list(batch.Q2Indexed.values),dtype=np.int32)
            feed_dict = {q1Input:q1Indices,q2Input:q2Indices,labels:oneHotLabels}

            trainMetrics = session.run(fetches,feed_dict)

            episodeLoss += trainMetrics['eval_op']

        # Mean of the per-batch losses for this pass.
        episodeLoss /= noEpochs

        print("Episode:  {0}".format(episode))
        print("\t Training Loss: {0}".format(episodeLoss))
        print("\n")

        testMetrics1 = session.run(testFetches,testFeed1)
        testMetrics2 = session.run(testFetches,testFeed2)

        testLoss = (testMetrics1['loss'] + testMetrics2['loss'])/2
        # Mean of the two half-set accuracies, expressed as a percentage.
        testAccuracy = 50*(testMetrics1['accuracy'] + testMetrics2['accuracy'])
        print("\t Test Loss: {0}".format(testLoss))
        print("\t Test accucary: {0}".format(testAccuracy))

Starting...
Episode:  0
	 Training Loss: 0.625947948665


	 Test Loss: 0.586109280586
	 Test accucary: 69.139111042
Episode:  1
	 Training Loss: 0.559817612595


	 Test Loss: 0.552389085293
	 Test accucary: 72.1576750278
Episode:  2
	 Training Loss: 0.523507411794


	 Test Loss: 0.531595468521
	 Test accucary: 74.0815162659
Episode:  3
	 Training Loss: 0.496339569631


	 Test Loss: 0.515952825546
	 Test accucary: 74.6930062771
Episode:  4
	 Training Loss: 0.471485107395


	 Test Loss: 0.520262002945
	 Test accucary: 75.0190317631
Episode:  5
	 Training Loss: 0.452938687441


	 Test Loss: 0.512930512428
	 Test accucary: 75.2689182758
Episode:  6
	 Training Loss: 0.434044698803


	 Test Loss: 0.501904070377
	 Test accucary: 75.8770942688
Episode:  7
	 Training Loss: 0.41747579021


	 Test Loss: 0.519470989704
	 Test accucary: 75.724029541
Episode:  8
	 Training Loss: 0.399334875608


	 Test Loss: 0.517299473286
	 Test accucary: 74.4414627552
Episode:  9
	 Training Loss: 0.388827730752




In [13]:
# Saver object used to write and restore checkpoints of the graph's variables.
saver = tf.train.Saver(
    name='sumModel_',
    restore_sequentially=True,
    reshape=False,
    sharded=False)

In [None]:
# NOTE(review): a Saver with the same arguments is also constructed in the
# preceding cell; it is rebuilt here so that this cell runs on its own.
saver = tf.train.Saver(name='sumModel_',restore_sequentially=True,reshape=False,sharded=False)
with tf.Session() as session:

    # Fetching train_op performs one optimizer step; its fetched value is what
    # the loop below accumulates as the training loss.
    fetches = {'eval_op':train_op,'accuracy':accuracy}
    # Evaluation only reads loss and accuracy, so it never updates weights.
    testFetches = {'loss':loss, 'accuracy':accuracy}

    print("Starting...")
    session.run(tf.global_variables_initializer())

    noEpisodes = 10   # full passes over the training data
    batchSize = 1000
    # Number of full batches per pass. Floor division keeps this an integer on
    # every Python version; any final partial batch is skipped. max() avoids a
    # zero batch count (and the division by zero below) on very small data.
    noEpochs = max(1, len(trainData) // batchSize)

    # Row i of this matrix is the one-hot vector for class i. Indexing it with
    # the label array builds the labels in NumPy. Calling tf.one_hot inside the
    # loop would add new ops to the graph on every batch.
    # Assumes is_duplicate holds integer class ids in [0, num_classes).
    oneHotRows = np.eye(num_classes, dtype=np.int32)

    testLabels = oneHotRows[testData.is_duplicate.values]
    testQ1Indices = np.array(list(testData.Q1Indexed.values),dtype=np.int32)
    testQ2Indices = np.array(list(testData.Q2Indexed.values),dtype=np.int32)

    # The test set is evaluated in two halves and the two results are averaged.
    lTest = len(testQ1Indices) // 2
    testFeed1 = {q1Input:testQ1Indices[:lTest],q2Input:testQ2Indices[:lTest],labels:testLabels[:lTest]}
    testFeed2 = {q1Input:testQ1Indices[lTest:],q2Input:testQ2Indices[lTest:],labels:testLabels[lTest:]}

    for episode in range(noEpisodes):
        # Fresh shuffle of the training rows for every pass.
        episodeData = trainData.iloc[np.random.permutation(len(trainData))]

        startIdx = 0
        episodeLoss = 0
        for epoch in range(noEpochs):
            batch = episodeData.iloc[startIdx:startIdx+batchSize]
            startIdx += batchSize

            oneHotLabels = oneHotRows[batch.is_duplicate.values]
            q1Indices = np.array(list(batch.Q1Indexed.values),dtype=np.int32)
            q2Indices = np.array(list(batch.Q2Indexed.values),dtype=np.int32)
            feed_dict = {q1Input:q1Indices,q2Input:q2Indices,labels:oneHotLabels}

            trainMetrics = session.run(fetches,feed_dict)

            episodeLoss += trainMetrics['eval_op']

        # Mean of the per-batch losses for this pass.
        episodeLoss /= noEpochs

        # One checkpoint per pass, with the pass number as the step suffix.
        # NOTE(review): save_path is used as a filename prefix and ends with
        # '/', so the checkpoint files presumably land inside that directory
        # with names beginning '-<episode>' - confirm the directory exists and
        # that this naming is what any restore code expects.
        saver.save(session,save_path='/home/ubuntu/QuestionPairs/SumModel/',
                   global_step=episode,write_meta_graph=True)

        print("Episode:  {0}".format(episode))
        print("\t Training Loss: {0}".format(episodeLoss))
        print("\n")

        testMetrics1 = session.run(testFetches,testFeed1)
        testMetrics2 = session.run(testFetches,testFeed2)

        testLoss = (testMetrics1['loss'] + testMetrics2['loss'])/2
        # Mean of the two half-set accuracies, expressed as a percentage.
        testAccuracy = 50*(testMetrics1['accuracy'] + testMetrics2['accuracy'])
        print("\t Test Loss: {0}".format(testLoss))
        print("\t Test accucary: {0}".format(testAccuracy))

Starting...
