In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy

### Getting input data ready

In [None]:
filePath = "data/quora_duplicate_questions.tsv"
df = pd.read_csv(filePath,delimiter="\t")
df.question1 = df.question1.fillna("")
df.question1 = df.question1.apply(str.lower)
df.question2 = df.question2.fillna("")
df.question2 = df.question2.apply(str.lower)

Finding unique words in dataset to create vocabulary

In [None]:
def tokenize(s,nlp):
    doc = nlp(s)
    tokSen = []
    for word in doc:
        tokSen.append(word.text)
    return tokSen

nlp = spacy.load('en')
uniqueQuestions = df.question1.unique()
tokenizedQns = [tokenize(unicode(sentence,'utf8'),nlp) for sentence in uniqueQuestions]
words = [word for tokWords in tokenizedQns for word in tokWords]

words2 = df.question2.unique()
words2 = [tokenize(unicode(sentence,'utf8'),nlp) for sentence in words2]
words2 = [word for tokWords in words2 for word in tokWords]
words.extend(words2)

Adding PAD as filler for normalizing sentence length and UNK for unkown tokens

In [None]:
words = set(words)
vocabulary = dict(zip(words,range(2,len(words)+2)))
vocabulary['PAD'] = 0
vocabulary['UNK'] = 1
print("Vocabulary Size including PAD and UNK: ",len(vocabulary))

Each question represented as list of index in the vocabulary

In [None]:
def loadWordVectors(filePath,vocab):
    txt = open('data/wiki.en.vec')
    wordVecs = np.zeros((len(vocab),300),dtype=float)
    for line in txt:
        splitData = line.split(" ")
        word = splitData[0]
        word = unicode(word,'utf8')
        if(word not in vocab):
            continue
        vector = splitData[1:len(splitData)-1]
        wordVecs[vocab[word]] = np.array(vector,dtype=float)
    return wordVecs
wordVecSize = 300
wordVecs = loadWordVectors('wiki/wiki.en.vec',vocabulary)

In [None]:
idx = 0 
for w in wordVecs:
    if(w is None):
        count += 1
        wordVecs[idx] = 2 * np.random.random_sample(wordVecSize) - 1

In [None]:
def tokenizeAndIndex(sentence):
    words = tokenize(unicode(sentence,'utf8'),nlp)
    retVal = [vocabulary[word] if word in vocabulary else vocabulary['UNK'] for word in words]
    return retVal
df['Q1Indexed'] = df.question1.apply(tokenizeAndIndex)
df['Q2Indexed'] = df.question2.apply(tokenizeAndIndex)

Threshold questions with total words <= 50

In [None]:
seqLength = 50
df = df[df.Q1Indexed.apply(len) <= seqLength]
df = df[df.Q2Indexed.apply(len) <= seqLength]

def normalizeSequenceLength(sequence):
    if(len(sequence) < seqLength):
        padding = [vocabulary['PAD'] for i in range(seqLength - len(sequence))]
        sequence.extend(padding)
    return sequence
df.Q1Indexed = df.Q1Indexed.apply(normalizeSequenceLength)
df.Q2Indexed = df.Q2Indexed.apply(normalizeSequenceLength)

In [None]:
positiveSamples = df[df.is_duplicate==1]
negativeSamples = df[df.is_duplicate==0]

#Testing data
positiveTest = positiveSamples.sample(frac=0.3)
negativeTest = negativeSamples.sample(frac=0.3)
testData = positiveTest.append(negativeTest)
print("Number of test samples: {0}".format(len(testData)))
#Training data
trainData = df[df.id.isin(testData.id) == False]
print("Number of train samples: {0}".format(len(trainData)))

positiveVal = positiveTest.sample(frac=0.5)
negativeVal = negativeTest.sample(frac=0.5)
valData = positiveVal.append(negativeVal)

positiveTest = positiveTest[positiveTest.id.isin(positiveVal.id) == False]
negativeTest = negativeTest[negativeTest.id.isin(negativeVal.id) == False]
testData = positiveTest.append(negativeTest)

totalLen = float(len(df))
print("Split ratio: {}:{}:{}".format(len(trainData) / totalLen, len(valData) / totalLen, len(testData) / totalLen))
print("Total Samples: {}:{}:{}".format(len(trainData), len(valData), len(testData)))
#print(float(len(valData)) / len(df))

Saving processed data to file

In [None]:
df.to_pickle('data/ProcessedData.pkl')
trainData.to_pickle("data/TrainData.pkl")
testData.to_pickle("data/TestData.pkl")
valData.to_pickle("data/ValData.pkl")
np.save('data/wordVecs.npy',wordVecs)

Loading processed data from file

In [3]:
df = pd.read_pickle('data/ProcessedData.pkl')
trainData = pd.read_pickle('data/TrainData.pkl')
testData = pd.read_pickle('data/TestData.pkl')
valData = pd.read_pickle('data/ValData.pkl')
wordVecs = np.load('data/wordVecs.npy')
wordVecSize = 300
seqLength = 50

### Building the network

Creating setence embedding

In [13]:
tf.reset_default_graph()

In [14]:
wordVecSize = 100
vocab_size = len(wordVecs)

with tf.variable_scope("Words") as scope:
    W = tf.Variable(wordVecs,name="W")
    #W = tf.Variable(tf.random_uniform([vocab_size, wordVecSize], -1.0, 1.0),name="W")
    
    q1Input = tf.placeholder(tf.int32, [None, seqLength], name="q1Input")
    q1Embeddings = tf.nn.embedding_lookup(W, q1Input)
    q1SeqLen = tf.placeholder(tf.int32, [None], name="q1SequenceLength")
    
    q2Input = tf.placeholder(tf.int32, [None, seqLength], name="q2Input")
    q2SeqLen = tf.placeholder(tf.int32, [None], name="q2SequenceLength")
    q2Embeddings = tf.nn.embedding_lookup(W, q2Input)

with tf.variable_scope("Sentence") as scope:
    cell = tf.contrib.rnn.BasicLSTMCell(wordVecSize, forget_bias=1.0, state_is_tuple=True)
    q1Rep,_ = tf.nn.dynamic_rnn(cell,q1Embeddings,dtype=tf.float64,swap_memory=True,sequence_length=q1SeqLen)
    q1Rep = tf.expand_dims(q1Rep,2)
    q1Rep = tf.nn.max_pool(q1Rep,[1,seqLength,1,1],strides=[1,1,1,1],padding='VALID',data_format='NHWC',name='AvgPool')
    q1Rep = tf.squeeze(q1Rep,axis=[1,2])
    #q1Rep = q1Rep[:,-1,:]
    scope.reuse_variables()
    q2Rep,_ = tf.nn.dynamic_rnn(cell,q2Embeddings,dtype=tf.float64,swap_memory=True,sequence_length=q2SeqLen)
    q2Rep = tf.expand_dims(q2Rep,2)
    q2Rep = tf.nn.max_pool(q2Rep,[1,seqLength,1,1],strides=[1,1,1,1],padding='VALID',data_format='NHWC',name='AvgPool')
    q2Rep = tf.squeeze(q2Rep,axis=[1,2])
    #q2Rep = q2Rep[:,-1,:]

    sentenceEmbedding = tf.concat([q1Rep,q2Rep],axis=1,name='sentenceEmbedding')

TypeError: Value passed to parameter 'input' has DataType float64 not in list of allowed values: float32, float16

In [18]:
print(q1Rep.get_shape())

(?, 50, 100)


Dense layers and output

In [6]:
with tf.variable_scope("DenseLayers") as scope:
    dense1 = tf.layers.dense(inputs=sentenceEmbedding, units=wordVecSize*2, activation=tf.nn.tanh,name='dense1')
    dropoutD1 = tf.nn.dropout(x=dense1,keep_prob=0.7)
    dense2 = tf.layers.dense(inputs=dropoutD1, units=wordVecSize*2, activation=tf.nn.tanh,name='dense2')
    dropoutD2 = tf.nn.dropout(x=dense2,keep_prob=0.7)
    dense3 = tf.layers.dense(inputs=dropoutD2, units=wordVecSize*2, activation=tf.nn.tanh,name='dense3')
    logits = tf.layers.dense(inputs=dense3, units=2,name='logits')

with tf.variable_scope("Prediction") as scope:
    predictions = tf.argmax(input=tf.nn.softmax(logits=logits,dim=-1,name='softmax'),axis=1,name='output')

Loss and gradient updates

In [7]:
num_classes = 2
labels = tf.placeholder(tf.int32,[None,num_classes],name='labels')

loss = None
train_op = None

# Calculate loss for both TRAIN and EVAL modes
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.001,
                                           optimizer="Adam")
correct_prediction = tf.equal(predictions, tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

Prepare variables for training epoch

In [8]:
session = tf.InteractiveSession()

In [11]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [12]:
saver = tf.train.Saver(restore_sequentially=True,reshape=False,sharded=False)
fetches = {'eval_op':train_op,'accuracy':accuracy}
    
print("Starting...")
session.run(tf.global_variables_initializer())

noEpisodes = 10
batchSize = 1000
noEpochs = len(trainData) / batchSize

valLabels = tf.one_hot(valData.is_duplicate.values,on_value=1,
                        off_value=0,depth=2,axis=-1,name='one_hot_labels')
valLabels = valLabels.eval(session=session)
valQ1Indices = np.array(list(valData.Q1Indexed.values),dtype=np.int32)
valQ1Len = valData.Q1Length.values.astype(np.int32)
valQ2Indices = np.array(list(valData.Q2Indexed.values),dtype=np.int32)
valQ2Len = valData.Q2Length.values.astype(np.int32)

testLabels = tf.one_hot(testData.is_duplicate.values,on_value=1,
                        off_value=0,depth=2,axis=-1,name='one_hot_labels')
testLabels = testLabels.eval(session=session)
testQ1Indices = np.array(list(testData.Q1Indexed.values),dtype=np.int32)
testQ1Len = testData.Q1Length.values.astype(np.int32)
testQ2Indices = np.array(list(testData.Q2Indexed.values),dtype=np.int32)
testQ2Len = testData.Q2Length.values.astype(np.int32)

noTestBatches = 100
testSzPerBatch = len(valQ1Indices) / noTestBatches

print("Episode\ttrain loss\tval loss\tval accuracy")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]

    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        oneHotLabels = tf.one_hot(batch.is_duplicate.values,
                          on_value=1,off_value=0,depth=2,axis=-1,name='one_hot_labels')
        oneHotLabels = oneHotLabels.eval(session=session)
        q1Indices = np.array(list(batch.Q1Indexed.values),dtype=np.int32)
        q1Len = batch.Q1Length.values.astype(np.int32)
        q2Indices = np.array(list(batch.Q2Indexed.values),dtype=np.int32)
        q2Len = batch.Q2Length.values.astype(np.int32)
        feed_dict = {q1Input:q1Indices,q1SeqLen:q1Len,q2Input:q2Indices,q2SeqLen:q2Len,labels:oneHotLabels}

        trainMetrics = session.run(fetches,feed_dict)

        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    testLoss = 0
    testAccuracy = 0
    fetches = {'loss':loss, 'accuracy':accuracy}
    for subTest in range(noTestBatches):
        startIdx = subTest*testSzPerBatch
        endIdx = startIdx + testSzPerBatch
        if(subTest == noTestBatches-1):
            endIdx = len(testQ1Indices)
        testFeed = {q1Input:valQ1Indices[startIdx:endIdx],
                    q1SeqLen:valQ1Len[startIdx:endIdx],
                    q2Input:valQ2Indices[startIdx:endIdx],
                    q2SeqLen:valQ2Len[startIdx:endIdx],
                    labels:valLabels[startIdx:endIdx]}
        testMetrics = session.run(fetches,testFeed)
        testLoss += testMetrics['loss']
        testAccuracy += testMetrics['accuracy']

    testLoss = testLoss/float(noTestBatches)
    testAccuracy = (100.0 / noTestBatches) * testAccuracy
    print("{}\t{}\t{}\t{}".format(episode,episodeLoss,testLoss,testAccuracy))
    fetches = {'eval_op':train_op,'accuracy':accuracy}

Starting...
Episode	train loss	val loss	val accuracy


InvalidArgumentError: No OpKernel was registered to support Op 'AvgPool' with these attrs.  Registered devices: [CPU,GPU], Registered kernels:
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_HALF]
  device='CPU'; T in [DT_HALF]
  device='CPU'; T in [DT_FLOAT]

	 [[Node: Sentence/AvgPool_1 = AvgPool[T=DT_DOUBLE, data_format="NHWC", ksize=[1, 50, 1, 1], padding="VALID", strides=[1, 1, 1, 1]](Sentence/ExpandDims_1)]]

Caused by op u'Sentence/AvgPool_1', defined at:
  File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 589, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-323854c3be4d>", line 26, in <module>
    q2Rep = tf.nn.avg_pool(q2Rep,[1,seqLength,1,1],strides=[1,1,1,1],padding='VALID',data_format='NHWC',name='AvgPool')
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 1793, in avg_pool
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 50, in _avg_pool
    data_format=data_format, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'AvgPool' with these attrs.  Registered devices: [CPU,GPU], Registered kernels:
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_HALF]
  device='CPU'; T in [DT_HALF]
  device='CPU'; T in [DT_FLOAT]

	 [[Node: Sentence/AvgPool_1 = AvgPool[T=DT_DOUBLE, data_format="NHWC", ksize=[1, 50, 1, 1], padding="VALID", strides=[1, 1, 1, 1]](Sentence/ExpandDims_1)]]


Testing restore and predictions

In [None]:
with tf.Session() as sess:
    saver = tf.train.import_meta_graph('/home/ubuntu/QuestionPairs/SumModel/-9.meta')
    saver.restore(sess, '/home/ubuntu/QuestionPairs/SumModel/-9')
    
    temp = predictions.eval(session=sess,feed_dict=testFeed1)

In [None]:
#np.argmax(testLabels[:lTest],axis=1)
actual = np.argmax(testLabels[:lTest],axis=1)
predicted = temp

In [None]:
y = actual - predicted
print "%age of non duplicates classified as duplicates: ", float(len(y[y==-1])) / float(len(y))
print "%age of duplicates classified as non duplicates: ", float(len(y[y==1])) / float(len(y))