In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import os
import numpy as np
import pandas as pd
import random as rnd
import tensorflow as tf

# Seed Python's built-in `random` module only; TensorFlow's seed is set
# separately via tf.random.set_seed before the vectorizer is created.
rnd.seed(34)

In [2]:
# Load the question-pair dataset and report how many pairs it contains.
data = pd.read_csv("data/questions.csv")
N = data.shape[0]
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Keep the first 80% of rows (in file order) for training and hold out the
# remaining rows as the test set.
N_train = int(0.8 * N)
N_test = N - N_train
data_train, data_test = data.iloc[:N_train], data.iloc[N_train:]
print("Train set:", len(data_train), "Test set:", len(data_test))

Train set: 323480 Test set: 80871


In [4]:
# Positions (within data_train) of the rows labelled as duplicates, as a plain
# list of ints.
duplicate_mask = (data_train['is_duplicate'] == 1).to_numpy()
td_index = np.flatnonzero(duplicate_mask).tolist()
print('Number of duplicate questions: ', len(td_index))
print('Indexes of first ten duplicate questions:', td_index[:10])

Number of duplicate questions:  120210
Indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [5]:
# Inspect the pair stored under row label 5.
sample_label = 5
print(data_train.loc[sample_label, 'question1'])
print(data_train.loc[sample_label, 'question2'])
print('is_duplicate: ', data_train.loc[sample_label, 'is_duplicate'])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


In [6]:
# Training uses only the duplicate pairs. `td_index` holds row *positions*, so
# select with `.iloc`; label-based `series[td_index]` only agrees with positions
# while the frame keeps its default 0..N-1 index.
train_duplicates = data_train.iloc[td_index]
Q1_train = np.array(train_duplicates['question1'])
Q2_train = np.array(train_duplicates['question2'])
y_train = np.array(train_duplicates['is_duplicate'])

# The test arrays are taken from data_test without any filtering.
Q1_test = np.array(data_test['question1'])
Q2_test = np.array(data_test['question2'])
y_test = np.array(data_test['is_duplicate'])

In [7]:
# Print the first pair (and its label) from the training and the test arrays.
for heading, first_questions, second_questions, labels in (
    ('TRAINING QUESTIONS:\n', Q1_train, Q2_train, y_train),
    ('TESTING QUESTIONS:\n', Q1_test, Q2_test, y_test),
):
    print(heading)
    print('Question 1: ', first_questions[0])
    print('Question 2: ', second_questions[0], '\n')
    print('is_duplicate =', labels[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

is_duplicate = 1 

TESTING QUESTIONS:

Question 1:  What Type of sex position would you like to do?
Question 2:  What type of Quora swag would you be most likely to actually wear/use? 

is_duplicate = 0 



In [8]:
# Split the duplicate pairs 80/20, in their existing order, into training and
# validation sets.
cut_off = int(0.8 * len(Q1_train))
train_Q1, val_Q1 = Q1_train[:cut_off], Q1_train[cut_off:]
train_Q2, val_Q2 = Q2_train[:cut_off], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  120210
The length of the training set is:   96168
The length of the validation set is:  24042


# Encoding

In [9]:
# Shapes of each question array and of the two arrays concatenated end to end.
for questions in (Q1_train, Q2_train, np.concatenate((Q1_train, Q2_train))):
    print(questions.shape)


(120210,)
(120210,)
(240420,)


In [10]:
tf.random.set_seed(0)

# Whitespace tokenizer that strips punctuation and emits integer token ids.
text_vectorization = tf.keras.layers.TextVectorization(
    standardize='strip_punctuation',
    split='whitespace',
    output_mode='int',
)

# Learn the vocabulary from both question columns of the duplicate pairs.
all_train_questions = np.concatenate((Q1_train, Q2_train))
text_vectorization.adapt(all_train_questions)

In [11]:
# Encode one question to see the integer ids the vectorizer assigns to it.
sample_question = Q1_train[0]
print(sample_question)
print(text_vectorization(sample_question))

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
tf.Tensor(
[ 7030     6   178    10  9364  2401 36549   773    13  6454 29179    30
    28   481    45    98], shape=(16,), dtype=int64)


# Model

In [12]:
def Siamese(text_vectorizer, vocab_size, d_feature=128):
    """Build a two-input Siamese model whose inputs share a single encoder.

    Both string inputs are passed through the same ``Sequential`` branch
    (named ``'seq'``), so the two questions are embedded with identical
    weights. The model output is the two branch outputs concatenated along
    axis 1, i.e. ``2 * d_feature`` values per example.

    Args:
        text_vectorizer: Layer placed first in the shared branch; it receives
            the raw string inputs.
        vocab_size: Number of rows of the embedding table.
        d_feature: Width of the embedding and of the LSTM output.

    Returns:
        A ``tf.keras.models.Model`` taking ``[input1, input2]`` string inputs.
    """
    # Shared encoder: vectorize -> embed -> LSTM over the sequence ->
    # average over time -> L2-normalize each row.
    encoder = tf.keras.models.Sequential(
        [
            text_vectorizer,
            tf.keras.layers.Embedding(vocab_size, d_feature),
            tf.keras.layers.LSTM(d_feature, return_sequences=True),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
        ],
        name='seq',
    )

    question_inputs = [
        tf.keras.layers.Input(shape=(1,), dtype='string'),
        tf.keras.layers.Input(shape=(1,), dtype='string'),
    ]

    # Calling the same encoder object twice is what ties the weights.
    encoded = [encoder(question) for question in question_inputs]
    joined = tf.keras.layers.Concatenate(axis=1)(encoded)

    return tf.keras.models.Model(inputs=question_inputs, outputs=joined)

In [13]:
# Instantiate an untrained model so its architecture can be inspected.
vocab_size = text_vectorization.vocabulary_size()
model = Siamese(text_vectorization, vocab_size=vocab_size)
model.build(input_shape=None)

In [14]:
# Top-level view: two string inputs, the shared 'seq' branch, and the concatenation.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 seq (Sequential)               (None, 128)          4919040     ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 concatenate (Concatenate)      (None, 256)          0           ['seq[0][0]',                

In [15]:
# Layer-by-layer view of the shared encoder branch.
model.get_layer('seq').summary()

Model: "seq"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 128)         4787456   
                                                                 
 lstm (LSTM)                 (None, None, 128)         131584    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 lambda (Lambda)             (None, 128)               0         
                                                                 
Total params: 4,919,040
Trainable params: 4,919,040
Non-trainab

## Triplet Loss function
Source: NLP Specialization - Coursera

In [16]:
def TripletLossFn(v1, v2, margin=0.25):
    """Triplet loss with in-batch negatives for paired embeddings.

    Row ``i`` of ``v1`` and row ``i`` of ``v2`` are treated as a matching
    (positive) pair; every other row of ``v2`` acts as a negative for row ``i``
    of ``v1``. Two hinge terms are summed per row: one against the mean
    negative score, and one against the highest-scoring negative that does not
    score above the positive.

    Args:
        v1: 2-D tensor of embeddings, one row per example.
        v2: 2-D tensor with the same shape as ``v1``.
        margin: Required gap between the positive score and a negative score.

    Returns:
        Scalar tensor: the sum over the batch of both hinge terms.
    """
    # Pairwise dot products; the diagonal holds the positive-pair scores.
    scores = tf.linalg.matmul(v1, v2, transpose_b=True)
    n_pairs = tf.shape(v1)[0]                      # integer count, used for shapes
    batch_size = tf.cast(n_pairs, scores.dtype)    # float copy, used for arithmetic

    # Mean negative
    positive = tf.linalg.diag_part(scores)
    negative_zero_on_duplicate = scores - tf.linalg.diag(positive)
    # A batch containing a single pair has no negatives; clamp the divisor so
    # the mean is 0 instead of 0/0 = NaN. Larger batches are unaffected.
    n_negatives = tf.maximum(batch_size - 1.0, 1.0)
    mean_negative = tf.math.reduce_sum(negative_zero_on_duplicate, axis=1) / n_negatives

    # Closest negative
    # tf.eye is documented to take an integer row count, so pass n_pairs
    # rather than the float batch_size.
    diagonal_mask = tf.eye(n_pairs) == 1
    larger_than_diagonal_mask = negative_zero_on_duplicate > tf.expand_dims(positive, axis=1)
    mask_exclude_positives = tf.cast(diagonal_mask | larger_than_diagonal_mask, scores.dtype)
    # Lower excluded entries by 2.0 so the row-wise max cannot pick them
    # (sufficient when scores lie in [-1, 1], e.g. unit-norm embeddings).
    negative_without_positive = negative_zero_on_duplicate - (2.0 * mask_exclude_positives)
    closest_negative = tf.math.reduce_max(negative_without_positive, axis=1)

    # Calculate triplet loss
    triplet_loss1 = tf.maximum(0.0, margin - positive + closest_negative)
    triplet_loss2 = tf.maximum(0.0, margin - positive + mean_negative)
    triplet_loss = tf.math.reduce_sum(triplet_loss1 + triplet_loss2)

    return triplet_loss

In [17]:
def TripletLoss(labels, out, margin=0.25):
    """Keras-style loss wrapper around ``TripletLossFn``.

    Args:
        labels: Accepted to match the ``loss(y_true, y_pred)`` calling
            convention; not used in the computation.
        out: 2-D model output whose first half of columns is the first
            embedding and whose second half is the second embedding.
        margin: Forwarded to ``TripletLossFn``.

    Returns:
        The value returned by ``TripletLossFn`` for the two halves of ``out``.
    """
    _, width = out.shape
    half = int(width / 2)
    first_embedding, second_embedding = out[:, :half], out[:, half:]
    return TripletLossFn(first_embedding, second_embedding, margin=margin)

# Train

In [18]:
# Each element is ((question1, question2), label). Every label is the constant 1
# because these sets contain duplicate pairs only.
train_labels = tf.constant([1] * len(train_Q1))
val_labels = tf.constant([1] * len(val_Q1))
train_dataset = tf.data.Dataset.from_tensor_slices(((train_Q1, train_Q2), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_Q1, val_Q2), val_labels))

In [19]:
def train_model(Siamese, TripletLoss, text_vectorizer, train_dataset, val_dataset, lr, train_steps):
    """Create a fresh Siamese model, compile it with Adam, and fit it.

    Args:
        Siamese: Factory called as ``Siamese(text_vectorizer, vocab_size=...)``.
        TripletLoss: Loss callable passed to ``model.compile``.
        text_vectorizer: Vectorization layer; also supplies the vocabulary size.
        train_dataset: Batched dataset passed to ``model.fit``.
        val_dataset: Batched dataset used as ``validation_data``.
        lr: Learning rate for the Adam optimizer.
        train_steps: Passed to ``model.fit`` as the number of epochs.

    Returns:
        The fitted model.
    """
    vocab_size = text_vectorizer.vocabulary_size()
    model = Siamese(text_vectorizer, vocab_size=vocab_size)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss=TripletLoss)

    model.fit(train_dataset, validation_data=val_dataset, epochs=train_steps)

    return model

In [20]:
# Training hyperparameters. `train_steps` is forwarded to train_model, which
# uses it as the number of epochs.
train_steps = 10
batch_size = 256
lr = 0.01

# Shuffle over the full set (buffer = dataset length), then batch.
train_generator = (
    train_dataset
    .shuffle(len(train_Q1), seed=7, reshuffle_each_iteration=True)
    .batch(batch_size=batch_size)
)

val_generator = (
    val_dataset
    .shuffle(len(val_Q1), seed=7, reshuffle_each_iteration=True)
    .batch(batch_size=batch_size)
)

model = train_model(
    Siamese,
    TripletLoss,
    text_vectorization,
    train_generator,
    val_generator,
    lr,
    train_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
def classify(test_Q1, test_Q2, y_test, threshold, model, batch_size=64, verbose=True):
    """Compute duplicate-detection accuracy at a cosine-similarity threshold.

    Each pair is embedded with ``model``; a pair is predicted as a duplicate
    when the cosine similarity of its two embeddings exceeds ``threshold``.

    Args:
        test_Q1: Sequence of first questions.
        test_Q2: Sequence of second questions, aligned with ``test_Q1``.
        y_test: Ground-truth labels (1 = duplicate, 0 = not), aligned with the
            questions.
        threshold: Similarity above which a pair is predicted as duplicate.
        model: Model whose ``predict`` output holds the first embedding in the
            first half of the columns and the second embedding in the rest.
        batch_size: Batch size used for prediction.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        Scalar float64 tensor with the fraction of correctly classified pairs.
    """
    test_gen = tf.data.Dataset.from_tensor_slices(((test_Q1, test_Q2), None)).batch(batch_size=batch_size)

    pred = model.predict(test_gen)
    _, n_feat = pred.shape
    v1 = pred[:, :n_feat//2]
    v2 = pred[:, n_feat//2:]

    # Cosine similarity between the two embeddings of every pair.
    d = tf.reduce_sum(tf.multiply(v1, v2), axis=1) / (tf.norm(v1, axis=1) * tf.norm(v2, axis=1))
    y_pred = tf.cast(d > threshold, tf.float64)
    # Cast the labels explicitly: tf.equal needs both operands to share a
    # dtype, and integer label tensors would otherwise be rejected.
    y_true = tf.cast(y_test, y_pred.dtype)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float64))

    return accuracy

In [28]:
def predict(question1, question2, threshold=0.7, model=model, verbose=True):
    """Decide whether two questions are duplicates via cosine similarity.

    Note: the default for ``model`` is evaluated once, when this cell runs, so
    it refers to whichever ``model`` object existed at that moment.

    Args:
        question1: First question as a string.
        question2: Second question as a string.
        threshold: Similarity above which the pair counts as a duplicate.
        model: Model whose ``predict`` output holds the first embedding in the
            first half of the columns and the second embedding in the rest.
        verbose: When true, print the questions, the similarity and the result.

    Returns:
        One-element boolean NumPy array; ``True`` when the similarity exceeds
        ``threshold``.
    """
    pair_dataset = tf.data.Dataset.from_tensor_slices(
        (([question1], [question2]), None)
    ).batch(batch_size=1)

    embeddings = model.predict(pair_dataset)
    _, width = embeddings.shape
    half = width // 2
    v1, v2 = embeddings[:, :half], embeddings[:, half:]

    # Cosine similarity = dot product divided by the product of the norms.
    dot_product = tf.reduce_sum(tf.multiply(v1, v2), axis=1)
    norm_product = tf.norm(v1, axis=1) * tf.norm(v2, axis=1)
    d = dot_product / norm_product

    res = d > threshold

    if verbose:
        print("Q1  = ", question1, "\nQ2  = ", question2)
        print("d   = ", d.numpy())
        print("res = ", res.numpy())

    return res.numpy()

In [29]:
# Paraphrased pair: the two questions differ only in 'will' vs 'can'.
predict('When will I see you?', 'When can I see you?')

Q1  =  When will I see you? 
Q2  =  When can I see you?
d   =  [0.9817116]
res =  [ True]


array([ True])

In [34]:
# Paraphrased pair: the two questions differ only in 'possible' vs 'feasible'.
predict('Is it possible to fly?', 'Is it feasible to fly?')

Q1  =  Is it possible to fly? 
Q2  =  Is it feasible to fly?
d   =  [0.86791974]
res =  [ True]


array([ True])

In [35]:
# Unrelated pair: the two questions share no words.
predict('What time is it?', 'How old are you?')

Q1  =  What time is it? 
Q2  =  How old are you?
d   =  [-0.02421507]
res =  [False]


array([False])