# Data import & preparation

In [1]:
import os
import numpy as np
import pandas as pd
import random as rnd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive/SequenceModelsCoursera/W3_Siamese/Question Duplicates /Files/tf

/content/drive/MyDrive/SequenceModelsCoursera/W3_Siamese/Question Duplicates /Files/tf


In [4]:
data = pd.read_csv("questions.csv")
N = len(data)

print(f"Num of question pairs: {N}")
data.head()

Num of question pairs: 404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
N_train = 300000
N_test = 10240
data_train = data[:N_train]
data_test = data[N_train:N_train + N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del (data)  # remove to free memory

Train set: 300000 Test set: 10240


In [6]:
#Row indices of entries where the duplicate is true
td_index = data_train['is_duplicate'] == 1
td_index = [i for i, x in enumerate(td_index) if x ]
print(f"Number of duplicates: {len(td_index)}")
print(f"Indices of the first ten duplicates: {td_index[:10]}")

Number of duplicates: 111486
Indices of the first ten duplicates: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [7]:
print(data_train['question1'][5])
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


In [8]:
#Retreiving only the duplicate data because the hard negative mining will account for negative examples for the model to train
Q1_train = np.array(data_train['question1'][td_index])
Q2_train = np.array(data_train['question2'][td_index])

Q1_test = np.array(data_test['question1'])
Q2_test = np.array(data_test['question2'])
y_test = np.array(data_test['is_duplicate'])

In [9]:
y_test.shape

(10240,)

In [10]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train[0])
print('Question 2: ', Q2_train[0], '\n')
print('Question 1: ', Q1_train[5])
print('Question 2: ', Q2_train[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test[0])
print('Question 2: ', Q2_test[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



In [11]:
#Splitting the data for train & val set respectively
cut_off = int(len(Q1_train) * 0.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off:], Q2_train[cut_off:]
print(f"Number of duplicate questions: {len(Q1_train)}")
print(f"The length of the training set: {len(train_Q1)}")
print(f"The length of the validation set: {len(val_Q1)}")

Number of duplicate questions: 111486
The length of the training set: 89188
The length of the validation set: 22298


## Encoding text & padding

### Punctuation stripping

In [12]:
import string

def strip_punctuation(text):
    stripped_sequences = []

    #Converting a single string to a list if it was passed
    if isinstance(text, str):
        text = [text]

    #Stripping punctuation
    for sequence in text:
        stripped_sequences.append(''.join(char for char in sequence if char not in string.punctuation))

    return stripped_sequences

In [13]:
samples = ["Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?", "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]
print(f"Before stripping: {samples}")
print(f"Post stripping: {strip_punctuation(samples)}")

Before stripping: ['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]
Post stripping: ['Astrology I am a Capricorn Sun Cap moon and cap risingwhat does that say about me', 'Im a triple Capricorn Sun Moon and ascendant in Capricorn What does this say about me']


In [14]:
sample = "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?"
print(f"Before stripping: {samples}")
print(f"Post stripping: {strip_punctuation(sample)}")

Before stripping: ['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]
Post stripping: ['Astrology I am a Capricorn Sun Cap moon and cap risingwhat does that say about me']


## The actual text encoder & padder

In [15]:
from torch.nn.utils.rnn import pad_sequence
from collections import Counter


class SentenceVectorizer:
    """
    Custom word-level text encoder
    """
    #Initializing needed variables
    def __init__(self, pad_token="", unk_token="[UNK]"):
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.word2idx = {pad_token: 0, unk_token: 1}
        self.idx2word = {0: pad_token, 1: unk_token}
        self.vocab = [pad_token, unk_token]


    def fit(self, sentences):
        #Converting the single string if passed to a list for further processing
        if isinstance(sentences, str):
            sentences = [sentences]

        #Stripping punctuation
        sentences = strip_punctuation(sentences)

        #Populating the dictionary with our vocabulary
        word_counts = Counter(word for sentence in sentences for word in sentence.split())
        for word, _ in word_counts.items():
            if word not in self.word2idx:
                self.word2idx[word] = len(self.word2idx)
                self.idx2word[len(self.idx2word)] = word
                self.vocab.append(word)


    def transform(self, sentences):
        #Same here as in the previous method
        if isinstance(sentences, str):
            sentences = [sentences]

        #Stripping punctuation
        sentences = strip_punctuation(sentences)

        #Vectorizing the words by pulling the values from the dictionary, if none is found -> assign the UNK token
        vectorized = [[self.word2idx.get(word, self.word2idx[self.unk_token])
                       for word in sentence.split()]
                      for sentence in sentences]

        #Padding to the biggest sequence received
        return pad_sequence([torch.tensor(sentence) for sentence in vectorized],
                            batch_first=True,
                            padding_value=self.word2idx[self.pad_token])


def get_sentence_vectorizer(sentences):
    torch.manual_seed(33)

    # Creating the object of the Vectorizer
    sentence_vectorizer = SentenceVectorizer()

    #Building vocabulary
    sentence_vectorizer.fit(sentences)

    # Get the vocabulary
    vocab = sentence_vectorizer.vocab

    return sentence_vectorizer, vocab


#Creating the vectorizer object & vocab size
vectorizer, vocab = get_sentence_vectorizer(np.concatenate((Q1_train, Q2_train)))
vocab_size = len(vocab)


def tokenize_sentences(sentences):
    #Tokenizing the passed sentence
    encoded_sentences = vectorizer.transform(sentences)

    return encoded_sentences

In [16]:
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 36224


In [17]:
print('first question in the train set:\n')
print(Q1_train[:5], '\n')
print('encoded version:')
print(tokenize_sentences(Q1_train[:5]),'\n')

print('first question in the test set:\n')
print(Q1_test[:5], '\n')
print('encoded version:')
print(tokenize_sentences(Q1_test[:5]) )

first question in the train set:

['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'
 'How can I be a good geologist?'
 'How do I read and find my YouTube comments?'
 'What can make Physics easy to learn?'
 'What was your first sexual experience like?'] 

encoded version:
tensor([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
        [18, 19,  3, 20,  5, 21, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18, 23,  3, 24, 10, 25, 26, 27, 28,  0,  0,  0,  0,  0,  0,  0],
        [29, 19, 30, 31, 32, 33, 34,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [29, 35, 36, 37, 38, 39, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0]]) 

first question in the test set:

['How do I prepare for interviews for cse?'
 'What is the best bicycle to buy under 10k?'
 'How do I become Mutual funds distributer for all company mutual funds?'
 'Will this relationship work?' 'How does Brexit affect India?'] 

encoded version:
tensor([[  18,   23,    3,   70,  

# Custom Dataset loader

In [18]:
class CustomData(Dataset):
    """
    Custom dataset class that transforms all the data at init step
    """

    def __init__(self, sequences_Q1, sequences_Q2, labels, transform=tokenize_sentences):
        super().__init__()
        self.sequences_Q1 = transform(sequences_Q1)
        self.sequences_Q2 = transform(sequences_Q2)
        self.labels = labels

    def __len__(self):
        return len(self.sequences_Q1)

    def __getitem__(self, idx):
        return self.sequences_Q1[idx], self.sequences_Q2[idx], self.labels[idx]

In [19]:
"""
Preparing the data (for train and val set labels are just placeholders,
as we will have the custom loss (The Triplet Loss))
"""

train_set = CustomData(train_Q1, train_Q2, [1] * len(train_Q1))
val_set = CustomData(val_Q1, val_Q2, [1] * len(val_Q1))
test_set = CustomData(Q1_test, Q2_test, y_test)

BATCH_SIZE = 256

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=512)

# Defining the Siamese model

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
"""
Creating a custom L2 norm class to later use it as a layer in our NN
"""
class NormalizationLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return F.normalize(x)


"""
Base NN class
"""
class BaseNetwork(nn.Module):
    def __init__(self, text_vectorizer=tokenize_sentences, vocab_size=vocab_size, d_feature=128):
        super().__init__()

        self.Embedding = nn.Embedding(vocab_size, d_feature)
        self.LSTM = nn.LSTM(d_feature, d_feature, batch_first=True)
        self.AdaptiveAvgPooling = nn.AdaptiveAvgPool1d(1)
        self.l2norm = NormalizationLayer()

    def forward(self, x):
        x = self.Embedding(x)
        x, _ = self.LSTM(x)
        #Transposing to change the dimensions of the sequence length and embedding_dims
        #So that we can global average pool over the sequence length dimension
        x = torch.transpose(x, 1, 2)
        x = self.AdaptiveAvgPooling(x)
        x = self.l2norm(x)
        #Removing the 1 dim for further shape convenience
        x = torch.squeeze(x)

        return x


class SiameseNetwork(nn.Module):
    """
    Siamese Network class to use the NN twice on two different inputs
    """
    def __init__(self):
        super().__init__()
        #Creating an object of the NN
        self.nn = BaseNetwork(vectorizer, vocab_size)

    def forward(self, input1, input2):
        #Two vectors of sequences
        output1 = self.nn(input1)
        output2 = self.nn(input2)

        #Accounting for the testing phase where there's only 1 dim
        if output1.dim() == 1:
            conc = torch.cat((output1, output2), dim=0)
        else:
            conc = torch.cat((output1, output2), dim=1)

        return conc

In [88]:
model = SiameseNetwork().to(device)

# Hard negative mining

In [23]:
def TripletLossFn(v1, v2, margin=0.25):
    """
    Custom Loss function
    """
    #Cosine similarities
    scores = torch.matmul(v2, v1.T).to(device)
    #Row size (sample size)
    batch_size = v1.size(0)
    #Positives (on diagonal)
    positive = torch.diagonal(scores).to(device)
    #Only negatives (canceling out the positives)
    negative_zero_on_duplicate = scores - torch.diag(positive).to(device)
    #Calculating mean negatives
    mean_negative = torch.sum(negative_zero_on_duplicate, dim=1) / (batch_size - 1)
    #Masking the positives and the negatives that are bigger than the positives
    mask_exclude_positives = (torch.eye(batch_size) == 1).to(device) | (negative_zero_on_duplicate > torch.unsqueeze(positive, 1))
    #Extracting the masked elements from the original matrix
    negative_without_positives = negative_zero_on_duplicate - mask_exclude_positives * 2
    #Finding the closest negative
    closest_negative = torch.max(negative_without_positives, dim=1).values
    #Triplet loss 1
    triplet_loss1 = torch.maximum(-(positive) + closest_negative + margin, torch.tensor(0))
    #Triples loss 2
    triplet_loss2 = torch.maximum(-(positive) + mean_negative + margin, torch.tensor(0))
    #Triplet loss 3
    triplet_loss3 = torch.sum(triplet_loss1 + triplet_loss2)

    return triplet_loss3

In [24]:
v1 = torch.tensor([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = torch.tensor([[ 0.26726124,  0.53452248, 0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
print("Triplet Loss:", TripletLossFn(v1,v2))

Triplet Loss: tensor(0.7035, device='cuda:0')


In [25]:
def TripletLoss(labels, output, margin=0.25):
    """
    Extracting the two tensors of sequences received from the model
    so that we can compute the Triplet loss on the two sequences
    """
    _, embedding_dim = output.size()
    v1 = output[:, :int(embedding_dim / 2)]
    v2 = output[:, int(embedding_dim / 2):]

    return TripletLossFn(v1, v2, margin)

### Loss fn, optimizer,

In [26]:
criterion = TripletLoss
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Training the model

In [27]:
NUM_EPOCHS = 2

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    model.train()

    for i, (data1, data2, labels) in enumerate(train_loader):
        data1, data2, labels = data1.to(device), data2.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(data1, data2)
        loss = criterion(labels, outputs)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 50 == 49:
            print(f"[{epoch + 1}, {i+1}], running_loss: {running_loss:.4f}")
            running_loss = 0.0

[1, 50], running_loss: 2188.2154
[1, 100], running_loss: 941.9193
[1, 150], running_loss: 779.4593
[1, 200], running_loss: 672.8524
[1, 250], running_loss: 618.6795
[1, 300], running_loss: 573.0292
[2, 50], running_loss: 400.9674
[2, 100], running_loss: 395.5428
[2, 150], running_loss: 416.0395
[2, 200], running_loss: 413.1872
[2, 250], running_loss: 419.2199
[2, 300], running_loss: 406.6781


## Validation

In [28]:
running_loss = 0.0
model.eval()

with torch.no_grad():
    for i, (data1, data2, labels) in enumerate(val_loader):
        data1, data2, labels = data1.to(device), data2.to(device), labels.to(device)

        outputs = model(data1, data2)
        loss = criterion(labels, outputs)
        running_loss += loss.item()

        if i % 20 == 19:
            print(f"[{i+1}], validation_loss: {running_loss:.4f}")
            running_loss = 0.0

[20], validation_loss: 189.8702
[40], validation_loss: 193.6596
[60], validation_loss: 180.8987
[80], validation_loss: 181.7615


# Evaluation

## Classify

In [41]:
from sklearn.metrics import confusion_matrix

def classify(threshold, model):
    """
    Calculating the accuracy
    and confusion matrix on the test set
    """
    y_pred = []
    model.eval()

    with torch.no_grad():
        for inputs1, inputs2, y_test in test_loader:
            inputs1, inputs2, y_test = inputs1.to(device), inputs2.to(device), y_test.to(device)

            outputs = model(inputs1, inputs2)
            _, n_feat = outputs.size()
            #Extracting the two sequences
            v1 = outputs[:, :int(n_feat/2)]
            v2 = outputs[:, int(n_feat/2):]
            #Cosine similarity
            d = torch.sum(v1 * v2, dim=1)
            #Checking against the threshold
            y_pred = (d > threshold).to(dtype=torch.float64)
            y_test = y_test.to(dtype=torch.float64)

            accuracy = torch.sum((y_pred == y_test).to(dtype=torch.float32)) / len(y_test)

            cm = confusion_matrix(y_test.cpu().numpy(), y_pred.cpu().numpy())

            return accuracy, cm

In [43]:
accuracy, cm = classify(0.7, model)

print(f"Accuracy: {accuracy}\n")
print(f"Confusion matrix:\n {cm}")

Accuracy: 0.732421875

Confusion matrix:
 [[257  72]
 [ 65 118]]


## Predict

In [85]:
def predict(question1, question2, threshold, model, verbose=False):
    """
    Predicting similarity on two sequences
    """
    #Encoding
    question1 = tokenize_sentences(question1).to(device)
    question2 = tokenize_sentences(question2).to(device)
    #Passing to the model
    outputs = model(question1, question2)
    n_feat = len(outputs)
    #Extracting two encoded sequences
    v1 = outputs[:int(n_feat/2)]
    v2 = outputs[int(n_feat/2):]
    #Calculating the cosine similarity
    d = torch.sum(v1*v2)
    #Checking the cosine similarity against the threshold
    res = d > threshold

    if verbose:
        print(f"Q1 = {question1}\n Q2 = {question2}\n")
        print(f"d = {d}\n")
        print(f"res = {res}")

    return res

In [89]:
# Feel free to try with your own questions
question1 = "When will I see you?"
question2 = "When can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose = True)

Q1 = tensor([[723, 230,   3, 325,  92]], device='cuda:0')
 Q2 = tensor([[ 723,   19,    3,  325,   92, 5808]], device='cuda:0')

d = 0.7256489992141724

res = True


tensor(True, device='cuda:0')

In [92]:
# Feel free to try with your own questions
question1 = "Read a book"
question2 = "Book a table"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose=True)

Q1 = tensor([[6879,    5,  143]], device='cuda:0')
 Q2 = tensor([[4445,    5, 5413]], device='cuda:0')

d = 0.34251832962036133

res = False


tensor(False, device='cuda:0')