# <font color = 'indianred'> **1. Understanding Multiple Negatives Ranking Loss** </font>

In [None]:
# Colab-only setup: mount Google Drive and install/upgrade the Hugging Face libraries.
# When the notebook is not running inside Colab, this cell does nothing.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    # Make Drive available under /content/drive.
    drive.mount("/content/drive")
    # -U upgrades to the newest releases; -qq suppresses most of pip's output.
    # NOTE(review): the versions are unpinned, so a later re-run may pull different releases — consider pinning.
    !pip install datasets transformers  -U -qq

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25h

<font color = 'indianred'> *Load Libraries* </font>

In [None]:
# standard data science libraries for data handling and visualization

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances


# New libraries introduced in this notebook

import torch
from datasets import load_dataset, DatasetDict, ClassLabel

from transformers import AutoTokenizer
from transformers import PreTrainedModel

from transformers.modeling_outputs import ModelOutput
from transformers import BertModel, BertConfig


import torch
import torch.nn as nn
import torch.nn.functional as F


# <font color = 'indianred'> **2. Load Dataset** </font>
    


**Quora Dataset**

The Quora dataset is composed of question pairs, and the task is to determine if the questions are paraphrases of each other (have the same meaning).



In [None]:
# Load the Quora question-pairs dataset through the Hugging Face `datasets` library.
quora_dataset = load_dataset("quora")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/35.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [None]:
# Rename 'is_duplicate' to 'labels', the column name expected by the Hugging Face Trainer.
# rename_column returns a new DatasetDict, so quora_dataset itself is left untouched.
train_dataset = quora_dataset.rename_column('is_duplicate', 'labels')

# Take a *copy* of the 'train' split's schema. Editing the live `.features` object in place
# would change the dataset's declared schema before the data has actually been cast.
features = train_dataset['train'].features.copy()

# Declare 'labels' as a two-class ClassLabel: 0 -> 'not_duplicate', 1 -> 'duplicate'.
features['labels'] = ClassLabel(num_classes=2, names=['not_duplicate', 'duplicate'])

# Cast the dataset to the updated schema so 'labels' is stored as a ClassLabel column.
train_dataset = train_dataset.cast(features)

Casting the dataset:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [None]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'labels'],
        num_rows: 404290
    })
})

In [None]:
train_dataset['train'][0]

{'questions': {'id': [1, 2],
  'text': ['What is the step by step guide to invest in share market in india?',
   'What is the step by step guide to invest in share market?']},
 'labels': 0}

In [None]:
train_dataset['train'][0]['questions']['text'][0]

'What is the step by step guide to invest in share market in india?'

In [None]:
train_dataset['train'][0]['questions']['text'][1]

'What is the step by step guide to invest in share market?'

<font color = 'indianred'> *Filter the dataset to include only duplicate pairs* </font>

In [None]:
def _is_duplicate_pair(example):
    """Return True when the question pair is labelled as a duplicate (labels == 1)."""
    return example['labels'] == 1


# Keep only the pairs marked as duplicates; every remaining row is a positive pair.
train_duplicates = train_dataset.filter(_is_duplicate_pair)

# Report how many rows survive the filter.
print(f"Original number of rows: {len(train_dataset['train'])}")
print(f"Number of duplicate rows: {len(train_duplicates['train'])}")

Filter:   0%|          | 0/404290 [00:00<?, ? examples/s]

Original number of rows: 404290
Number of duplicate rows: 149263


In [None]:
# Draw a small, reproducible sample: shuffle with a fixed seed, then keep the first 4 pairs.
shuffled_duplicates = train_duplicates['train'].shuffle(seed=123)
sample = shuffled_duplicates.select(range(4))

In [None]:
sample

Dataset({
    features: ['questions', 'labels'],
    num_rows: 4
})

We have created the dataset. The next step is to tokenize it so that the tokenized inputs can be passed to the pre-trained model.

# <font color = 'indianred'>**3. Tokenization**</font>



In [None]:
# Hub identifier of the pretrained checkpoint; the same name is used later to load the BERT encoder.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_fn(batch):
    """Tokenize the two questions of every pair in a batch independently.

    Args:
        batch: Mapping whose 'questions' entry is a sequence of dicts, each with a
            'text' list holding question 1 at index 0 and question 2 at index 1.

    Returns:
        dict with the keys 'input_ids_q1', 'attention_mask_q1', 'input_ids_q2' and
        'attention_mask_q2'. Sequences are truncated by the tokenizer but not padded.
    """
    # Split the nested pairs into two parallel lists, one per side of the pair.
    pair_texts = [pair['text'] for pair in batch['questions']]
    first_questions = [texts[0] for texts in pair_texts]
    second_questions = [texts[1] for texts in pair_texts]

    # Each side is encoded on its own so the two can later be padded separately.
    encoded_q1 = tokenizer(first_questions, truncation=True)
    encoded_q2 = tokenizer(second_questions, truncation=True)

    return dict(
        input_ids_q1=encoded_q1['input_ids'],
        attention_mask_q1=encoded_q1['attention_mask'],
        input_ids_q2=encoded_q2['input_ids'],
        attention_mask_q2=encoded_q2['attention_mask'],
    )


In [None]:
# Tokenize the sampled pairs in batched mode, then drop the raw nested 'questions' column
# so only the labels and the token-level fields remain.
tokenized_pairs = sample.map(tokenize_fn, batched=True)
tokenized_sample = tokenized_pairs.remove_columns(['questions'])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
# Make the dataset return torch tensors when indexed.
# The result is not assigned: set_format changes tokenized_sample itself.
tokenized_sample.set_format(type='torch')

In [None]:
tokenized_sample.features

{'labels': ClassLabel(names=['not_duplicate', 'duplicate'], id=None),
 'input_ids_q1': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'attention_mask_q1': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids_q2': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'attention_mask_q2': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [None]:
# Token counts of two question-1 sequences; if they differ, no padding has been applied yet.
print(len(tokenized_sample["input_ids_q1"][2]))
print(len(tokenized_sample["input_ids_q1"][1]))

17
8


The varying lengths in the dataset indicate that padding has not been applied yet. Instead of padding the entire dataset, we prefer processing small batches during training. Padding is done selectively for each batch based on the maximum length in the batch. We will discuss this in more detail in a later section of this notebook.

#  <font color = 'indianred'> **4. Get Model Inputs** </font>



In [None]:
class SiameseDataCollatorWithPadding:
    """Collate tokenized question pairs into one batch, padding each side on its own.

    Every example carries two tokenized inputs (suffix `_q1` and `_q2`). The two sides
    are padded independently, so they may end up with different sequence lengths.
    """

    def __init__(self, tokenizer, padding=True):
        """
        Store the tokenizer and padding strategy used when building batches.

        Args:
        tokenizer (PreTrainedTokenizer): Tokenizer whose `pad` method is used to pad each side.
        padding (bool, optional): Forwarded to `tokenizer.pad` as `padding`. Defaults to True.
        """
        self.tokenizer = tokenizer
        self.padding = padding

    def __call__(self, features):
        """
        Build a padded batch from a sequence of examples.

        Args:
        features: Indexable, iterable collection of examples. Each example provides
            'input_ids_q1', 'attention_mask_q1', 'input_ids_q2' and 'attention_mask_q2',
            and optionally 'labels'.

        Returns:
        dict: The four padded fields under their original `_q1` / `_q2` keys, plus a
            'labels' tensor of dtype torch.long when the first example has a 'labels' entry.
        """
        # Re-key each side to the plain names that tokenizer.pad works with.
        q1_examples, q2_examples = [], []
        for example in features:
            q1_examples.append(
                {"input_ids": example["input_ids_q1"], "attention_mask": example["attention_mask_q1"]}
            )
            q2_examples.append(
                {"input_ids": example["input_ids_q2"], "attention_mask": example["attention_mask_q2"]}
            )

        # Pad the two sides separately.
        padded_q1 = self.tokenizer.pad(q1_examples, padding=self.padding, return_tensors="pt")
        padded_q2 = self.tokenizer.pad(q2_examples, padding=self.padding, return_tensors="pt")

        # Restore the side-specific key names.
        batch = dict(
            input_ids_q1=padded_q1["input_ids"],
            attention_mask_q1=padded_q1["attention_mask"],
            input_ids_q2=padded_q2["input_ids"],
            attention_mask_q2=padded_q2["attention_mask"],
        )

        # Labels are optional; the first example decides whether they are collected.
        if "labels" not in features[0]:
            return batch

        batch["labels"] = torch.tensor([example["labels"] for example in features], dtype=torch.long)
        return batch


In [None]:
# Load the pretrained BERT encoder for the same checkpoint the tokenizer was loaded from.
model = BertModel.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Collator that pads the q1 and q2 inputs separately (padding defaults to True).
data_collator = SiameseDataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Collate the whole 4-example sample into a single padded batch.
# The collator only iterates over and indexes its argument, so the Dataset is passed directly.
model_inputs = data_collator(tokenized_sample)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
model_inputs

{'input_ids_q1': tensor([[  101,  2339,  2024,  2512,  1011,  5356, 20700,  5167, 10426,  2096,
           8225,  5356,  4834,  4861,  1029,   102,     0],
         [  101,  2129,  2003,  1996,  2147,  3226,  1029,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101,  2054,  2024,  2070,  1997,  1996,  4569, 15580,  2102,  2969,
           3111,  2017,  2031,  2412,  2464,  1029,   102],
         [  101,  2064,  1045,  7796,  2769,  3784,  1029,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0]]),
 'attention_mask_q1': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids_q2': tensor([[  101,  2339,  2024,  2512,  1011,  5356, 20700,  5167, 10426,  1999,
           5356,  4834,  4861,  1029,   102],
         

#  <font color = 'indianred'> **5. Model Outputs** </font>
- Here again, since we are passing two sets of input_ids and attention masks, `AutoModelForSequenceClassification` will not work.
- The whole idea behind SBERT is that the [CLS] token alone does not give good document-level embeddings.
- We will instead use all the tokens, so we need a pooling function to combine the embeddings of the individual tokens.

In [None]:
def mean_pool(token_embeds, attention_mask):
    """Average token embeddings over the sequence dimension, ignoring masked positions.

    Args:
        token_embeds: Tensor of token embeddings; dimension 1 is the one that is averaged over.
        attention_mask: Mask with one fewer trailing dimension than `token_embeds`
            (non-zero = real token, 0 = padding); it is broadcast to `token_embeds.size()`.

    Returns:
        Tensor with dimension 1 reduced away: the masked mean embedding of each sequence.
        A sequence whose mask is all zeros yields a zero vector (the divisor is clamped
        to 1e-9 instead of being 0).
    """
    # Broadcast the mask across the embedding dimension and make it floating point.
    weights = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    # Sum of the embeddings at unmasked positions.
    masked_total = (token_embeds * weights).sum(dim=1)
    # Number of unmasked positions, clamped to avoid division by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

<font color = 'indianred'>*Understanding the mean_pool function and the model output*</font>

In [None]:
# Run each side of the pairs through the same BERT encoder and keep the per-token embeddings.
q1_outputs = model(
    input_ids=model_inputs['input_ids_q1'],
    attention_mask=model_inputs['attention_mask_q1'],
)
q2_outputs = model(
    input_ids=model_inputs['input_ids_q2'],
    attention_mask=model_inputs['attention_mask_q2'],
)
u = q1_outputs.last_hidden_state
v = q2_outputs.last_hidden_state

In [None]:
u.shape, v.shape

(torch.Size([4, 17, 768]), torch.Size([4, 15, 768]))

In [None]:
# Collapse the per-token embeddings into one vector per question, using each side's own
# attention mask so that padding positions do not contribute to the mean.
pooled_u = mean_pool(u, model_inputs['attention_mask_q1'])
pooled_v = mean_pool(v, model_inputs['attention_mask_q2'])

In [None]:
pooled_u.shape, pooled_v.shape

(torch.Size([4, 768]), torch.Size([4, 768]))

In [None]:
# Dataset labels carried through the collator (the sample contains duplicate pairs only).
# NOTE: the name `labels` is reassigned further down to hold the ranking-loss targets.
labels = model_inputs['labels']

In [None]:
labels

tensor([1, 1, 1, 1])

In [None]:
pooled_u.shape, pooled_v.shape

(torch.Size([4, 768]), torch.Size([4, 768]))

#  <font color = 'indianred'> **6. Multiple Negatives Ranking Loss** </font>

In [None]:
# L2-normalize each pooled embedding along dim=1, so that a dot product between two
# normalized vectors equals their cosine similarity.
pooled_u_normalized = F.normalize(pooled_u, p=2, dim=1)
pooled_v_normalized = F.normalize(pooled_v, p=2, dim=1)

In [None]:
pooled_u_normalized.shape, pooled_v_normalized.shape

(torch.Size([4, 768]), torch.Size([4, 768]))

In [None]:
# Pairwise similarity matrix: entry (i, j) is the dot product of u_i with v_j.
# Both inputs were L2-normalized above, so each entry is the cosine similarity of the two sentences.
similarity_scores = pooled_u_normalized @ pooled_v_normalized.T
similarity_scores

tensor([[0.9782, 0.6088, 0.5760, 0.6016],
        [0.6235, 0.9680, 0.6258, 0.7652],
        [0.5610, 0.5772, 0.9816, 0.6089],
        [0.5718, 0.6848, 0.6149, 0.8657]], grad_fn=<MmBackward0>)

<font color = 'indianred'> **We can think of this as a multiclass classification problem** </font>

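The similarity scores act as logits: for row i, the true class is column i (the diagonal element), because v_i is the paraphrase of u_i and every other v_j in the batch serves as a negative.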
<pre>
Predictions (similarity Scores)       Labels
[u1v1, u1v2, u1v3, u1v4]              [0]
[u2v1, u2v2, u2v3, u2v4]              [1]
[u3v1, u3v2, u3v3, u3v4]              [2]
[u4v1, u4v2, u4v3, u4v4]              [3]
</pre>

In [None]:
# Target class for row i is column i (the diagonal), created on the same device as the scores.
num_pairs = similarity_scores.size(0)
labels = torch.arange(num_pairs, dtype=torch.long, device=similarity_scores.device)

In [None]:
labels

tensor([0, 1, 2, 3])

In [None]:
# Cross-entropy over each row of similarity_scores, with the diagonal index as the target class.
# NOTE(review): the logits here are raw cosine similarities (range [-1, 1]); reference
# implementations of this loss usually multiply them by a scale factor first — confirm
# that leaving them unscaled is intended for this walkthrough.
loss_fn_mnr = nn.CrossEntropyLoss()
# labels is already one-dimensional here, so view(-1) leaves its shape unchanged.
loss_mnr = loss_fn_mnr(similarity_scores, labels.view(-1))

In [None]:
loss_mnr

tensor(1.1506, grad_fn=<NllLossBackward0>)