This notebook trains a Bi-Encoder model for sentence similarity on the English configuration of the `PhilipMay/stsb_multi_mt` (STS benchmark) dataset.

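The model is based on BERT (`bert-base-uncased`) and is trained with a CoSENT ranking loss computed over pairwise angle similarities of mean-pooled sentence embeddings; cosine similarity between the embeddings is used to score sentence pairs during validation and testing.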

The script includes steps for data loading, preprocessing, model training, and evaluation.

In [1]:
!pip install transformers datasets huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [25]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
import torch

if not torch.cuda.is_available():
    # Fall back to the CPU and say so, since training will be much slower.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

In [26]:
# Authenticate with Hugging Face Hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Load the "en" configuration of the PhilipMay/stsb_multi_mt dataset.
from datasets import load_dataset
# NOTE: the variable below shares its name with the `datasets` package that
# `load_dataset` is imported from; later cells index it by split name.
datasets = load_dataset("PhilipMay/stsb_multi_mt", "en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/470k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

In [6]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Define a function to normalize tensor range
import torch

def normalize_tensor_range(tensor, new_min=-1, new_max=1):
    """
    Linearly rescale a tensor so that its smallest value maps to ``new_min``
    and its largest value maps to ``new_max``.

    The source range is taken from the tensor itself (``tensor.min()`` and
    ``tensor.max()``), so the mapping depends on the values passed in.

    Args:
        tensor (Tensor): Values to rescale. Integer tensors are converted to
            the default floating-point dtype first.
        new_min (float): Lower bound of the target range.
        new_max (float): Upper bound of the target range.

    Returns:
        Tensor: A new floating-point tensor with the same shape as ``tensor``.
        An empty input yields an empty output. An input whose values are all
        equal has no spread to rescale, so every element is mapped to the
        midpoint of ``[new_min, new_max]`` instead of producing NaN.
    """
    if not tensor.is_floating_point():
        tensor = tensor.to(torch.get_default_dtype())

    if tensor.numel() == 0:
        # min()/max() raise on an empty tensor; there is nothing to rescale.
        return tensor.clone()

    min_val = tensor.min()
    max_val = tensor.max()
    value_range = max_val - min_val

    if value_range == 0:
        # Constant input: dividing by a zero range would give NaN everywhere.
        midpoint = (new_min + new_max) / 2
        return torch.full_like(tensor, midpoint)

    # Scale to [0, 1]
    tensor_scaled = (tensor - min_val) / value_range

    # Scale to [new_min, new_max]
    tensor_normalized = tensor_scaled * (new_max - new_min) + new_min

    return tensor_normalized

# Gold similarity scores of the train, dev and test splits as tensors.
labels = torch.tensor(datasets['train']['similarity_score'])
valid_labels = torch.tensor(datasets['dev']['similarity_score'])
test_labels = torch.tensor(datasets['test']['similarity_score'])


# Rescale the scores of each split to the range [-1, 1].
# NOTE(review): every split is rescaled with its own min/max, so the mapping is
# only identical across splits if they cover the same score range - confirm.
normalized_labels = normalize_tensor_range(labels)
valid_normalized_labels = normalize_tensor_range(valid_labels)
test_normalized_labels = normalize_tensor_range(test_labels)

# Sanity check: mean normalized label of the train, dev and test splits (in that order).
print(f"Normalized Tensor 1: {torch.mean(normalized_labels), torch.mean(valid_normalized_labels), torch.mean(test_normalized_labels)}")

Normalized Tensor 1: (tensor(0.0804), tensor(-0.0544), tensor(0.0432))


In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset

In [10]:
class biencoderDataset(Dataset):
    """
    Dataset of pre-tokenized sentence pairs with one similarity label per pair.

    Each sentence is tokenized once at construction time and stored unpadded;
    padding to a common length is left to the collate function used by the
    DataLoader.

    Args:
        sentence1: Sequence of first sentences.
        sentence2: Sequence of second sentences, aligned with ``sentence1``.
        normalized_labels: Tensor of similarity scores, one per sentence pair.
        tokenizer: Callable mapping a sentence to a mapping that provides
            ``'input_ids'`` and ``'attention_mask'``. It is called with
            ``truncation=True``.

    Raises:
        ValueError: If ``sentence1``, ``sentence2`` and ``normalized_labels``
            do not all have the same length.
    """
    def __init__(self, sentence1, sentence2, normalized_labels , tokenizer):
        sentence1 = list(sentence1)
        sentence2 = list(sentence2)
        # zip() would silently drop unpaired sentences and leave labels
        # misaligned, so reject mismatched inputs up front.
        if not (len(sentence1) == len(sentence2) == len(normalized_labels)):
            raise ValueError(
                "sentence1, sentence2 and normalized_labels must have the same length, "
                f"got {len(sentence1)}, {len(sentence2)} and {len(normalized_labels)}"
            )

        self.all_input_id1 = []
        self.all_input_id2 = []
        self.all_attn_masks1 = []
        self.all_attn_masks2 = []
        self.normalized_labels = normalized_labels

        for first, second in zip(sentence1, sentence2):
            # truncation=True asks the tokenizer to cut inputs that exceed its
            # maximum length. No padding is requested here: padding a single
            # sentence to the "longest" in its own call adds nothing.
            encoded_first = tokenizer(first, truncation=True)
            encoded_second = tokenizer(second, truncation=True)
            self.all_input_id1.append(encoded_first['input_ids'])
            self.all_input_id2.append(encoded_second['input_ids'])
            self.all_attn_masks1.append(encoded_first['attention_mask'])
            self.all_attn_masks2.append(encoded_second['attention_mask'])

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.all_input_id1)

    def __getitem__(self, idx):
        """
        Return ``(input_ids1, attention_mask1, input_ids2, attention_mask2, label)``
        for pair ``idx``; the first four items are 1-D tensors and the label is
        a Python number.
        """
        return (
            torch.tensor(self.all_input_id1[idx]),
            torch.tensor(self.all_attn_masks1[idx]),
            torch.tensor(self.all_input_id2[idx]),
            torch.tensor(self.all_attn_masks2[idx]),
            self.normalized_labels[idx].item(),
        )

In [11]:
train_dataset = biencoderDataset(datasets['train']['sentence1'], datasets['train']['sentence2'],normalized_labels, tokenizer)

In [12]:
valid_dataset = biencoderDataset(datasets['dev']['sentence1'], datasets['dev']['sentence2'],valid_normalized_labels, tokenizer)

In [13]:
test_dataset = biencoderDataset(datasets['test']['sentence1'], datasets['test']['sentence2'],test_normalized_labels, tokenizer)

In [14]:
train_dataset[0]

(tensor([ 101, 1037, 4946, 2003, 2635, 2125, 1012,  102]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([ 101, 2019, 2250, 4946, 2003, 2635, 2125, 1012,  102]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]),
 1.0)

In [15]:
class DataCollator:
    """
    Collate function that turns a list of dataset items into one padded batch.

    Each item is expected to be indexable as
    ``(input_ids1, attention_mask1, input_ids2, attention_mask2, label)``.
    First and second sentences are padded together, so both halves of the
    batch end up with the same sequence length.

    Args:
        tokenizer: Object exposing ``pad_token_id``, used to pad input ids.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def pad_tensors(self, tensors, padding_value=0):
        """Right-pad a list of 1-D tensors into one batch-first 2-D tensor."""
        return torch.nn.utils.rnn.pad_sequence(
            tensors, batch_first=True, padding_value=padding_value
        )

    def __call__(self, data):
        """
        Build the batch dictionary.

        Returns a dict holding the raw (unpadded) ``input_ids`` and
        ``attention_mask`` lists, the padded ``all_*`` tensors, their
        per-sentence halves (``*_1`` / ``*_2``) and the ``labels`` tensor.
        """
        pair_count = len(data)

        def field(position):
            # Collect one tuple position across every item of the batch.
            return [item[position] for item in data]

        batch = {
            'input_ids': field(0) + field(2),
            'attention_mask': field(1) + field(3),
            'labels': field(4),
        }

        padded_ids = self.pad_tensors(
            batch['input_ids'], padding_value=self.tokenizer.pad_token_id
        )
        batch['all_input_ids'] = padded_ids
        # First half of the rows are sentence 1, second half are sentence 2.
        batch['input_ids_1'] = padded_ids[:pair_count]
        batch['input_ids_2'] = padded_ids[pair_count:]

        batch['labels'] = torch.tensor(batch['labels'])

        padded_masks = self.pad_tensors(batch['attention_mask'], padding_value=0)
        batch['all_attention_mask'] = padded_masks
        batch['attention_mask_1'] = padded_masks[:pair_count]
        batch['attention_mask_2'] = padded_masks[pair_count:]
        return batch

In [16]:
data_collator = DataCollator(tokenizer=tokenizer)

In [17]:
train_dataloader = DataLoader(train_dataset, batch_size=64, collate_fn=data_collator, shuffle=True)

In [18]:
for i in train_dataloader:
  print(i)
  break

{'input_ids': [tensor([  101,  1996,  2382,  1011,  2095,  5416,  2149, 14142, 22123,  1027,
        25269,  3473,  1015,  1011,  1017,  1013,  3590,  2005,  1037, 10750,
         1997,  1018,  1012,  2382,  3867,  1010,  2091,  2013,  1018,  1012,
         3486,  3867,  2397,  9317,  1012,   102]), tensor([  101,  8058,  2091,  1996,  3256,  1010, 22993, 23871,  1998,  2729,
         4571,  2718,  1996, 11867,  5063,  2100,  1010,  6857,  2723,  1997,
         1996,  2314,  1998,  3202,  2371,  1996,  4139,  1012,   102]), tensor([  101,  1998, 16371, 11461,  2001,  1037,  3143,  7966,  2000,  2360,
         2054,  2002,  2056,  1012,   102]), tensor([ 101, 1996, 2158, 2003, 6183, 1996, 3158, 1012,  102]), tensor([ 101, 1037, 2304, 3899, 2003, 2770, 1999, 1996, 5568, 1012,  102]), tensor([  101,  3449, 20709,  3207,  2072,  2163,  2008,  1037,  2690,  2264,
         3521,  3066,  2064,  2069,  2022,  2584,  1999,  5903,  2007,  1037,
         3036,  3820,  2241,  2006,  3690, 14808,  

In [19]:
valid_dataloader = DataLoader(valid_dataset, batch_size=64, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=64, collate_fn=data_collator)

In [20]:
from transformers import BertModel

In [21]:
import torch.nn.functional as F

In [22]:
from tqdm import tqdm

def pairwise_angle_sim(x, y):
    """
    Computes the absolute normalized angle distance between paired rows.

    Every row is split in half along dim 1; the two halves are combined the
    way the real and imaginary parts are combined in a complex division of
    ``x`` by ``y``, with the squared norm of ``y`` summed over the whole row.
    See :class:`~sentence_transformers.losses.AnglELoss` or
    https://arxiv.org/abs/2309.12871v1 for more information.

    Args:
        x (Tensor): The first tensor; dim 1 is chunked into two halves.
        y (Tensor): The second tensor; dim 1 is chunked into two halves.

    Returns:
        Tensor: Vector with res[i] = angle_sim(x[i], y[i])
    """
    x_first, x_second = torch.chunk(x, 2, dim=1)
    y_first, y_second = torch.chunk(y, 2, dim=1)

    # Squared row norm of y, used as the divisor.
    y_sq_norm = torch.sum(y_first**2 + y_second**2, dim=1, keepdim=True)
    real_part = (x_first * y_first + x_second * y_second) / y_sq_norm
    imag_part = (x_second * y_first - x_first * y_second) / y_sq_norm

    # Divide out the ratio of the row norms so magnitudes do not matter.
    x_norm = torch.sum(x_first**2 + x_second**2, dim=1, keepdim=True) ** 0.5
    y_norm = torch.sum(y_first**2 + y_second**2, dim=1, keepdim=True) ** 0.5
    norm_ratio = x_norm / y_norm
    real_part = real_part / norm_ratio
    imag_part = imag_part / norm_ratio

    combined = torch.cat((real_part, imag_part), dim=1)
    return torch.abs(torch.sum(combined, dim=1))


class CoSENTLoss(torch.nn.Module):
    """
    CoSENT ranking loss over a batch of predicted similarity scores.

    For every ordered pair ``(i, j)`` with ``labels[i] < labels[j]`` the term
    ``scale * (scores[i] - scores[j])`` enters a log-sum-exp together with a
    constant zero, so the loss grows when a pair with the lower label receives
    the higher score.

    Args:
        scale (float): Multiplier applied to the scores before differencing.
    """

    def __init__(self, scale: float = 20.0) -> None:
        super().__init__()
        self.scale = scale

    def forward(self, scores, labels):
        """
        Args:
            scores (Tensor): Predicted similarity per example.
            labels (Tensor): Gold similarity per example.

        Returns:
            Tensor: Scalar loss.
        """
        scaled = scores * self.scale
        # score_gap[i, j] = scaled[i] - scaled[j]
        score_gap = scaled.unsqueeze(1) - scaled.unsqueeze(0)

        # 1.0 where pair (i, j) is relevant, i.e. labels[i] < labels[j]
        relevant = (labels.unsqueeze(1) < labels.unsqueeze(0)).float()

        # push irrelevant pairs to a huge negative value so exp() makes them vanish
        score_gap = score_gap - (1 - relevant) * 1e12

        # a leading zero contributes e^0 = 1 inside the log-sum-exp
        leading_zero = torch.zeros(1, device=score_gap.device)
        terms = torch.cat((leading_zero, score_gap.view(-1)), dim=0)
        return torch.logsumexp(terms, dim=0)


def _mean_pool_normalized(model, input_ids, attention_mask):
  """Encode a padded batch and return L2-normalized, mask-aware mean-pooled sentence embeddings."""
  token_embeddings = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
  # Zero the embeddings at padded positions so they do not contribute to the sum.
  masked_embeddings = token_embeddings * attention_mask.unsqueeze(-1)
  # Average over real tokens only; the clamp avoids 0/0 for a row that is all padding.
  token_counts = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
  sentence_embeddings = torch.sum(masked_embeddings, dim=1) / token_counts
  return F.normalize(sentence_embeddings, p=2, dim=1)


def _embed_pair_batch(model, batch, device):
  """Move one collated batch to ``device`` and return ``(embeddings1, embeddings2, labels)``."""
  embeddings1 = _mean_pool_normalized(
      model, batch['input_ids_1'].to(device), batch['attention_mask_1'].to(device))
  embeddings2 = _mean_pool_normalized(
      model, batch['input_ids_2'].to(device), batch['attention_mask_2'].to(device))
  return embeddings1, embeddings2, batch['labels'].to(device)


def train(epochs, lr, model, train_dataloader, valid_dataloader, opt_func=torch.optim.Adam):
  """
  Fine-tune ``model`` as a bi-encoder and report per-epoch losses.

  Training minimizes ``CoSENTLoss`` over ``pairwise_angle_sim`` of the two
  sentence embeddings. Validation reports the mean absolute error between the
  cosine similarity of the embeddings and the labels, so the two printed
  numbers are on different scales and are not directly comparable.

  A new optimizer is created on every call, so optimizer state is not carried
  over between successive calls.

  Args:
      epochs (int): Number of passes over ``train_dataloader``.
      lr (float): Learning rate passed to ``opt_func``.
      model: Encoder called as ``model(input_ids=..., attention_mask=...)``
          whose output exposes ``last_hidden_state``. Batches are moved to the
          device of the model's first parameter.
      train_dataloader: Iterable of batches with the keys ``input_ids_1``,
          ``attention_mask_1``, ``input_ids_2``, ``attention_mask_2`` and
          ``labels``.
      valid_dataloader: Iterable of batches with the same keys.
      opt_func: Optimizer factory called as ``opt_func(model.parameters(), lr)``.

  Returns:
      list[dict]: One entry per epoch with the keys ``'epoch'``,
      ``'train_loss'`` and ``'valid_loss'``. A loss is ``nan`` when the
      corresponding dataloader yielded no batches.
  """
  history = []
  optimizer = opt_func(model.parameters(), lr)
  criterion = CoSENTLoss()
  # Follow the model's own device rather than relying on a notebook-level global.
  device = next(model.parameters()).device

  for epoch in range(epochs):
    model.train()
    train_losses = []
    for batch in tqdm(train_dataloader):
      embeddings1, embeddings2, labels = _embed_pair_batch(model, batch, device)
      scores = pairwise_angle_sim(embeddings1, embeddings2)
      loss = criterion(scores, labels)
      loss.backward()  # calculate gradients
      optimizer.step()
      optimizer.zero_grad()
      # Keep only the value; holding the attached tensor would keep its graph referenced.
      train_losses.append(loss.detach())
    total_loss = torch.stack(train_losses).mean().item() if train_losses else float('nan')

    model.eval()
    valid_losses = []
    with torch.no_grad():
      for batch in tqdm(valid_dataloader):
        embeddings1, embeddings2, labels = _embed_pair_batch(model, batch, device)
        scores = F.cosine_similarity(embeddings1, embeddings2, dim=1)
        valid_losses.append(torch.mean(torch.abs(labels - scores)))
    # Aggregate once after the loop (mean of the per-batch means).
    total_valid_loss = torch.stack(valid_losses).mean().item() if valid_losses else float('nan')

    print('epoch_training_loss: {}, epoch_validation_loss: {}'.format(total_loss, total_valid_loss))
    history.append({'epoch': epoch, 'train_loss': total_loss, 'valid_loss': total_valid_loss})

  return history


In [23]:
# Load the pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [24]:
epochs = 6
lr = 0.00001
history = train(epochs, lr, model, train_dataloader, valid_dataloader)

100%|██████████| 90/90 [00:32<00:00,  2.76it/s]
100%|██████████| 24/24 [00:02<00:00, 11.05it/s]


epoch_training_loss: 7.774633884429932, epoch_validation_loss: 0.965406060218811


100%|██████████| 90/90 [00:32<00:00,  2.77it/s]
100%|██████████| 24/24 [00:02<00:00, 10.76it/s]


epoch_training_loss: 7.252868175506592, epoch_validation_loss: 0.9585660696029663


100%|██████████| 90/90 [00:32<00:00,  2.73it/s]
100%|██████████| 24/24 [00:02<00:00, 10.57it/s]


epoch_training_loss: 7.14340353012085, epoch_validation_loss: 0.9500483274459839


100%|██████████| 90/90 [00:32<00:00,  2.75it/s]
100%|██████████| 24/24 [00:02<00:00, 10.65it/s]


epoch_training_loss: 7.057457447052002, epoch_validation_loss: 0.9335923194885254


100%|██████████| 90/90 [00:33<00:00,  2.72it/s]
100%|██████████| 24/24 [00:02<00:00, 10.59it/s]


epoch_training_loss: 6.946145534515381, epoch_validation_loss: 0.9192476272583008


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.64it/s]

epoch_training_loss: 6.844447135925293, epoch_validation_loss: 0.9017157554626465





In [27]:
epochs = 4
lr = 0.00001
history = train(epochs, lr, model, train_dataloader, valid_dataloader)

100%|██████████| 90/90 [00:32<00:00,  2.73it/s]
100%|██████████| 24/24 [00:02<00:00, 10.53it/s]


epoch_training_loss: 6.771575927734375, epoch_validation_loss: 0.8893661499023438


100%|██████████| 90/90 [00:33<00:00,  2.72it/s]
100%|██████████| 24/24 [00:02<00:00, 10.63it/s]


epoch_training_loss: 6.647977828979492, epoch_validation_loss: 0.867326021194458


100%|██████████| 90/90 [00:32<00:00,  2.73it/s]
100%|██████████| 24/24 [00:02<00:00, 10.64it/s]


epoch_training_loss: 6.528115749359131, epoch_validation_loss: 0.8381671905517578


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.56it/s]

epoch_training_loss: 6.436467170715332, epoch_validation_loss: 0.8215551376342773





In [29]:
epochs = 4
lr = 0.00001
history = train(epochs, lr, model, train_dataloader, valid_dataloader)

100%|██████████| 90/90 [00:33<00:00,  2.71it/s]
100%|██████████| 24/24 [00:02<00:00, 10.54it/s]


epoch_training_loss: 6.352938175201416, epoch_validation_loss: 0.8034541010856628


100%|██████████| 90/90 [00:33<00:00,  2.70it/s]
100%|██████████| 24/24 [00:02<00:00, 10.70it/s]


epoch_training_loss: 6.264917373657227, epoch_validation_loss: 0.7896143198013306


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.57it/s]


epoch_training_loss: 6.216753005981445, epoch_validation_loss: 0.7811267375946045


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.59it/s]

epoch_training_loss: 6.161207675933838, epoch_validation_loss: 0.7692922353744507





In [31]:
epochs = 4
lr = 0.00001
history = train(epochs, lr, model, train_dataloader, valid_dataloader)

100%|██████████| 90/90 [00:33<00:00,  2.70it/s]
100%|██████████| 24/24 [00:02<00:00, 10.53it/s]


epoch_training_loss: 6.155496120452881, epoch_validation_loss: 0.7603596448898315


100%|██████████| 90/90 [00:33<00:00,  2.70it/s]
100%|██████████| 24/24 [00:02<00:00, 10.67it/s]


epoch_training_loss: 6.100245475769043, epoch_validation_loss: 0.755506157875061


100%|██████████| 90/90 [00:32<00:00,  2.73it/s]
100%|██████████| 24/24 [00:02<00:00, 10.56it/s]


epoch_training_loss: 6.056992053985596, epoch_validation_loss: 0.7520484924316406


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.57it/s]

epoch_training_loss: 6.022897720336914, epoch_validation_loss: 0.7431046962738037





In [33]:
epochs = 4
lr = 0.00001
history = train(epochs, lr, model, train_dataloader, valid_dataloader)

100%|██████████| 90/90 [00:32<00:00,  2.77it/s]
100%|██████████| 24/24 [00:02<00:00, 10.54it/s]


epoch_training_loss: 5.97612190246582, epoch_validation_loss: 0.7351378202438354


100%|██████████| 90/90 [00:33<00:00,  2.69it/s]
100%|██████████| 24/24 [00:02<00:00, 10.62it/s]


epoch_training_loss: 5.993607521057129, epoch_validation_loss: 0.7320609092712402


100%|██████████| 90/90 [00:33<00:00,  2.71it/s]
100%|██████████| 24/24 [00:02<00:00, 10.63it/s]


epoch_training_loss: 5.977212429046631, epoch_validation_loss: 0.7272433042526245


100%|██████████| 90/90 [00:33<00:00,  2.70it/s]
100%|██████████| 24/24 [00:02<00:00, 10.54it/s]

epoch_training_loss: 5.959202766418457, epoch_validation_loss: 0.7280153036117554





In [None]:
# from google.colab import drive
# drive.mount('sentence-similarity')

In [None]:
# torch.save(model.state_dict(), 'sentence-similarity/My Drive/pytorch practice notebooks/ssm/model.pth')  # Change the path as needed


In [None]:
# model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# import torch
# state_dict = torch.load('sentence-similarity/My Drive/pytorch practice notebooks/ssm/model.pth')

In [None]:
# model.load_state_dict(state_dict)

In [None]:
# import torch
# # If there's a GPU available...
# if torch.cuda.is_available():
#     # Tell PyTorch to use the GPU.
#     device = torch.device("cuda")
#     print('There are %d GPU(s) available.' % torch.cuda.device_count())
#     print('We will use the GPU:', torch.cuda.get_device_name(0))
# # If not...
# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")

In [None]:
# model.to(device)

In [34]:
# Score sentence pairs from the test split with the fine-tuned encoder.
# NOTE: the `break` at the end of the loop body stops after the first batch, so
# the printed similarities, labels and MSE cover only that batch.
model.eval()
with torch.no_grad():
  for batch in test_dataloader:
    # These empty lists are placeholders; both names are reassigned below before being read.
    sentence_embeddings1 = []
    sentence_embeddings2 = []
    input_ids1 = batch['input_ids_1'].to(device)
    attention_mask1 = batch['attention_mask_1'].to(device)

    input_ids2 = batch['input_ids_2'].to(device)
    attention_mask2 = batch['attention_mask_2'].to(device)
    labels = batch['labels'].to(device)

    # Encode both sentences with the same model (shared weights).
    outputs1 = model(input_ids=input_ids1, attention_mask=attention_mask1)
    outputs2 = model(input_ids=input_ids2, attention_mask=attention_mask2)
    token_embeddings1 = outputs1.last_hidden_state
    token_embeddings2 = outputs2.last_hidden_state

    # Add a trailing dimension to the attention mask so it broadcasts against the token embeddings.
    attention_mask1_unsqueezed = attention_mask1.unsqueeze(-1)
    # Multiply the token embeddings by the mask so positions with attention mask 0 become zero.
    token_embeddings1_masked = torch.mul(token_embeddings1, attention_mask1_unsqueezed)
    # Sum the token embeddings and divide by the attention-mask count: the average over tokens with mask 1 only.
    sentence_embeddings1 = torch.sum(token_embeddings1_masked, dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)

    # Add a trailing dimension to the attention mask so it broadcasts against the token embeddings.
    attention_mask2_unsqueezed = attention_mask2.unsqueeze(-1)
    # Multiply the token embeddings by the mask so positions with attention mask 0 become zero.
    token_embeddings2_masked = torch.mul(token_embeddings2, attention_mask2_unsqueezed)
    # Sum the token embeddings and divide by the attention-mask count: the average over tokens with mask 1 only.
    sentence_embeddings2 = torch.sum(token_embeddings2_masked, dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)

    # L2-normalize the pooled sentence embeddings along the feature dimension.
    normalized_tensor1 = F.normalize(sentence_embeddings1, p=2, dim=1)
    normalized_tensor2 = F.normalize(sentence_embeddings2, p=2, dim=1)

    # Cosine similarity per sentence pair, compared against the normalized labels with MSE.
    output_dot_product = torch.nn.functional.cosine_similarity(normalized_tensor1, normalized_tensor2, dim=1)
    print("output_dot_product", output_dot_product)
    print("labels", labels)
    loss = torch.nn.MSELoss()(output_dot_product, labels)
    print("loss", loss.item())
    # Only the first batch is inspected.
    break

output_dot_product tensor([0.8173, 0.9001, 0.9419, 0.9512, 0.5170, 0.6525, 0.8983, 0.5654, 0.7108,
        0.4809, 0.4809, 0.9626, 0.4532, 0.9288, 0.6240, 0.5463, 0.9488, 0.8330,
        0.7852, 0.5015, 0.6479, 0.5305, 0.9694, 0.9511, 0.4964, 0.8838, 0.4138,
        0.8073, 0.4269, 0.8262, 0.8637, 0.8931, 0.6648, 0.9275, 0.8215, 0.3037,
        0.4293, 0.7195, 0.9141, 0.4385, 0.8808, 0.7717, 0.7759, 0.4481, 0.5469,
        0.3488, 0.2489, 0.3202, 0.9349, 0.7531, 0.3974, 0.5881, 0.4072, 0.3275,
        0.5628, 0.3220, 0.8888, 0.6198, 0.8523, 0.5216, 0.9652, 0.3208, 0.8173,
        0.5377], device='cuda:0')
labels tensor([ 0.0000,  0.4400,  1.0000,  0.6800, -0.4000, -0.2800,  0.4000, -0.1200,
        -0.1200, -0.3144, -0.3144,  1.0000, -0.7600,  0.7600, -0.2000, -0.2800,
         0.7600,  0.4400,  0.4400, -0.5200, -0.0400, -0.9200,  0.6800,  0.7600,
        -0.1000, -0.2000, -0.7000, -0.1200, -0.6800, -0.1200,  0.2800,  0.9200,
        -0.4400,  0.7000,  0.3600, -0.7868, -0.8400, -0.5200