# Task 2. Task 2. Sentence Embedding with Sentence BERT (3 Points)

# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [1]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import datasets
import pickle

# Set GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
# Import custom modules and classes
from myutils import *

cuda


## 1. Data

### Train, Test, Validation 

- Dataset use MNLI only

In [3]:
# Load MNLI dataset
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [5]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [6]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [7]:
import numpy as np
np.unique(mnli['train']['label'])

array([0, 1, 2])

In [8]:
# create dataset dictionary with subset of data from MNLI
from datasets import DatasetDict

raw_dataset = DatasetDict({
    'train': mnli['train'].shuffle(seed=55).select(list(range(3000))),
    'test': mnli['test_mismatched'].shuffle(seed=55).select(list(range(500))),
    'validation': mnli['validation_mismatched'].shuffle(seed=55).select(list(range(1000)))
})

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## 2. Preprocessing

In [9]:
# retrieve the model weights saved from the pretrained model in Bert-update.ipynb

data     = pickle.load(open('../models/bert_model_data.pkl', 'rb'))

vocab_size          = data['vocab_size']
word2id             = data['word2id']
batch_size          = data['batch_size']
max_mask            = data['max_mask']
max_len             = data['max_len']
n_layers            = data['n_layers']
n_heads             = data['n_heads']
d_model             = data['d_model']
d_ff                = data['d_ff']
d_k                 = data['d_k']
d_v                 = data['d_v']
n_segments          = data['n_segments']
word_list           = data['word_list']
id2word             = data['id2word']

In [11]:
# Custom preprocessing function for tokenizing and preparing input data for my model
def preprocess_function(examples):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise=[]
    lst_attention_hypothesis=[]
    labels = examples['label']
    max_seq_length = 200
    seed(55) 
    for i in range(len(examples['premise'])):

        # convert the word to numeric
        tokens_premise, tokens_hypothesis            = [word2id[word] if word in word_list else len(word_list) for word in examples['premise'][i].split()], \
                                                    [word2id[word] if word in word_list else len(word_list) for word in examples['hypothesis'][i].split()]
        
        #1. token embedding - add CLS and SEP on beginning and ending of premise and hypothesis
        input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
        input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]
      
        #2. segment embedding - there one sentence so I decide to segment it as all 0
        segment_ids = [0] * max_seq_length
        #3 masking
        n_pred_premise = min(max_mask, max(1, int(round(len(input_ids_premise) * 0.15))))

        #get all the pos excluding CLS and SEP
        candidates_masked_pos_premise = [i for i, token in enumerate(input_ids_premise) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos_premise)
        masked_tokens_premise, masked_pos_premise = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos_premise[:n_pred_premise]:
            masked_pos_premise.append(pos)
            masked_tokens_premise.append(input_ids_premise[pos])
           
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids_premise[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids_premise[pos] = word2id['[MASK]']
            else: 
                pass

        n_pred_hypothesis = min(max_mask, max(1, int(round(len(input_ids_hypothesis) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos_hypothesis = [i for i, token in enumerate(input_ids_hypothesis) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos_hypothesis)
        masked_tokens_hypothesis, masked_pos_hypothesis = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos_hypothesis[:n_pred_hypothesis]:
            masked_pos_hypothesis.append(pos)
            masked_tokens_hypothesis.append(input_ids_hypothesis[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids_hypothesis[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids_hypothesis[pos] = word2id['[MASK]']
            else: 
                pass
        
        #4. pad the sentence to the max length
        n_pad_premise = max_seq_length - len(input_ids_premise)
        input_ids_premise.extend([0] * n_pad_premise)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred_premise:
            n_pad_premise = max_mask - n_pred_premise
            masked_tokens_premise.extend([0] * n_pad_premise)
            masked_pos_premise.extend([0] * n_pad_premise)
            attention_premise = [1]*n_pred_premise+[0]*(n_pad_premise)
            
        #4. pad the sentence to the max length
        n_pad_hypothesis = max_seq_length - len(input_ids_hypothesis)
        input_ids_hypothesis.extend([0] * n_pad_hypothesis)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred_hypothesis:
            n_pad_hypothesis = max_mask - n_pred_hypothesis
            masked_tokens_hypothesis.extend([0] * n_pad_hypothesis)
            masked_pos_hypothesis.extend([0] * n_pad_hypothesis)
            attention_hypothesis = [1]*n_pred_hypothesis+[0]*(n_pad_hypothesis)
        
        # add the value to own list
        lst_input_ids_premise.append(input_ids_premise)
        lst_input_ids_hypothesis.append(input_ids_hypothesis)
        lst_segment_ids.append(segment_ids)
        lst_masked_tokens_premise.append(masked_tokens_premise)
        lst_masked_pos_premise.append(masked_pos_premise)
        lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
        lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
        lst_attention_premise.append(attention_premise)
        lst_attention_hypothesis.append(attention_hypothesis)

    # return as a dictionary
    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask":lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
        "labels" : labels,
    }

In [12]:
# map raw dataset with preprocess_function to create new data dict
tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 1000
    })
})

## 3. Data loader

In [14]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [15]:
# print the shape of each key 
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_pos_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_pos_mask'].shape)
    print(batch['segment_ids'].shape)
    print(batch['attention_premise'].shape)
    print(batch['attention_hypothesis'].shape)
    print(batch['labels'].shape)
    break

torch.Size([8, 200])
torch.Size([8, 5])
torch.Size([8, 200])
torch.Size([8, 5])
torch.Size([8, 200])
torch.Size([8, 5])
torch.Size([8, 5])
torch.Size([8])


## 4. Model

In [16]:
# start from the pretrained model in Bert-update.ipynb
model = BERT(
    n_layers, 
    n_heads, 
    d_model, 
    d_ff, 
    d_k, 
    n_segments, 
    vocab_size, 
    max_len, 
    device
)
model.load_state_dict(torch.load('../models/bert_model.pth'))
model.to(device)

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(60305, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-11): 12 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (de

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [17]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)


In [18]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [19]:
classifier_head = torch.nn.Linear(vocab_size*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [20]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [21]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        pos_mask_a = batch['premise_pos_mask'].to(device)
        pos_mask_b = batch['hypothesis_pos_mask'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        attention_a = batch['attention_premise'].to(device)
        attention_b = batch['attention_hypothesis'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  
        v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)  

        u_last_hidden_state = u # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

  0%|          | 0/375 [00:00<?, ?it/s]

Epoch: 1 | loss = 11.070644


  0%|          | 0/375 [00:00<?, ?it/s]

Epoch: 2 | loss = 6.145179


  0%|          | 0/375 [00:00<?, ?it/s]

Epoch: 3 | loss = 12.418708


  0%|          | 0/375 [00:00<?, ?it/s]

Epoch: 4 | loss = 8.316485


  0%|          | 0/375 [00:00<?, ?it/s]

Epoch: 5 | loss = 4.723993


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

model.eval()
classifier_head.eval()
total_similarity = 0

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # Prepare batches and move all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        pos_mask_a = batch['premise_pos_mask'].to(device)
        pos_mask_b = batch['hypothesis_pos_mask'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        attention_a = batch['attention_premise'].to(device)
        attention_b = batch['attention_hypothesis'].to(device)
        label = batch['labels'].to(device)
        
        # Extract token embeddings from BERT at last_hidden_state
        u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  
        v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)  
        
        # Get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a)  # Shape: [batch_size, hidden_dim]
        v_mean_pool = mean_pool(v, attention_b)  # Shape: [batch_size, hidden_dim]
        
        # Move tensors to CPU and convert to NumPy for cosine similarity calculation
        u_mean_pool = u_mean_pool.cpu().numpy()  # Convert to NumPy array
        v_mean_pool = v_mean_pool.cpu().numpy()  # Convert to NumPy array
        
        # Calculate cosine similarity for each pair in the batch
        similarity_scores = cosine_similarity(u_mean_pool, v_mean_pool)  # Shape: [batch_size, batch_size]
        
        # Extract the diagonal (similarity between corresponding pairs)
        batch_similarity = np.diag(similarity_scores).mean()  # Average similarity for the batch
        total_similarity += batch_similarity
    
# Calculate the average similarity across all batches
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 1.0000


## 7. Inference

In [23]:
# tokenize the sentence of model
def tokenize_sentence_model(sentence_a, sentence_b):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise=[]
    lst_attention_hypothesis=[]
    max_seq_length = 200
    seed(55) 

    tokens_premise, tokens_hypothesis            = [word2id[word] if word in word_list else len(word_list) for word in sentence_a.split()], \
                                                    [word2id[word] if word in word_list else len(word_list) for word in sentence_b.split()]
    
    input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
    input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]
    
    #2. segment embedding 
    segment_ids = [0] * max_seq_length
     #3 masking
    n_pred_premise = min(max_mask, max(1, int(round(len(input_ids_premise) * 0.15))))

    #get all the pos excluding CLS and SEP
    candidates_masked_pos_premise = [i for i, token in enumerate(input_ids_premise) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
    shuffle(candidates_masked_pos_premise)
    masked_tokens_premise, masked_pos_premise = [], [] #compare the output with masked_tokens
    #simply loop and mask accordingly
    for pos in candidates_masked_pos_premise[:n_pred_premise]:
        masked_pos_premise.append(pos)
        masked_tokens_premise.append(input_ids_premise[pos])
           
        if random() < 0.1:  #10% replace with random token
            index = randint(0, vocab_size - 1)
            input_ids_premise[pos] = word2id[id2word[index]]
        elif random() < 0.8:  #80 replace with [MASK]
            input_ids_premise[pos] = word2id['[MASK]']
        else: 
            pass

    n_pred_hypothesis = min(max_mask, max(1, int(round(len(input_ids_hypothesis) * 0.15))))
    #get all the pos excluding CLS and SEP
    candidates_masked_pos_hypothesis = [i for i, token in enumerate(input_ids_hypothesis) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
    shuffle(candidates_masked_pos_hypothesis)
    masked_tokens_hypothesis, masked_pos_hypothesis = [], [] #compare the output with masked_tokens
    #simply loop and mask accordingly
    for pos in candidates_masked_pos_hypothesis[:n_pred_hypothesis]:
        masked_pos_hypothesis.append(pos)
        masked_tokens_hypothesis.append(input_ids_hypothesis[pos])
        if random() < 0.1:  #10% replace with random token
            index = randint(0, vocab_size - 1)
            input_ids_hypothesis[pos] = word2id[id2word[index]]
        elif random() < 0.8:  #80 replace with [MASK]
            input_ids_hypothesis[pos] = word2id['[MASK]']
        else: 
            pass

    #4. pad the sentence to the max length
    n_pad_premise = max_seq_length - len(input_ids_premise)
    input_ids_premise.extend([0] * n_pad_premise)
        
    #5. pad the mask tokens to the max length
    if max_mask > n_pred_premise:
        n_pad_premise = max_mask - n_pred_premise
        masked_tokens_premise.extend([0] * n_pad_premise)
        masked_pos_premise.extend([0] * n_pad_premise)
        attention_premise = [1]*n_pred_premise+[0]*(n_pad_premise)
            
    #4. pad the sentence to the max length
    n_pad_hypothesis = max_seq_length - len(input_ids_hypothesis)
    input_ids_hypothesis.extend([0] * n_pad_hypothesis)
        
    #5. pad the mask tokens to the max length
    if max_mask > n_pred_hypothesis:
        n_pad_hypothesis = max_mask - n_pred_hypothesis
        masked_tokens_hypothesis.extend([0] * n_pad_hypothesis)
        masked_pos_hypothesis.extend([0] * n_pad_hypothesis)
        attention_hypothesis = [1]*n_pred_hypothesis+[0]*(n_pad_hypothesis)

    lst_input_ids_premise.append(input_ids_premise)
    lst_input_ids_hypothesis.append(input_ids_hypothesis)
    lst_segment_ids.append(segment_ids)
    lst_masked_tokens_premise.append(masked_tokens_premise)
    lst_masked_pos_premise.append(masked_pos_premise)
    lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
    lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
    lst_attention_premise.append(attention_premise)
    lst_attention_hypothesis.append(attention_hypothesis)

    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask":lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
        
    }

In [24]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_model(model, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model(sentence_a, sentence_b)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = batch['premise_input_ids'].to(device)
    inputs_ids_b = batch['hypothesis_input_ids'].to(device)
    pos_mask_a = batch['premise_pos_mask'].to(device)
    pos_mask_b = batch['hypothesis_pos_mask'].to(device)
    segment_ids = batch['segment_ids'].to(device)
    attention_a = batch['attention_premise'].to(device)
    attention_b = batch['attention_hypothesis'].to(device)
    

    # Extract token embeddings from BERT
    u,_ = model(inputs_ids_a, segment_ids, pos_mask_a)  
    v,_ = model(inputs_ids_b, segment_ids, pos_mask_b) 

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # Move to CPU for NumPy
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # Move to CPU for NumPy

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity_model(model, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9999


In [25]:
# Example usage:
sentence_a = 'lemon'
sentence_b = 'lime'
similarity = calculate_similarity_model(model, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 1.0000


In [26]:
#Save model
torch.save(model.state_dict(), '../models/S_BERT.pt')

# Task 3. Evaluation and Analysis (1 points)

In [28]:
# custom function to calculate the total parameters in each model
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    print(f'______\n{sum(params):>6}')

In [29]:
# Calculate the evaluation loss for each model
def calculate_loss_model(model, classifier, criterion, eval_dataloader):
    model.eval()
    classifier.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):

            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            label = batch['labels'].to(device)

            # extract token embeddings from BERT at last_hidden_state
            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  # all token embeddings A = batch_size, seq_len, hidden_dim
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)  # all token embeddings B = batch_size, seq_len, hidden_dim

            # get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v, attention_b) # batch_size, hidden_dim

            # build the |u-v| tensor
            uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
            uv_abs = torch.abs(uv) # batch_size,hidden_dim
            
            # concatenate u, v, |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
            
            # process concatenated tensor through classifier_head
            x = classifier(x) #batch_size, classifer
            
            # calculate the 'softmax-loss' between predicted and true label
            loss = criterion(x, label)

            total_loss += loss
    
    average_loss = total_loss/len(eval_dataloader)
    print(f"Average Loss: {average_loss:.4f}")

In [30]:
# create function to compute the cosine similarity of model
def calculate_cosine_sim_model(model, classifier,eval_dataloader):
    model.eval()
    classifier.eval()
    total_similarity = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            # prepare batches and more all to the active device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            label = batch['labels'].to(device)

            # extract token embeddings from BERT at last_hidden_state

            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b) 
            # get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

            similarity_score = cosine_similarity(u_mean_pool.reshape(1, -1), v_mean_pool.reshape(1, -1))[0, 0]
            total_similarity += similarity_score
        
    average_similarity = total_similarity / len(eval_dataloader)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

In [31]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# create function for compute consine similarity of unseen 2 sentence 
def calculate_similarity_model(model, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model(sentence_a, sentence_b)
    
    # Move input IDs and attention masks to the active device
    inputs_ids_a = torch.tensor(inputs['premise_input_ids']).to(device)
    pos_mask_a = torch.tensor(inputs['premise_pos_mask']).to(device)
    attention_a = torch.tensor(inputs['attention_premise']).to(device)
    inputs_ids_b = torch.tensor(inputs['hypothesis_input_ids']).to(device)
    pos_mask_b = torch.tensor(inputs['hypothesis_pos_mask']).to(device)
    attention_b = torch.tensor(inputs['attention_hypothesis']).to(device)
    segment = torch.tensor(inputs['segment_ids']).to(device)

    # Extract token embeddings from BERT
    u,_ = model(inputs_ids_a, segment, pos_mask_a)  
    v,_ = model(inputs_ids_b, segment, pos_mask_b) 

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score  

In [32]:
# Evalue model to return accuracy, precision, recall, and F1-score

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def evaluate_nli_model(model, classifier, eval_dataloader, device):
    model.eval()
    classifier.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            labels = batch['labels'].to(device)
            
            # Extract token embeddings
            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)
            
            # Mean pooling
            u_mean_pool = mean_pool(u, attention_a)
            v_mean_pool = mean_pool(v, attention_b)
            
            # Compute absolute difference
            uv_abs = torch.abs(u_mean_pool - v_mean_pool)
            
            # Concatenate u, v, and |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1)
            
            # Pass through classifier
            logits = classifier(x)
            
            # Get predictions
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}


In [40]:
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

def predict_nli_and_similarity(model, classifier_head, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model(sentence_a, sentence_b)
    
    # Move input IDs and attention masks to the active device
    inputs_ids_a = torch.tensor(inputs['premise_input_ids']).to(device)
    pos_mask_a = torch.tensor(inputs['premise_pos_mask']).to(device)
    attention_a = torch.tensor(inputs['attention_premise']).to(device)
    inputs_ids_b = torch.tensor(inputs['hypothesis_input_ids']).to(device)
    pos_mask_b = torch.tensor(inputs['hypothesis_pos_mask']).to(device)
    attention_b = torch.tensor(inputs['attention_hypothesis']).to(device)
    segment = torch.tensor(inputs['segment_ids']).to(device)

    # Extract token embeddings from BERT
    with torch.no_grad():
        u, _ = model(inputs_ids_a, segment, pos_mask_a)
        v, _ = model(inputs_ids_b, segment, pos_mask_b)

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a)
    v = mean_pool(v, attention_b)

    # Convert to numpy for cosine similarity
    u_np = u.cpu().numpy().reshape(-1)
    v_np = v.cpu().numpy().reshape(-1)

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u_np.reshape(1, -1), v_np.reshape(1, -1))[0, 0]

    # Compute NLI classification
    uv_abs = torch.abs(u - v)  # |u - v|
    x = torch.cat([u, v, uv_abs], dim=-1)  # Concatenate for classification

    with torch.no_grad():
        logits = classifier_head(x)  # Pass through classification head
        probabilities = F.softmax(logits, dim=-1)

    # NLI labels: contradiction (0), neutral (1), entailment (2)
    labels = ["contradiction", "neutral", "entailment"]
    nli_result = labels[torch.argmax(probabilities).item()]

    return similarity_score, nli_result


In [41]:
# print shapes
print(f"u shape: {u.shape}, v shape: {v.shape}")


u shape: torch.Size([8, 5, 60305]), v shape: torch.Size([8, 5, 60305])


### Evaluate Model in Task 1

In [42]:
# load model_before for Task 1
model_before= BERT(
    n_layers, 
    n_heads, 
    d_model, 
    d_ff, 
    d_k, 
    n_segments, 
    vocab_size, 
    max_len, 
    device
)
model_before.load_state_dict(torch.load('../models/bert_model.pth'))
model_before.to(device)

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(60305, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-11): 12 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (de

In [43]:
count_parameters(model_before)

______
126260371


In [44]:
calculate_cosine_sim_model(model_before,classifier_head,eval_dataloader)

Average Cosine Similarity: 1.0000


In [45]:
calculate_loss_model(model_before,classifier_head,criterion,eval_dataloader)

Average Loss: 7.1776


In [46]:
sentence_a = 'A man is playing basketball on stage'
sentence_b = "The man is exercising"
similarity = calculate_similarity_model(model_before, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 1.0000


### EvaluateModel_after for Task 2

In [47]:
# Load model_after for Task 2
model_after= BERT(
    n_layers, 
    n_heads, 
    d_model, 
    d_ff, 
    d_k, 
    n_segments, 
    vocab_size, 
    max_len, 
    device
)
model_after.load_state_dict(torch.load('../models/S_BERT.pt'))
model_after.to(device)

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(60305, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-11): 12 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (de

In [48]:
calculate_cosine_sim_model(model_after,classifier_head,eval_dataloader)

Average Cosine Similarity: 1.0000


In [49]:
calculate_loss_model(model_after,classifier_head,criterion,eval_dataloader)

Average Loss: 7.1917


In [50]:
sentence_a = 'A man is playing basketball on stage'
sentence_b = "The man is exercising"
similarity = calculate_similarity_model(model_after, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 1.0000


In [51]:
eval_metrics = evaluate_nli_model(model_after, classifier_head, eval_dataloader, device)
print(eval_metrics)

Accuracy: 0.3320
Precision: 0.1102
Recall: 0.3320
F1-score: 0.1655
{'accuracy': 0.332, 'precision': 0.110224, 'recall': 0.332, 'f1': 0.1655015015015015}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
sentence_a = 'A man is playing basketball on stage'
sentence_b = "The man is exercising"
similarity, nli_result = predict_nli_and_similarity(model, classifier_head, sentence_a, sentence_b, device)

print(f"Cosine Similarity: {similarity:.4f}")
print(f"NLI Prediction: {nli_result}")

Cosine Similarity: 1.0000
NLI Prediction: neutral


### Performance Metrics

| **Model Type**       | **MNLI Performance**                                                                 |
|-----------------------|-------------------------------------------------------------------------------------|
| **Our Model**         | Accuracy: 0.3320, Precision: 0.1102, Recall: 0.3320, F1-Score: 0.1655              |