# BERT

In [6]:
# import necessary library
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import re
import time
import datasets
import os

In [7]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # List all GPU devices
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. No GPUs detected.")

CUDA is not available. No GPUs detected.


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


# Task1

## 1. Data

In [9]:
from datasets import load_dataset

# Load BookCorpus dataset
# The first 1% of `train` split.
dataset = load_dataset('bookcorpus', split='train[:1%]')
#dataset = dataset.select(range(100000))
dataset

Dataset({
    features: ['text'],
    num_rows: 740042
})

## 2. Preprocessing

### Tokenization and numericalization

In [10]:
sentences = dataset['text']
text = [x.lower() for x in sentences] #lower case
text = [re.sub("[.,!?\\-]", '', x) for x in text] #clean all symbols
# text

In [11]:
# making vocab list
word_list = list(set(" ".join(text).split()))
word2id   = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

In [12]:
for sentence in text:
    print(sentence, "_____")
    words = sentence.split()
    print(words)
    break

usually  he would be tearing around the living room  playing with his toys  _____
['usually', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', 'playing', 'with', 'his', 'toys']


In [13]:
from tqdm.auto import tqdm

# Combine everything into one to make vocab
word_list = list(set(" ".join(text).split()))
word2id = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}  # special tokens

# Create the word2id in a single pass
for i, w in tqdm(enumerate(word_list), desc="Creating word2id"):
    word2id[w] = i + 4  # because 0-3 are already occupied

# Precompute the id2word mapping (this can be done once after word2id is fully populated)
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id)
vocab_size

Creating word2id: 60301it [00:00, 4688144.83it/s]


60305

In [14]:
len(sentences)

740042

In [15]:
vocab_size = len(word2id)

# List of all tokens for the whole text
token_list = []

# Process sentences more efficiently
for sentence in tqdm(text, desc="Processing sentences"):
    token_list.append([word2id[word] for word in sentence.split()])

Processing sentences: 100%|██████████| 740042/740042 [00:02<00:00, 362197.57it/s]


## 3. Data loader

In [16]:
# assign some hyperparameter
batch_size = 6
max_mask   = 5 
max_len    = 200 

In [17]:
# create batch that has half for positive and another half for negative
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2: # if positive is half, negative is also half
        
        #randomly choose two sentence
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]
        
        #1. token embedding - add CLS and SEP on starting and ending of sentence respectively
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
        
        #2. segment embedding - which sentence is 0 (first sentence) and 1 (second sentence)
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        #3 masking
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else: 
                pass
            
        #4. pad the sentence to the max length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        #6. check whether is positive or negative
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # True = it is the next sentence
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1
        
    return batch
        

In [18]:
# check the make_batch to ensure that it work correctly
batch = make_batch()

In [19]:
len(batch)

6

In [20]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [21]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 200]),
 torch.Size([6, 200]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 0, 0, 1, 1, 1]))

## 4. Model


## 4.1 Embedding



In [22]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        #x, seg: (bs, len)
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

## 4.2 Attention mask

In [23]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

### Testing the attention mask

In [24]:
print(get_attn_pad_mask(input_ids, input_ids).shape)

torch.Size([6, 200, 200])


## 4.3 Encoder

In [25]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn       = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

Let's define the scaled dot attention, to be used inside the multihead attention

In [26]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn 

Let's define the parameters first

In [27]:
n_layers = 6    # number of Encoder of Encoder Layer
n_heads  = 8    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

Here is the Multiheadattention.

In [28]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]


Here is the PoswiseFeedForwardNet.

In [29]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(F.gelu(self.fc1(x)))


## 4.4 Putting them together

In [30]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        
        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_nsp

## 5. Training

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
num_epoch = 1000
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

best_loss = float('inf')

start_time = time.time()
for epoch in range(num_epoch):
    optimizer.zero_grad()
    logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)    

    #1. mlm loss
    #logits_lm.transpose: (bs, vocab_size, max_mask) vs. masked_tokens: (bs, max_mask)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    #2. nsp loss
    #logits_nsp: (bs, 2) vs. isNext: (bs, )
    loss_nsp = criterion(logits_nsp, isNext) # for sentence classification
    
    #3. combine loss
    loss = loss_lm + loss_nsp
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), 'app/models/best-bert-model.pt')

    if epoch % 100 == 0:
        print('Epoch:', '%02d' % (epoch), 'loss =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')

Epoch: 00 loss = 115.965813
Epoch: 100 loss = 4.064146
Epoch: 200 loss = 4.383498
Epoch: 300 loss = 5.561941
Epoch: 400 loss = 6.128051
Epoch: 500 loss = 3.934734
Epoch: 600 loss = 4.089477
Epoch: 700 loss = 3.599619
Epoch: 800 loss = 3.912139
Epoch: 900 loss = 5.305858
Time: 42m 50s


## 6. Inference

Since our dataset is very small, it won't work very well, but just for the sake of demonstration.

In [38]:
# Predict mask tokens ans isNext
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[2]))
print([id2word[w.item()] for w in input_ids[0] if id2word[w.item()] != '[PAD]'])

logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)

#predict masked tokens
#max the probability along the vocab dim (2), [1] is the indices of the max, and [0] is the first value
logits_lm = logits_lm.data.max(2)[1][0].data.numpy() 
#note that zero is padding we add to the masked_tokens
print('masked tokens (words) : ',[id2word[pos.item()] for pos in masked_tokens[0]])
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0]])
print('predict masked tokens (words) : ',[id2word[pos.item()] for pos in logits_lm])
print('predict masked tokens list : ', [pos for pos in logits_lm])

#predict nsp
logits_nsp = logits_nsp.data.max(1)[1][0].data.numpy()
print(logits_nsp)
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_nsp else False)

['[CLS]', 'i', 'just', 'thought', "'", 'she', 'swallowed', 'hard', 'fighting', '[MASK]', 'prevent', 'her', 'throat', '[MASK]', 'seizing', 'up', '[SEP]', 'her', 'eyes', 'widened', '[SEP]']
masked tokens (words) :  ['from', 'to', 'her', '[PAD]', '[PAD]']
masked tokens list :  [13739, 20835, 44990, 0, 0]
predict masked tokens (words) :  ['to', 'to', 'her', 'equal', 'equal']
predict masked tokens list :  [20835, 20835, 44990, 31658, 31658]
0
isNext :  False
predict isNext :  False


# Task2

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [40]:
# load the MNLI dataset
import datasets

mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [41]:
# list of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [42]:
# remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [43]:
# list of datasets to ensure that 'idx' column is removed
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [44]:
# list all label that have in the dataset
np.unique(mnli['train']['label'])

array([0, 1, 2])

In [45]:
# create dataset dictionary with sample data (since my computer cannot run all dataset)
from datasets import DatasetDict

raw_dataset = DatasetDict({
    'train': mnli['train'].shuffle(seed=55).select(list(range(10000))),
    'test': mnli['test_mismatched'].shuffle(seed=55).select(list(range(1000))),
    'validation': mnli['validation_mismatched'].shuffle(seed=55).select(list(range(1000)))
})

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

### Preprocessing

In [46]:
import torch
from random import seed, shuffle, random, randint

def preprocess_function(examples):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise = []
    lst_attention_hypothesis = []
    
    labels = examples['label']
    max_seq_length = 200  # Ensure all sequences are max length
    max_mask = 5         # Define max number of masked tokens
    seed(55) 

    for i in range(len(examples['premise'])):
        # Convert words to indices (handle OOV words)
        tokens_premise = [word2id[word] if word in word_list else len(word_list) for word in examples['premise'][i].split()]
        tokens_hypothesis = [word2id[word] if word in word_list else len(word_list) for word in examples['hypothesis'][i].split()]

        # 1. Token Embedding - Add CLS and SEP tokens
        input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
        input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]

        # 2. Segment Embedding (All zeros)
        segment_ids = [0] * max_seq_length

        # 3. Masking Process (15% of words, excluding CLS/SEP)
        def apply_masking(input_ids):
            n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))  # 15% masking
            candidates_masked_pos = [idx for idx, token in enumerate(input_ids) if token not in [word2id['[CLS]'], word2id['[SEP]']]]
            shuffle(candidates_masked_pos)
            
            masked_tokens = []
            masked_pos = []
            
            for pos in candidates_masked_pos[:n_pred]:
                masked_pos.append(pos)
                masked_tokens.append(input_ids[pos])
                
                if random() < 0.1:  # 10% replace with random token
                    input_ids[pos] = word2id[id2word[randint(0, vocab_size - 1)]]
                elif random() < 0.8:  # 80% replace with [MASK]
                    input_ids[pos] = word2id['[MASK]']
            
            # Ensure `masked_tokens` and `masked_pos` are `max_mask` in size
            masked_tokens = masked_tokens[:max_mask] + [0] * (max_mask - len(masked_tokens))
            masked_pos = masked_pos[:max_mask] + [0] * (max_mask - len(masked_pos))
            
            return masked_tokens, masked_pos

        masked_tokens_premise, masked_pos_premise = apply_masking(input_ids_premise)
        masked_tokens_hypothesis, masked_pos_hypothesis = apply_masking(input_ids_hypothesis)

        # 4. Pad/truncate sequences to `max_seq_length`
        input_ids_premise = input_ids_premise[:max_seq_length] + [0] * (max_seq_length - len(input_ids_premise))
        input_ids_hypothesis = input_ids_hypothesis[:max_seq_length] + [0] * (max_seq_length - len(input_ids_hypothesis))

        # 5. Attention Mask (1 for real tokens, 0 for padding)
        attention_premise = [1] * len(input_ids_premise[:max_seq_length]) + [0] * (max_seq_length - len(input_ids_premise))
        attention_hypothesis = [1] * len(input_ids_hypothesis[:max_seq_length]) + [0] * (max_seq_length - len(input_ids_hypothesis))

        # Store processed values
        lst_input_ids_premise.append(input_ids_premise)
        lst_input_ids_hypothesis.append(input_ids_hypothesis)
        lst_segment_ids.append(segment_ids)
        lst_masked_tokens_premise.append(masked_tokens_premise)
        lst_masked_pos_premise.append(masked_pos_premise)
        lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
        lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
        lst_attention_premise.append(attention_premise)
        lst_attention_hypothesis.append(attention_hypothesis)

    # Return dictionary in format expected by HuggingFace datasets
    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask": lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
        "labels": labels,
    }

# Apply the preprocessing function to the dataset
tokenized_datasets = raw_dataset.map(preprocess_function, batched=True)

# Remove unneeded columns and format dataset for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'hypothesis', 'label'])
tokenized_datasets.set_format("torch")


Map: 100%|██████████| 10000/10000 [02:39<00:00, 62.74 examples/s]
Map: 100%|██████████| 1000/1000 [00:16<00:00, 61.99 examples/s]
Map: 100%|██████████| 1000/1000 [00:16<00:00, 59.65 examples/s]


In [47]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 1000
    })
})

### Data loader

In [48]:
from torch.utils.data import DataLoader

# create the dataloader
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [40]:
# print the shape of each key 
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_pos_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_pos_mask'].shape)
    print(batch['segment_ids'].shape)
    print(batch['attention_premise'].shape)
    print(batch['attention_hypothesis'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 200])
torch.Size([32, 5])
torch.Size([32, 200])
torch.Size([32, 5])
torch.Size([32, 200])
torch.Size([32, 200])
torch.Size([32, 200])
torch.Size([32])


### Model

In [49]:
# load model from task1
model1 = BERT()
model1.load_state_dict(torch.load('app/models/best-bert-model.pt'))

<All keys matched successfully>

In [50]:
def mean_pool(token_embeds, attention_mask):
    # Ensure attention_mask matches token_embeds length
    if attention_mask.shape[1] != token_embeds.shape[1]:
        raise ValueError(f"Mismatch: token_embeds has {token_embeds.shape[1]}, but attention_mask has {attention_mask.shape[1]}")

    # Expand mask and perform mean pooling
    in_mask = attention_mask.unsqueeze(-1).float()  # [batch_size, seq_len, 1]
    pool = torch.sum(token_embeds * in_mask, dim=1) / torch.clamp(in_mask.sum(dim=1), min=1e-9)

    return pool



### Loss Function

In [51]:
# the function is for Classification Objective
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

# the function is for Regression Objective
def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [52]:
# classifier_head has shape (vocab_size*3,3)
classifier_head = torch.nn.Linear(60305*3, 3).to(device)

optimizer = torch.optim.Adam(model1.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [53]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



### Training the model1

In [46]:
from tqdm.auto import tqdm
import time
import torch

num_epoch = 5  # Increase if needed
start_time = time.time()

best_loss = float('inf')  # Global best loss

for epoch in range(num_epoch):
    model1.train()  
    classifier_head.train()
    best_loss_epoch = float('inf')  # Reset for this epoch

    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # Move to device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        attention_a = batch['attention_premise'].to(device)  # ✅ Correct attention mask
        attention_b = batch['attention_hypothesis'].to(device)  # ✅ Correct attention mask
        label = batch['labels'].to(device)
        
        # Extract token embeddings from BERT
        u, _ = model1(inputs_ids_a, segment_ids, attention_a)  
        v, _ = model1(inputs_ids_b, segment_ids, attention_b)  

        # Mean Pooling
        u_mean_pool = mean_pool(u, attention_a)  # ✅ Corrected
        v_mean_pool = mean_pool(v, attention_b)  # ✅ Corrected
        
        # Compute similarity vector
        uv = torch.sub(u_mean_pool, v_mean_pool)  
        uv_abs = torch.abs(uv)
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) 
        
        # Classify
        x = classifier_head(x)  
        loss = criterion(x, label)

        # Update best loss
        if loss.item() < best_loss_epoch:
            best_loss_epoch = loss.item()  # Save best loss for this epoch
            best_model = model1.state_dict()

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

    # Update learning rate scheduler per epoch
    scheduler.step()
    scheduler_classifier.step()

    # Save model per epoch if best
    if best_loss_epoch < best_loss:
        best_loss = best_loss_epoch
        torch.save(best_model, 'app/models/our-model.pt')

    print(f'Epoch: {epoch + 1} | Loss = {best_loss_epoch:.6f}')

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')


100%|██████████| 313/313 [1:44:15<00:00, 19.99s/it]


Epoch: 1 | Loss = 7.943012


100%|██████████| 313/313 [1:48:08<00:00, 20.73s/it]


Epoch: 2 | Loss = 6.284531


100%|██████████| 313/313 [1:48:27<00:00, 20.79s/it]


Epoch: 3 | Loss = 8.375278


100%|██████████| 313/313 [1:53:32<00:00, 21.77s/it]


Epoch: 4 | Loss = 8.771623


100%|██████████| 313/313 [2:09:37<00:00, 24.85s/it]  

Epoch: 5 | Loss = 8.791630
Time: 564m 2s





# Task 3

In [54]:
# function for calculate the total parameters
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    print(f'______\n{sum(params):>6}')

In [55]:
def calculate_loss_model1(model, classifier, criterion, eval_dataloader):
    model.eval()
    classifier.eval()
    
    total_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            # Move data to device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)  # ✅ Aligned with training code
            attention_b = batch['attention_hypothesis'].to(device)  # ✅ Aligned with training code
            label = batch['labels'].to(device)

            # Extract token embeddings from BERT
            u, _ = model(inputs_ids_a, segment_ids, attention_a)  
            v, _ = model(inputs_ids_b, segment_ids, attention_b)  

            # Mean Pooling
            u_mean_pool = mean_pool(u, attention_a)  # ✅ Consistent with training
            v_mean_pool = mean_pool(v, attention_b)  # ✅ Consistent with training
            
            # Compute similarity vector
            uv = torch.sub(u_mean_pool, v_mean_pool)  
            uv_abs = torch.abs(uv)
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) 
            
            # Classifier Prediction
            x = classifier(x)  
            loss = criterion(x, label)

            total_loss += loss.item()  # ✅ Use .item() to avoid tensor accumulation issue
    
    average_loss = total_loss / len(eval_dataloader)  # Compute mean loss
    return average_loss  # ✅ Return loss instead of just printing


In [56]:
import torch.nn.functional as F

def calculate_cosine_sim_model1(model, classifier, eval_dataloader):
    model.eval()
    classifier.eval()
    
    total_similarity = 0
    batch_count = 0  # To track batch count for averaging
    
    with torch.no_grad():
        for batch in eval_dataloader:
            # Move batches to device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)  # ✅ Aligned with training
            attention_b = batch['attention_hypothesis'].to(device)  # ✅ Aligned with training

            # Extract token embeddings from BERT
            u, _ = model(inputs_ids_a, segment_ids, attention_a)  
            v, _ = model(inputs_ids_b, segment_ids, attention_b)  
            
            # Get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a)  # ✅ Batch-wise computation
            v_mean_pool = mean_pool(v, attention_b)  # ✅ Batch-wise computation

            # Compute cosine similarity in PyTorch (batch-wise)
            similarity_score = F.cosine_similarity(u_mean_pool, v_mean_pool, dim=-1)  # ✅ No NumPy needed

            # Sum up the batch similarity and increase batch count
            total_similarity += similarity_score.mean().item()
            batch_count += 1
        
    # Compute the final average similarity
    average_similarity = total_similarity / batch_count  # ✅ Avoid division by len(eval_dataloader)
    
    return average_similarity  # ✅ Return value instead of just printing


In [57]:
import random

def tokenize_sentence_model1(sentence_a, sentence_b):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise = []
    lst_attention_hypothesis = []
    
    max_seq_length = 200
    seed(55)  # Ensures consistent masking

    # Tokenize words using predefined vocabulary
    tokens_premise = [word2id[word] if word in word_list else len(word_list) for word in sentence_a.split()]
    tokens_hypothesis = [word2id[word] if word in word_list else len(word_list) for word in sentence_b.split()]
    
    # Add CLS and SEP tokens
    input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
    input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]
    
    # Segment IDs (0 for premise, 1 for hypothesis)
    segment_ids = [0] * len(input_ids_premise) + [1] * len(input_ids_hypothesis)
    segment_ids.extend([0] * (max_seq_length - len(segment_ids)))  # Pad segment IDs to max length
    
    # Masking for premise
    n_pred_premise = min(max_mask, max(1, int(round(len(input_ids_premise) * 0.15))))
    candidates_masked_pos_premise = [i for i, token in enumerate(input_ids_premise) if token not in [word2id['[CLS]'], word2id['[SEP]']]]
    shuffle(candidates_masked_pos_premise)
    
    masked_tokens_premise, masked_pos_premise = [], []
    for pos in candidates_masked_pos_premise[:n_pred_premise]:
        masked_pos_premise.append(pos)
        masked_tokens_premise.append(input_ids_premise[pos])

        rand_val = random.random()
        if rand_val < 0.8:  # 80% replace with [MASK]
            input_ids_premise[pos] = word2id['[MASK]']
        elif rand_val < 0.9:  # 10% replace with random token
            input_ids_premise[pos] = word2id[id2word[random.randint(0, vocab_size - 1)]]
    
    # Masking for hypothesis
    n_pred_hypothesis = min(max_mask, max(1, int(round(len(input_ids_hypothesis) * 0.15))))
    candidates_masked_pos_hypothesis = [i for i, token in enumerate(input_ids_hypothesis) if token not in [word2id['[CLS]'], word2id['[SEP]']]]
    shuffle(candidates_masked_pos_hypothesis)
    
    masked_tokens_hypothesis, masked_pos_hypothesis = [], []
    for pos in candidates_masked_pos_hypothesis[:n_pred_hypothesis]:
        masked_pos_hypothesis.append(pos)
        masked_tokens_hypothesis.append(input_ids_hypothesis[pos])

        rand_val = random.random()
        if rand_val < 0.8:  # 80% replace with [MASK]
            input_ids_hypothesis[pos] = word2id['[MASK]']
        elif rand_val < 0.9:  # 10% replace with random token
            input_ids_hypothesis[pos] = word2id[id2word[random.randint(0, vocab_size - 1)]]

    # Padding premise
    n_pad_premise = max_seq_length - len(input_ids_premise)
    input_ids_premise.extend([0] * n_pad_premise)

    # Attention mask premise
    attention_premise = [1] * len(input_ids_premise) + [0] * n_pad_premise
    attention_premise = attention_premise[:max_seq_length]

    # Padding hypothesis
    n_pad_hypothesis = max_seq_length - len(input_ids_hypothesis)
    input_ids_hypothesis.extend([0] * n_pad_hypothesis)

    # Attention mask hypothesis
    attention_hypothesis = [1] * len(input_ids_hypothesis) + [0] * n_pad_hypothesis
    attention_hypothesis = attention_hypothesis[:max_seq_length]

    # Masked token padding
    if max_mask > n_pred_premise:
        masked_tokens_premise.extend([0] * (max_mask - n_pred_premise))
        masked_pos_premise.extend([0] * (max_mask - n_pred_premise))

    if max_mask > n_pred_hypothesis:
        masked_tokens_hypothesis.extend([0] * (max_mask - n_pred_hypothesis))
        masked_pos_hypothesis.extend([0] * (max_mask - n_pred_hypothesis))

    # Store in lists
    lst_input_ids_premise.append(input_ids_premise[:max_seq_length])
    lst_input_ids_hypothesis.append(input_ids_hypothesis[:max_seq_length])
    lst_segment_ids.append(segment_ids[:max_seq_length])
    lst_masked_tokens_premise.append(masked_tokens_premise)
    lst_masked_pos_premise.append(masked_pos_premise)
    lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
    lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
    lst_attention_premise.append(attention_premise)
    lst_attention_hypothesis.append(attention_hypothesis)

    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask": lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
    }


In [76]:
def mean_pool(token_embeds, attention_mask):
    # Ensure attention_mask matches token_embeds length
    if attention_mask.shape[1] != token_embeds.shape[1]:
        #print(f"⚠ Warning: Resizing attention_mask from {attention_mask.shape} to match {token_embeds.shape}")
        attention_mask = attention_mask[:, :token_embeds.shape[1]]  # Trim to match
    
    # Expand mask and perform mean pooling
    in_mask = attention_mask.unsqueeze(-1).float()  # [batch_size, seq_len, 1]
    pool = torch.sum(token_embeds * in_mask, dim=1) / torch.clamp(in_mask.sum(dim=1), min=1e-9)

    return pool



In [77]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_model1(model, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model1(sentence_a, sentence_b)
    
    # Convert to tensors and move to device
    inputs_ids_a = torch.tensor(inputs['premise_input_ids']).to(device)
    pos_mask_a = torch.tensor(inputs['premise_pos_mask']).to(device)
    attention_a = torch.tensor(inputs['attention_premise']).to(device)
    inputs_ids_b = torch.tensor(inputs['hypothesis_input_ids']).to(device)
    pos_mask_b = torch.tensor(inputs['hypothesis_pos_mask']).to(device)
    attention_b = torch.tensor(inputs['attention_hypothesis']).to(device)
    segment = torch.tensor(inputs['segment_ids']).to(device)

    
    # Ensure correct tensor dimensions (remove batch dim if necessary)
    inputs_ids_a = inputs_ids_a.squeeze(0) if inputs_ids_a.dim() > 2 else inputs_ids_a  
    inputs_ids_b = inputs_ids_b.squeeze(0) if inputs_ids_b.dim() > 2 else inputs_ids_b
    segment = segment.squeeze(0) if segment.dim() > 2 else segment

    # Extract token embeddings from BERT
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        u, _ = model(inputs_ids_a, segment, pos_mask_a)  
        v, _ = model(inputs_ids_b, segment, pos_mask_b) 

    # **DEBUG STEP: Print Shapes After Model Forward Pass**
    print(f"Shape of u (embeddings): {u.shape}")
    print(f"Shape of v (embeddings): {v.shape}")

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().squeeze(0)  
    v = mean_pool(v, attention_b).detach().cpu().numpy().squeeze(0)  

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score  


In [79]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from tqdm.auto import tqdm  # Import tqdm for progress bar

def evaluate_nli_model(model, classifier, eval_dataloader, device):
    model.eval()
    classifier.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        # Wrap the evaluation loop with tqdm for progress tracking
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            labels = batch['labels'].to(device)
            
            # Extract token embeddings
            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)
            
            # Mean pooling
            u_mean_pool = mean_pool(u, attention_a)
            v_mean_pool = mean_pool(v, attention_b)
            
            # Compute absolute difference
            uv_abs = torch.abs(u_mean_pool - v_mean_pool)
            
            # Concatenate u, v, and |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1)
            
            # Pass through classifier
            logits = classifier(x)
            
            # Get predictions
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}


# then  both similarity and classification (entailment, neutral and contradiction) 

In [80]:
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

def predict_nli_and_similarity(model, classifier_head, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model1(sentence_a, sentence_b)
    
    # Move input IDs and attention masks to the active device
    inputs_ids_a = torch.tensor(inputs['premise_input_ids']).to(device)
    pos_mask_a = torch.tensor(inputs['premise_pos_mask']).to(device)
    attention_a = torch.tensor(inputs['attention_premise']).to(device)
    inputs_ids_b = torch.tensor(inputs['hypothesis_input_ids']).to(device)
    pos_mask_b = torch.tensor(inputs['hypothesis_pos_mask']).to(device)
    attention_b = torch.tensor(inputs['attention_hypothesis']).to(device)
    segment = torch.tensor(inputs['segment_ids']).to(device)

    # Extract token embeddings from BERT
    with torch.no_grad():
        u, _ = model(inputs_ids_a, segment, pos_mask_a)
        v, _ = model(inputs_ids_b, segment, pos_mask_b)

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a)
    v = mean_pool(v, attention_b)

    # Convert to numpy for cosine similarity
    u_np = u.cpu().numpy().reshape(-1)
    v_np = v.cpu().numpy().reshape(-1)

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u_np.reshape(1, -1), v_np.reshape(1, -1))[0, 0]

    # Compute NLI classification
    uv_abs = torch.abs(u - v)  # |u - v|
    x = torch.cat([u, v, uv_abs], dim=-1)  # Concatenate for classification

    with torch.no_grad():
        logits = classifier_head(x)  # Pass through classification head
        probabilities = F.softmax(logits, dim=-1)

    # NLI labels: contradiction (0), neutral (1), entailment (2)
    labels = ["contradiction", "neutral", "entailment"]
    nli_result = labels[torch.argmax(probabilities).item()]

    return similarity_score, nli_result


## Model 1

### Evaluate model1 before training with MNLI dataset

In [63]:
# load model1 before retrain in task2 
model1 = BERT()
model1.load_state_dict(torch.load('app/models/bert-model-v2.pt'))

<All keys matched successfully>

In [64]:
count_parameters(model1)

______
83137171


In [65]:
calculate_cosine_sim_model1(model1,classifier_head,eval_dataloader)

0.9987411666661501

In [66]:
calculate_loss_model1(model1,classifier_head,criterion,eval_dataloader)

8.723345130681992

In [78]:
sentence_a = 'A man is playing a guitar on stage'
sentence_b = "The man is performing music"
similarity = calculate_similarity_model1(model1, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Shape of u (embeddings): torch.Size([1, 5, 60305])
Shape of v (embeddings): torch.Size([1, 5, 60305])
Cosine Similarity: 0.9993


In [81]:
eval_metrics1 = evaluate_nli_model(model1, classifier_head, eval_dataloader, device)
print(eval_metrics1)

Evaluating: 100%|██████████| 32/32 [01:47<00:00,  3.35s/it]

Accuracy: 0.3190
Precision: 0.1018
Recall: 0.3190
F1-score: 0.1543
{'accuracy': 0.319, 'precision': 0.10176099999999999, 'recall': 0.319, 'f1': 0.15430022744503413}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Evaluate model1 after training with MNLI dataset

In [82]:
# Instantiate the BERT model
our_model = BERT()
our_model.load_state_dict(torch.load('app/models/our-model.pt'))

<All keys matched successfully>

In [83]:
calculate_cosine_sim_model1(our_model,classifier_head,eval_dataloader)

0.9986222609877586

In [84]:
calculate_loss_model1(our_model,classifier_head,criterion,eval_dataloader)

8.952201031148434

In [85]:
sentence_a = 'A man is playing a guitar on stage'
sentence_b = "The man is performing music"
similarity = calculate_similarity_model1(our_model, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Shape of u (embeddings): torch.Size([1, 5, 60305])
Shape of v (embeddings): torch.Size([1, 5, 60305])
Cosine Similarity: 0.9996


In [86]:
eval_metrics = evaluate_nli_model(our_model, classifier_head, eval_dataloader, device)
print(eval_metrics)

Evaluating: 100%|██████████| 32/32 [01:55<00:00,  3.61s/it]

Accuracy: 0.3190
Precision: 0.1018
Recall: 0.3190
F1-score: 0.1543
{'accuracy': 0.319, 'precision': 0.10176099999999999, 'recall': 0.319, 'f1': 0.15430022744503413}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [88]:
sentence_a = "A man is playing a guitar."
sentence_b = "A person is performing music."
similarity, nli_result = predict_nli_and_similarity(our_model, classifier_head, sentence_a, sentence_b, device)

print(f"Cosine Similarity: {similarity:.4f}")
print(f"NLI Prediction: {nli_result}")


Cosine Similarity: 0.9988
NLI Prediction: entailment


In [91]:
sentence_a = "Vrenna and I both fought him and he nearly took us."
sentence_b = "Neither Vrenna nor myself have ever fought him."
similarity, nli_result = predict_nli_and_similarity(our_model, classifier_head, sentence_a, sentence_b, device)

print(f"Cosine Similarity: {similarity:.4f}")
print(f"NLI Prediction: {nli_result}")

Cosine Similarity: 0.9998
NLI Prediction: entailment


In [101]:
# start from a pretrained bert-base-uncased model
from transformers import BertTokenizer, BertModel
model2 = BertModel.from_pretrained('bert-base-uncased')
model2.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [105]:
count_parameters(model2)

______
109482240
