## Train a BERT-based model to label semantic equivalence

1. Finetune BERT based on GLOVE dataset
2. BERT -> dropout -> dense output

In [37]:
# Torch 2.0.1
import torch
from  transformers import BertModel, AutoModelForMaskedLM, BertTokenizer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from datasets import load_dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [3]:
# Determine token length
MAX_TOKEN_LEN = 256


In [4]:
# settings
device = torch.device('cuda')
# load distilbert
model_checkpoint = "bert-base-uncased" #'distilbert-base-uncased'
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
#load dataset
dataset = load_dataset('glue', 'mrpc')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Found cached dataset glue (C:/Users/liujj/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 377.33it/s]


In [5]:
print(dataset['train'][0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


In [6]:
tokenizer.decode(tokenizer.encode(dataset['train'][0]['sentence1'], add_special_tokens=True))

'[CLS] amrozi accused his brother, whom he called " the witness ", of deliberately distorting his evidence. [SEP]'

In [7]:
def bertTokenizeGlue(dataset, tokenizer = tokenizer, max_length = MAX_TOKEN_LEN):

    encoded = tokenizer.encode_plus(
            dataset['sentence1'], dataset['sentence2'],
            add_special_tokens=True, 
            max_length = max_length, 
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
    # convert to pytorch tensors
    labels = dataset['label'] 
    labels = torch.tensor(labels)

    return encoded, labels

def bertTokenizeGlue_all(dataset, tokenizer = tokenizer, max_length = MAX_TOKEN_LEN):
    
    encoded = tokenizer.encode(
            dataset['sentence1'], dataset['sentence2'],
            add_special_tokens=True, 
            max_length = max_length, 
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
    # convert to pytorch tensors
    labels = dataset['label'] 
    labels = torch.tensor(labels)

    return encoded, labels

    
    

In [8]:

class BertDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        super(BertDataset, self).__init__()
        self.dataset = dataset
        self.max_length=max_length
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.dataset.num_rows 
        
    def __getitem__(self, index):
        text = self.dataset[index]
        inputs, labels = bertTokenizeGlue(text, tokenizer=self.tokenizer, max_length=self.max_length)
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'input_ids': ids,
            'attention_mask': mask,
            'token_type_ids': token_type_ids,
            'target': labels
            }

In [44]:
class BERTgrader(nn.Module):
    def __init__(self):
        super(BERTgrader, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased") # initialized above
        self.dropout = nn.Dropout(p = 0.2)
        self.outlayer = nn.Linear(768, 1) # output layer
    
    def forward(self, ids, mask, token_type_ids):
        _ , o2 = self.bert(ids,attention_mask=mask,token_type_ids=token_type_ids, return_dict=False)
        # o2 = self.dropout(o2)
        out= self.outlayer(o2)
        return out
    


In [45]:
def finetune(epochs,dataloader,grader_model,loss_fn,optimizer):
    grader_model.train()
    for epoch in range(epochs):
        #print(epoch)
        
        loop=tqdm(enumerate(dataloader),leave=False,total=len(dataloader))
        for batch, dl in loop:
            print('batch: ', batch)
            ids=dl['input_ids'].squeeze()
            token_type_ids=dl['token_type_ids'].squeeze()
            mask= dl['attention_mask'].squeeze()
            label=dl['target']
            label = label.unsqueeze(1)
            optimizer.zero_grad()
            print(ids.shape)
            output=grader_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids)
            label = label.type_as(output)

            loss=loss_fn(output,label)
            loss.backward()
            
            optimizer.step()
            
            pred = np.where(output >= 0, 1, 0)

            num_correct = sum(1 for a, b in zip(pred, label) if a[0] == b[0])
            num_samples = pred.shape[0]
            accuracy = num_correct/num_samples
            
            print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
            
            # Show progress while training
            loop.set_description(f'Epoch={epoch}/{epochs}')
            loop.set_postfix(loss=loss.item(),acc=accuracy)

    return grader_model

In [46]:
train_data = BertDataset(dataset['train'], tokenizer, max_length=MAX_TOKEN_LEN)
dataloader = DataLoader(dataset=train_data, batch_size=32)

In [47]:
grader_model = BERTgrader()
loss = nn.BCEWithLogitsLoss()        
optimizer= optim.Adam(model.parameters(),lr= 0.0001)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
# a['token_type_ids'].squeeze() # one iter
# grader_model(a['input_ids'].squeeze(), mask=a['attention_mask'].squeeze(),token_type_ids=a['token_type_ids'].squeeze()) # 41s

tensor([[1.0482],
        [1.0481],
        [1.0211],
        [1.0481],
        [1.0207],
        [1.0170],
        [1.0331],
        [1.0097],
        [1.0140],
        [1.0208],
        [1.0521],
        [1.0270],
        [1.0064],
        [1.0458],
        [1.0300],
        [1.0282],
        [1.0564],
        [1.0448],
        [0.9934],
        [1.0533],
        [1.0350],
        [1.0236],
        [1.0375],
        [1.0450],
        [1.0337],
        [1.0400],
        [1.0352],
        [1.0235],
        [1.0295],
        [1.0231],
        [1.0438],
        [0.9409]], grad_fn=<AddmmBackward0>)

In [49]:
agrader_model=finetune(5, dataloader, grader_model, loss, optimizer)

  0%|          | 0/115 [00:00<?, ?it/s]

batch:  0
torch.Size([32, 256])


                                       

In [50]:
agrader_model