## CS310 Natural Language Processing
## Assignment 5 (part 2): Pretraining BERT with on a Full Dataset

You should re-use the code from A5_bert_toy.ipynb. For clarity, you are suggested to put the code for model definition in a separate file, e.g., model.py, and import it here.

In [18]:
import math
import re
import random
from typing import List, Dict
from pprint import pprint
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from model import Embedding,ScaledDotProductAttention,MultiHeadAttention,PoswiseFeedForwardNet,EncoderLayer,BERT
import importlib
import sys




In [19]:

def build_vocab(file_paths):
    sentences=[]
    for file_path in file_paths:
    # Initialize a set to store unique words
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences.extend(text.replace('\n',""))

    word_types = set(list(sentences))

    word_to_id = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
    for i, w in enumerate(word_types):
        word_to_id[w] = i + 4
    id_to_word = {i: w for i, w in enumerate(word_to_id)}
    vocab_size = len(word_to_id)


    return word_to_id, id_to_word, vocab_size

def tokenize_sentences(file_path, word_to_id):

    tokens_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        sentences=text.split('\n')
        
        for sentence in sentences:
            tokens=[]
            for s in list(sentence):
                tokens.append(word_to_id[s])
            tokens_list.append(tokens)

    return tokens_list

### 1. Data Processing

In [20]:

# Load and preprocess data
word_to_id, id_to_word, vocab_size = build_vocab(['train.txt','test.raw.txt'])
print("vocab_size",vocab_size)
print("word_to_id",word_to_id)
train_data = tokenize_sentences('train.txt', word_to_id)
print("train_data",train_data[:3])
test_data = tokenize_sentences('test.raw.txt', word_to_id)
print("test_data",test_data[:3])
tokens_list=train_data

max_len = max(len(seq) for seq in tokens_list)
print("max_len:", max_len)
MAX_LEN = max_len
VOCAB_SIZE=vocab_size
batch_size = 6
MAX_PRED = 5 



vocab_size 1529
word_to_id {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, '呺': 4, '战': 5, '设': 6, '滑': 7, '閒': 8, '时': 9, '避': 10, '恤': 11, '謷': 12, '矣': 13, '鼷': 14, '复': 15, '徙': 16, '仆': 17, '失': 18, '停': 19, '疲': 20, '倍': 21, '利': 22, '知': 23, '对': 24, '暴': 25, '挑': 26, '刑': 27, '晦': 28, '术': 29, '连': 30, '可': 31, '饥': 32, '矰': 33, '仞': 34, '偏': 35, '排': 36, '撮': 37, '彰': 38, '诸': 39, '振': 40, '池': 41, '始': 42, '鹪': 43, '胡': 44, '斗': 45, '维': 46, '栎': 47, '观': 48, '商': 49, '把': 50, '同': 51, '翣': 52, '？': 53, '猎': 54, '危': 55, '绝': 56, '理': 57, '旷': 58, '华': 59, '雾': 60, '毁': 61, '鸡': 62, '未': 63, '咬': 64, '巫': 65, '勤': 66, '象': 67, '儵': 68, '地': 69, '譹': 70, '坠': 71, '榆': 72, '慎': 73, '椿': 74, '军': 75, '杀': 76, '成': 77, '域': 78, '礴': 79, '争': 80, '黜': 81, '顾': 82, '涕': 83, '饮': 84, '泣': 85, '郑': 86, '誉': 87, '喁': 88, '斄': 89, '洫': 90, '汝': 91, '猵': 92, '耆': 93, '黥': 94, '繲': 95, '管': 96, '任': 97, '多': 98, '后': 99, '履': 100, '惑': 101, '暖': 102, '莛': 103, '西': 104, '豚': 105, '靡': 

In [21]:
def make_batch(tokens_list: List[int], batch_size: int, word_to_id: Dict):
    batch = []
    positive = negative = 0
    
    while positive != batch_size/2 or negative != batch_size/2:
        sent_a_index, sent_b_index= random.randrange(len(tokens_list)), random.randrange(len(tokens_list))
        tokens_a, tokens_b= tokens_list[sent_a_index], tokens_list[sent_b_index]

        input_ids = [word_to_id['[CLS]']] + tokens_a + [word_to_id['[SEP]']] + tokens_b + [word_to_id['[SEP]']]
        segment_ids = [1] * (1 + len(tokens_a) + 1) + [2] * (len(tokens_b) + 1)

        # The following code is used for the Masked Language Modeling (MLM) task.
        n_pred =  min(MAX_PRED, max(1, int(round(len(input_ids) * 0.15)))) # Predict at most 15 % of tokens in one sentence
        masked_candidates_pos = [i for i, token in enumerate(input_ids)
                          if token != word_to_id['[CLS]'] and token != word_to_id['[SEP]']]
        random.shuffle(masked_candidates_pos)
        masked_tokens, masked_pos = [], []
        for pos in masked_candidates_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            ### START YOUR CODE ###
            # Throw a dice to decide if you want to replace the token with [MASK], random word, or remain the same
            dice = random.random()
            if dice < 0.8:
                input_ids[pos] = word_to_id['[MASK]']
            else:
                dice = random.random()
                if dice < 0.5:
                    dice = random.random()
                    input_ids[pos] = random.randint(0, VOCAB_SIZE-1)
            # Otherwise, keep the same token (10% of the time)
            ### END YOUR CODE ###

        # Make zero paddings
        n_pad = MAX_LEN - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero padding (100% - 15%) of thetokens
        if MAX_PRED > n_pred:
            n_pad = MAX_PRED - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        # The following code is used for the Next Sentence Prediction (NSP) task.
        ### START YOUR CODE ###
        # Decide if the is_next label is positive or negative, by comparing sent_a_index and sent_b_index
        # Don't forget to increment the positive/negative count
        # The following code is used for the Next Sentence Prediction (NSP) task.
        is_next = sent_a_index + 1 == sent_b_index
        if is_next and positive < batch_size // 2:
            positive += 1
            batch.append((input_ids, segment_ids,  masked_tokens, masked_pos,is_next))
        elif not is_next and negative < batch_size // 2:
            negative += 1
            batch.append((input_ids, segment_ids,  masked_tokens, masked_pos,is_next))

        ### END YOUR CODE ###

    return batch

In [22]:
random.seed(0)
batch = make_batch(tokens_list, batch_size, word_to_id)
input_ids, segment_ids, masked_tokens, masked_pos, is_next = map(torch.LongTensor, zip(*batch))

sample = 2

print('sampled text:')
print([id_to_word[w.item()] for w in input_ids[sample] if id_to_word[w.item()] != '[PAD]'])
print()
print('input_ids:', input_ids[sample])
print('segment_ids:', segment_ids[sample])
print('masked_tokens:', masked_tokens[sample])
print('masked_pos:', masked_pos[sample])
print('is_next:', is_next[sample].item())

sampled text:
['[CLS]', '三', '年', '[MASK]', '后', '，', '未', '尝', '[MASK]', '[MASK]', '牛', '也', '；', '[SEP]', '以', '汝', '为', '虫', '臂', '乎', '？', '[SEP]']

input_ids: tensor([   1,  537, 1324,    3,   99,  505,   63, 1237,    3,    3,  183,  675,
        1477,    2,  219,   91,  428,  492,  566, 1516,   53,    2,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
segment_ids: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
masked_tokens: tensor([ 469,  193, 1149,    0,    0])
masked_pos: tensor([9, 8, 3, 0, 0])
is_next: 0


### 2. Model Training

In [23]:
def get_pad_attn_mask(seq_q, seq_k):
    batch_size, seq_len = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) 
    return pad_attn_mask.expand(batch_size, seq_len, len_k)  

In [24]:
# Test

torch.manual_seed(0)
random.seed(0)
batch = make_batch(tokens_list, batch_size, word_to_id)
input_ids, segment_ids, masked_tokens, masked_pos, is_next = map(torch.LongTensor, zip(*batch))

enc_layer = EncoderLayer()
enc_self_attn_mask = get_pad_attn_mask(input_ids, input_ids)
embedding = Embedding(VOCAB_SIZE,MAX_LEN)
enc_inputs = embedding(input_ids, segment_ids)
enc_outputs, attn = enc_layer(enc_inputs, enc_self_attn_mask)

print('enc_outputs:', enc_outputs.size())
print('attn:', attn.size())



enc_outputs: torch.Size([6, 71, 768])
attn: torch.Size([6, 12, 71, 71])


In [25]:
# Test
torch.manual_seed(0)
random.seed(0)

batch = make_batch(tokens_list, batch_size, word_to_id)
input_ids, segment_ids, masked_tokens, masked_pos, is_next = map(torch.LongTensor, zip(*batch))

model = BERT(VOCAB_SIZE,MAX_LEN)
logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)

print('logits_lm:', logits_lm.size())
print('logits_clsf:', logits_clsf.size())



logits_lm: torch.Size([6, 5, 1529])
logits_clsf: torch.Size([6, 2])


In [26]:
random.seed(0)
torch.manual_seed(0)

model = BERT(VOCAB_SIZE,MAX_LEN)
criterion = nn.CrossEntropyLoss() # You can also try two separate losses for each task
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch(tokens_list, batch_size, word_to_id)
input_ids, segment_ids, masked_tokens, masked_pos, is_next = map(torch.LongTensor, zip(*batch))

for epoch in range(500):
    optimizer.zero_grad()

    ### START YOUR CODE ###
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)  # We need to transpose logits_lm to match the shape expected by CrossEntropyLoss
    loss_clsf = criterion(logits_clsf, is_next)
    loss = loss_lm + loss_clsf
    ### END YOUR CODE ###

    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

Epoch: 0010 cost = 21.685839
Epoch: 0020 cost = 12.189532
Epoch: 0030 cost = 7.815399
Epoch: 0040 cost = 4.595700
Epoch: 0050 cost = 2.177538
Epoch: 0060 cost = 1.900656
Epoch: 0070 cost = 1.496390
Epoch: 0080 cost = 2.591231
Epoch: 0090 cost = 1.242195
Epoch: 0100 cost = 1.395167
Epoch: 0110 cost = 1.195192
Epoch: 0120 cost = 1.121188
Epoch: 0130 cost = 1.157933
Epoch: 0140 cost = 1.079421
Epoch: 0150 cost = 1.139462
Epoch: 0160 cost = 1.341852
Epoch: 0170 cost = 1.196683
Epoch: 0180 cost = 1.121397
Epoch: 0190 cost = 1.064820
Epoch: 0200 cost = 1.076007
Epoch: 0210 cost = 1.186879
Epoch: 0220 cost = 0.971573
Epoch: 0230 cost = 1.113909
Epoch: 0240 cost = 0.989813
Epoch: 0250 cost = 1.022099
Epoch: 0260 cost = 0.941513
Epoch: 0270 cost = 1.040421
Epoch: 0280 cost = 0.977089
Epoch: 0290 cost = 0.885053
Epoch: 0300 cost = 1.035623
Epoch: 0310 cost = 1.140220
Epoch: 0320 cost = 1.000193
Epoch: 0330 cost = 0.927826
Epoch: 0340 cost = 0.926857
Epoch: 0350 cost = 1.012740
Epoch: 0360 cost =

In [27]:
model_path = "model.pth"
import torch

torch.save(model.state_dict(), model_path)  

In [28]:
# model =BERT(VOCAB_SIZE,MAX_LEN) 
# state_dict = torch.load(model_path)
# model.load_state_dict(state_dict)  
# model.eval()


### 3. Evaluation

In [29]:
def parse_line(line):
    parts = line.strip().split('\t')
    
    if len(parts) == 3:
        text, is_next, ground_truth = parts  
        
        segments = text.split('[SEP]')
        
        if len(segments) == 3:
            sentence_a = segments[0].replace('[CLS]', '').strip()
            sentence_b = segments[1].strip()
            is_next = int(is_next)
            ground_truth_tokens = ground_truth.strip()
            
            return sentence_a, sentence_b, is_next, ground_truth_tokens


def tokenize_sentence(sentence, word_to_id):
    tokens = [word_to_id.get(word, 0) for word in sentence.split()]
    return tokens

def read_and_parse_data(file_path, word_to_id):
    parsed_data = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines() 
    
    for line in lines:
        sentence_a, sentence_b, is_next, ground_truth_words = parse_line(line)
        
        tokens_a = tokenize_sentence(sentence_a, word_to_id)
        tokens_b = tokenize_sentence(sentence_b, word_to_id)
        ground_truth_tokens= tokenize_sentence(ground_truth_words, word_to_id)
        
        parsed_data.append({
            'tokens_a': tokens_a,
            'tokens_b': tokens_b,
            'is_next': is_next,
            'ground_truth_tokens': ground_truth_tokens,
        })
    
    return parsed_data




In [30]:
def evaluate_model(model, parsed_data, word_to_id):
    results = []
    nsp_correct_count = 0 
    mlm_correct_count=0
    total_mask_words=0
    total_samples = len(parsed_data)  
    
    for data in parsed_data:
        tokens_a = data['tokens_a']
        tokens_b = data['tokens_b']
        ground_truth_tokens = data['ground_truth_tokens']
        is_next = data['is_next']

        
        input_ids = [word_to_id['[CLS]']] + tokens_a + [word_to_id['[SEP]']] + tokens_b + [word_to_id['[SEP]']]
        segment_ids = [1] * (len(tokens_a) + 2) + [2] * (len(tokens_b) + 1)  # Segment IDs
        masked_pos = [i for i, token in enumerate(input_ids) if token == word_to_id.get('[MASK]', 0)]
        
        input_ids_tensor = torch.LongTensor([input_ids])
        segment_ids_tensor = torch.LongTensor([segment_ids])
        masked_pos_tensor = torch.LongTensor([masked_pos])

        logits_lm, logits_clsf = model(input_ids_tensor, segment_ids_tensor, masked_pos_tensor)

        # MLM predictions
        predicted_ids = logits_lm.argmax(dim=2).squeeze().data.numpy()
        
        # print('masked tokens ground truth: ',[id_to_word[pos] for pos in ground_truth_tokens])
        # print('predicted masked tokens: ',[id_to_word[pos] for pos in predicted_ids])

        # Calculate MLM accuracy
        mlm_correct_predictions=0
        for i in range(len(predicted_ids)):
            if predicted_ids[i]==ground_truth_tokens[i]:
                mlm_correct_predictions+=1
        mlm_correct_count+=mlm_correct_predictions
        total_mask_words+=len(predicted_ids)
        mlm_accuracy=mlm_correct_predictions/len(predicted_ids)

        # NSP prediction
        predicted_is_next = logits_clsf.argmax(dim=1).data.numpy()[0] 
        nsp_correct = (predicted_is_next == is_next)
        if nsp_correct:
            nsp_correct_count += 1
        results.append({
            "mlm_accuracy": mlm_accuracy,
            "nsp_correct": nsp_correct,
        })
      # Calculate overall MLM and NSP results
    overall_mlm_accuracy = mlm_correct_count / total_mask_words  
    overall_nsp_accuracy = nsp_correct_count / total_samples  

        
    
    return results,overall_mlm_accuracy,overall_nsp_accuracy


In [31]:
test_data = read_and_parse_data("test.pairs.txt", word_to_id)

print(test_data)

[{'tokens_a': [439, 3, 1095, 1458, 1367, 217, 691, 1061, 1294, 1512, 505, 439, 3, 3, 771, 1343, 217, 691, 1061, 236, 496, 289], 'tokens_b': [1210, 1149, 887, 505, 3, 299, 691, 82, 289], 'is_next': 1, 'ground_truth_tokens': [1405, 907, 762, 653]}, {'tokens_a': [148, 247, 1149, 304, 505, 1405, 3, 1200, 910, 505, 645, 694, 51, 512, 675, 3], 'tokens_b': [1311, 247, 1255, 355, 3, 311, 691, 193, 1073, 743, 1516, 53], 'is_next': 1, 'ground_truth_tokens': [217, 289, 247]}, {'tokens_a': [1355, 1354, 217, 627, 505, 219, 327, 3, 299, 1477], 'tokens_b': [547, 104, 1093, 258, 505, 3, 10, 979, 1329, 1477], 'is_next': 1, 'ground_truth_tokens': [1183, 691]}, {'tokens_a': [239, 51, 1516, 1389, 299, 178, 3, 505, 1140, 51, 1516, 1389, 13, 505, 1456, 126, 178, 1149, 3], 'tokens_b': [239, 1075, 1516, 1389, 3, 463, 299, 178, 1149, 3, 1140, 1075, 1516, 1389, 900, 463, 3, 505, 1456, 126, 178, 1149, 53], 'is_next': 1, 'ground_truth_tokens': [1149, 53, 900, 505, 13]}, {'tokens_a': [239, 51, 1516, 3, 3, 463, 299

In [32]:


results,overall_mlm_accuracy ,overall_nsp_accuracy= evaluate_model(model, test_data, word_to_id)
print(f"Overall MLM Accuracy: {overall_mlm_accuracy:.2%}")
print(f"Overall NSP Accuracy: {overall_nsp_accuracy:.2%}")
# Display the results
for i, res in enumerate(results):
    print(f"Sample {i + 1}:")
    print(f"MLM Accuracy: {res['mlm_accuracy']:.2%}")
    print(f"NSP Correct: {'Yes' if res['nsp_correct'] else 'No'}")


Overall MLM Accuracy: 0.57%
Overall NSP Accuracy: 54.39%
Sample 1:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 2:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 3:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 4:
MLM Accuracy: 0.00%
NSP Correct: No
Sample 5:
MLM Accuracy: 0.00%
NSP Correct: No
Sample 6:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 7:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 8:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 9:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 10:
MLM Accuracy: 0.00%
NSP Correct: No
Sample 11:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 12:
MLM Accuracy: 50.00%
NSP Correct: Yes
Sample 13:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 14:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 15:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 16:
MLM Accuracy: 0.00%
NSP Correct: No
Sample 17:
MLM Accuracy: 0.00%
NSP Correct: No
Sample 18:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 19:
MLM Accuracy: 0.00%
NSP Correct: Yes
Sample 20:
MLM Accuracy: 0.00%
NSP Correct: 