In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/HW3DL/D3')


! ls
!pip install packaging==21.3
!pip install transformers==4.5.0

from transformers import AdamW, BertTokenizerFast, BertForQuestionAnswering

# model_name can be one of models in huggingface model hub 

model_name = 'bert-base-chinese'
model = BertForQuestionAnswering.from_pretrained(model_name)
eng_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

eng_paragraph = 'professor feng Deep Learning class'

tokens = eng_tokenizer.tokenize(eng_paragraph)
print(tokens)
eng_tokenizer.convert_tokens_to_ids(tokens)

question = 'Who established Apple?'
paragraph = 'Apple Computers, Inc. was founded on April 1, 1976, by college dropouts Steve Jobs and Steve Wozniak'

encoded = eng_tokenizer.encode(question, paragraph)
decoded = eng_tokenizer.decode(encoded)
print(encoded)
print(decoded)

inputs = eng_tokenizer(question, paragraph, return_tensors='pt') 

print('##################')
# Indices of input sequence tokens in the vocabulary
print('Input ids:      ', inputs['input_ids'])
print('##################')
# Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
print('Token type ids: ', inputs['token_type_ids'])
print('##################')
# Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
print('Attention mask: ', inputs['attention_mask'])

output = model(**inputs, start_positions=torch.tensor([14]), end_positions=torch.tensor([19]))
print("loss: ", output.loss)

optimizer = AdamW(model.parameters(), lr=1e-4)
output.loss.backward()
optimizer.step()

question = 'when  Apple was founded?'
paragraph = 'Apple Computers, Inc. was founded on April 1, 1976, by college dropouts Steve Jobs and Steve Wozniak'

inputs = eng_tokenizer(question, paragraph, return_tensors='pt')

with torch.no_grad():
    output = model(**inputs)

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

start = torch.argmax(output.start_logits)
end = torch.argmax(output.end_logits)
print("start position: ", start.item())
print("end position:   ", end.item())

predict_id = inputs['input_ids'][0][start : end + 1]
print("predict_id:     ", predict_id)

predict_answer = eng_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

In [None]:
import math
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

from tqdm.auto import tqdm

# Use the default CUDA device when one is available, otherwise the CPU.
# A hard-coded index (previously 1) is not valid on a single-GPU runtime, and the
# fallback is now a torch.device as well so both branches have the same type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def same_seeds(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also sets the cuDNN flags ``deterministic=True`` and ``benchmark=False``.
    """
    # Python / NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch CPU generator, plus the CUDA generators when a GPU is present.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_cuda(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

same_seeds(0)    # Fix random seed for reproducibility

# Change "fp16_training" to True to support automatic mixed precision training (fp16)

fp16_training = True

# When fp16 is enabled, accelerate is installed on the spot and the module-level
# `device` is replaced by the device chosen by the Accelerator.
if fp16_training:
    %pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device
    
# Model and tokenizer are loaded from the same "bert-base-chinese" checkpoint;
# the model is moved to `device` right away.
model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")


def read_data(file):
    """Parse the UTF-8 JSON file at ``file`` and return ``(questions, paragraphs)``.

    The two values are the ``"questions"`` and ``"paragraphs"`` entries of the
    top-level JSON object.
    """
    with open(file, mode='r', encoding="utf-8") as handle:
        payload = json.loads(handle.read())
    questions, paragraphs = payload["questions"], payload["paragraphs"]
    return questions, paragraphs

# Read the three splits in order: train, dev, test.
(train_questions, train_paragraphs), (dev_questions, dev_paragraphs), (test_questions, test_paragraphs) = [
    read_data(json_path) for json_path in ("train.json", "dev.json", "test.json")
]

print('train_questions : ', len(train_questions))
print('dev_questions   : ', len(dev_questions))
print('test_questions  : ', len(test_questions))

# Questions and paragraphs are tokenized separately and without special tokens;
# [CLS]/[SEP] are added later when the model inputs are assembled.
train_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in train_questions],
    add_special_tokens=False,
)
dev_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in dev_questions],
    add_special_tokens=False,
)
test_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in test_questions],
    add_special_tokens=False,
)

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

# Placeholder; overwritten with the real stride whenever a QA_Dataset is built.
DOC_STRIDE = None

class QA_Dataset(Dataset):
    """Builds fixed-length BERT inputs from pre-tokenized questions and paragraphs.

    For ``split == "train"`` each item is a single paragraph window that contains
    the answer, together with the answer's start/end positions inside the
    assembled input. For any other split each item is the stack of all windows
    obtained by sliding over the paragraph with step ``doc_stride``.

    Args:
        split: ``"train"`` selects the training behaviour; anything else selects
            the sliding-window (validation/test) behaviour.
        questions: sequence of dicts; ``"paragraph_id"`` is always read, and
            ``"answer_start"`` / ``"answer_end"`` (character offsets) are read
            for the training split.
        tokenized_questions: indexable by question index; each entry exposes ``.ids``.
        tokenized_paragraphs: indexable by paragraph id; each entry exposes
            ``.ids``, ``len()`` and ``char_to_token``.

    Side effect: the constructor writes ``doc_stride`` to the module-level
    ``DOC_STRIDE`` so that post-processing code can read it.
    """

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 350

        ##### Change value of doc_stride #####
        self.doc_stride = int(0.9 * self.max_paragraph_len)

        # Publish the stride at module level for the post-processing code.
        global DOC_STRIDE
        DOC_STRIDE = self.doc_stride

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Number of questions in this split."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the model inputs for question ``idx``.

        Training split: ``(input_ids, token_type_ids, attention_mask,
        answer_start_token, answer_end_token)`` where the three tensors have
        length ``max_seq_len`` and the two positions index into ``input_ids``.

        Other splits: ``(input_ids, token_type_ids, attention_mask)``, each of
        shape ``(num_windows, max_seq_len)``.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### Preprocessing #####

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            # NOTE(review): char_to_token can presumably return None for characters
            # that map to no token; that case is not handled here - confirm the data.
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer.
            # The window is [mid - prefix_len, mid + postfix_len); prefix_len is drawn at
            # random, but only from the range that keeps the WHOLE answer inside the
            # window. An unconstrained draw could start the window after the answer start
            # (or end it before the answer end), which makes the returned labels point
            # at question or padding tokens.
            mid = (answer_start_token + answer_end_token) // 2
            min_prefix = mid - answer_start_token
            max_prefix = self.max_paragraph_len - (answer_end_token - mid + 1)
            if max_prefix >= min_prefix:
                prefix_len = min_prefix + int(random.random() * (max_prefix - min_prefix + 1))
            else:
                # Answer is longer than the window: centre the window on it.
                prefix_len = self.max_paragraph_len // 2
            postfix_len = self.max_paragraph_len - prefix_len
            paragraph_start, paragraph_end = mid - prefix_len, mid + postfix_len
            if paragraph_start < 0:
                # Shift the window right so that it starts at the first token.
                paragraph_end -= paragraph_start
                paragraph_start = 0
            # paragraph_end is an exclusive slice bound, so the largest valid value is
            # len(tokenized_paragraph); clipping to len - 1 would drop the last token.
            paragraph_end = min(paragraph_end, len(tokenized_paragraph))

            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation
        # Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

            # The question part is identical for every window, so build it once.
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]

            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):

                # Slice paragraph and add special token (102: SEP)
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Concatenate question and paragraph ids and right-pad to ``max_seq_len``.

        Returns ``(input_ids, token_type_ids, attention_mask)`` as Python lists:
        ids are padded with 0, token types are 0 for the question part, 1 for the
        paragraph part and 0 for padding, and the mask is 1 on real tokens only.
        """
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask

# One dataset per split, built in train -> dev -> test order.
train_set = QA_Dataset(
    split="train",
    questions=train_questions,
    tokenized_questions=train_questions_tokenized,
    tokenized_paragraphs=train_paragraphs_tokenized,
)
dev_set = QA_Dataset(
    split="dev",
    questions=dev_questions,
    tokenized_questions=dev_questions_tokenized,
    tokenized_paragraphs=dev_paragraphs_tokenized,
)
test_set = QA_Dataset(
    split="test",
    questions=test_questions,
    tokenized_questions=test_questions_tokenized,
    tokenized_paragraphs=test_paragraphs_tokenized,
)

train_batch_size = 4

# Training batches are shuffled; dev/test are read one question at a time, in
# order, because each of their items is a variable-length stack of windows.
train_loader = DataLoader(dataset=train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dataset=dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(dataset=test_set, batch_size=1, shuffle=False, pin_memory=True)

In [None]:
def _token_span_to_char_span(paragraph, paragraph_tokenized, start_token, end_token):
    """Map a token span of ``paragraph_tokenized`` to character offsets in ``paragraph``.

    Walks the tokens while advancing a character cursor through the raw text:
    special tokens ('[UNK]', '[CLS]', '[SEP]') consume one character, '#'
    characters inside a token consume nothing, and every other character moves
    the cursor to just past its next occurrence in ``paragraph``.

    Returns ``(new_start, new_end)``; both default to 0 if the requested tokens
    are never reached.
    """
    char_count = 0
    start_flag = False
    new_start = new_end = 0
    last_needed = max(start_token, end_token)

    for i, token in enumerate(paragraph_tokenized):
        if i > last_needed:
            # Both offsets are final once the last requested token has been processed.
            break
        if token in ('[UNK]', '[CLS]', '[SEP]'):
            if i == start_token:
                new_start = char_count
            if i == end_token:
                new_end = char_count
            char_count += 1
        else:
            for char in token:
                if i == start_token and not start_flag:
                    new_start = char_count
                    start_flag = True
                if i == end_token:
                    new_end = char_count
                if char == "#":
                    continue
                while char_count < len(paragraph) and char != paragraph[char_count]:
                    char_count += 1
                char_count += 1

    return new_start, new_end


def evaluate(data, output, paragraph, paragraph_tokenized):
    """Pick the best answer span over all windows of one question and return its text.

    Args:
        data: ``(input_ids, token_type_ids, attention_mask)`` for a single
            question; each is indexed as ``[0][k]`` for window ``k`` and
            ``data[0].shape[1]`` is the number of windows.
        output: model output exposing ``start_logits[k]`` and ``end_logits[k]``.
        paragraph: raw paragraph text, used to recover characters behind '[UNK]'.
        paragraph_tokenized: the paragraph's token strings.

    Returns:
        The decoded answer with spaces removed. If the decoded text contains
        '[UNK]', it is replaced by the matching slice of ``paragraph``. A
        trailing "」" is dropped when the answer has no matching "「".

    Reads the module-level ``device``, ``DOC_STRIDE`` and ``tokenizer``.
    """
    ##### Postprocessing #####

    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    paragraph_start_index = 0
    paragraph_end_index = 0

    for k in range(num_of_windows):

        # Keep only the paragraph segment: token type 1 and not padding. The last
        # selected position is the closing [SEP], which the [:-1] slices drop.
        mask = (data[1][0][k].bool() & data[2][0][k].bool()).to(device)

        masked_output_start = torch.masked_select(output.start_logits[k], mask)[:-1]
        start_prob, start_index = torch.max(masked_output_start, dim=0)

        # The end position is searched only at or after the chosen start.
        masked_output_end = torch.masked_select(output.end_logits[k], mask)[start_index: -1]
        end_prob, end_index = torch.max(masked_output_end, dim=0)
        end_index += start_index

        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob

        # Replace answer if calculated probability is larger than previous windows
        if (prob > max_prob) and (start_index <= end_index <= (start_index + 50)):
            max_prob = prob
            # Window k starts DOC_STRIDE * k tokens into the paragraph.
            paragraph_start_index = start_index.item() + (DOC_STRIDE * k)
            paragraph_end_index = end_index.item() + (DOC_STRIDE * k)
            masked_data = torch.masked_select(data[0][0][k].to(device), mask)[:-1]
            answer = tokenizer.decode(masked_data[start_index : end_index + 1])

    if "[UNK]" in answer:
        # The character alignment is only needed here, so it is computed lazily
        # instead of scanning the paragraph on every call.
        new_start, new_end = _token_span_to_char_span(
            paragraph, paragraph_tokenized, paragraph_start_index, paragraph_end_index
        )
        print(f"original answer: {answer}")
        answer = paragraph[new_start: new_end+1]
        print(f"corrected answer: {answer}")
        print("-"*50)

    answer = answer.replace(' ', '')

    if len(answer) > 1:
        if "「" not in answer and answer[-1] == "」":
            answer = answer[:-1]
    return answer

# Training hyper-parameters: epochs, validate after each epoch?, batches per
# log line, and the initial learning rate.
num_epoch, validation, logging_step, learning_rate = 5, True, 500, 5e-6

optimizer = AdamW(params=model.parameters(), lr=learning_rate)

##### Apply linear learning rate decay #####
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; there is no warm-up phase.
total_steps = num_epoch * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# With fp16 enabled the accelerator wraps model, optimizer and training loader.
if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

model.train()

print("Start Training ...")

for epoch in range(num_epoch):
    # Number of batches processed in this epoch. Starting at 0 (not 1) makes every
    # logged window cover exactly `logging_step` batches, matching the divisor below.
    step = 0
    train_loss = train_acc = 0
    
    for batch_idx, data in enumerate(tqdm(train_loader)):
        # Load all data into GPU
        data = [i.to(device) for i in data]
         
        # Passing start/end positions makes the model return the training loss.
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])

        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        # Prediction is correct only if both start_index and end_index are correct
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        # Accumulate a detached copy: summing the live loss tensor would chain the
        # autograd history of every batch in the logging window together.
        train_loss += output.loss.detach()

        if fp16_training:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()

        ##### Apply linear learning rate decay #####
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        step += 1
        ##################################################
        
        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
            train_loss = train_acc = 0
            
    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                # The loader adds a batch axis of size 1; squeezing it leaves
                # (num_windows, seq_len), i.e. the windows become the batch.
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output, dev_paragraphs[dev_questions[i]['paragraph_id']], dev_paragraphs_tokenized[dev_questions[i]['paragraph_id']].tokens) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        # Back to training mode for the next epoch.
        model.train()


##################################################

# Save a model 
print("Saving Model ...")
model_save_dir = "./models/saved_model" 
model.save_pretrained(model_save_dir)

# Final pass over the dev set that also prints every mismatch for error analysis.
model.eval()
with torch.no_grad():
    dev_acc = 0
    for i, data in enumerate(tqdm(dev_loader)):
        # Squeeze away the loader's batch axis so the windows form the batch.
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
               attention_mask=data[2].squeeze(dim=0).to(device))
        # prediction is correct only if answer text exactly matches
        pred_answer = evaluate(data, output, dev_paragraphs[dev_questions[i]['paragraph_id']], dev_paragraphs_tokenized[dev_questions[i]['paragraph_id']].tokens)
        true_answer = dev_questions[i]["answer_text"]
        dev_acc += (pred_answer == true_answer)
        if pred_answer != true_answer:
            print("*"*50)
            print(f"correct answer: {true_answer}")
            print(f"predict answer: {pred_answer}")
            print("*"*50)
    # Reported once; the same line used to be printed a second time after the block.
    print(f"Validation | acc = {dev_acc / len(dev_loader):.3f}")
model.train()

print("Evaluating Test Set ...")

# Predicted answer text for every test question, in test_questions order.
result = []

model.eval()
with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader)):
        # Squeeze away the loader's batch axis so the windows form the batch.
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output, test_paragraphs[test_questions[i]['paragraph_id']], test_paragraphs_tokenized[test_questions[i]['paragraph_id']].tokens))

result_file = "./result.csv"
# The answers are Chinese text, so the encoding is fixed to UTF-8 (the same
# encoding read_data uses) instead of depending on the platform default.
with open(result_file, 'w', encoding="utf-8") as f:
    f.write("ID,Answer\n")
    for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")

In [None]:
def process_answer(answer, paragraph, paragraph_tokenized):
    """Select the best answer span across windows and return its cleaned-up text.

    The span with the highest start+end score over all windows is decoded; if
    the decoded text contains '[UNK]' it is replaced by a slice of ``paragraph``
    found by walking ``paragraph_tokenized``. Spaces are removed and an
    unmatched trailing "」" is dropped.

    NOTE(review): the ``answer`` parameter is never read in the body.
    NOTE(review): ``data``, ``output``, ``device_custom``, ``DOC_STRIDE_CUSTOM``
    and ``tokenizer_custom`` are neither parameters nor assigned here, so they
    are looked up as globals at call time; none of them is defined in the part
    of the notebook visible here - confirm they exist before calling.
    """
    ##### Postprocessing #####
    answer_text = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    
    paragraph_start_index = 0
    paragraph_end_index = 0
    
    for k in range(num_of_windows):

        # Positions where both data[1] and data[2] are non-zero for window k.
        mask = (data[1][0][k].bool() & data[2][0][k].bool()).to(device_custom)
    
        masked_output_start = torch.masked_select(output.start_logits[k], mask)
        # Drop the last selected position before taking the maximum.
        masked_output_start = masked_output_start[:-1]
        
        start_prob, start_index = torch.max(masked_output_start, dim=0)
        
        masked_output_end = torch.masked_select(output.end_logits[k], mask)
        # The end position is searched only at or after the chosen start.
        masked_output_end = masked_output_end[start_index: -1]
        
        end_prob, end_index = torch.max(masked_output_end, dim=0)
        
        # end_index was relative to the slice above; make it relative to the mask.
        end_index += start_index
        
        prob = start_prob + end_prob
        masked_data = torch.masked_select(data[0][0][k].to(device_custom), mask)[:-1]
        
        # Keep the best-scoring span so far, limited to at most 51 tokens.
        if (prob > max_prob) and (start_index <= end_index <= (start_index + 50)):
            max_prob = prob
            # Offset window-local positions by the window's stride offset.
            paragraph_start_index = start_index.item() + (DOC_STRIDE_CUSTOM * k)
            paragraph_end_index = end_index.item() + (DOC_STRIDE_CUSTOM * k)
            answer_text = tokenizer_custom.decode(masked_data[start_index : end_index + 1])

    # Walk the tokens while advancing a character cursor through `paragraph` to
    # find the character offsets of the selected start/end tokens.
    char_count = 0
    start_flag = False

    for i, token in enumerate(paragraph_tokenized):
        if token in ('[UNK]', '[CLS]', '[SEP]'):
            # Special tokens advance the cursor by exactly one character.
            if i == paragraph_start_index:
                new_start = char_count
            if i == paragraph_end_index:
                new_end = char_count
            char_count += 1
        else:
            for char in token:
                if i == paragraph_start_index and not start_flag:
                    new_start = char_count
                    start_flag = True
                if i == paragraph_end_index:
                    new_end = char_count
                # '#' characters inside a token do not consume paragraph text.
                if char == "#":
                    continue
                else:
                    # Skip paragraph characters until the token character is found.
                    while char_count < len(paragraph) and char != paragraph[char_count]:
                        char_count += 1
                    char_count += 1
     
    # Recover the original characters when the decoded text contains '[UNK]'.
    if "[UNK]" in answer_text:
        print(f"original answer: {answer_text}")
        answer_text = paragraph[new_start: new_end+1]
        print(f"corrected answer: {answer_text}")
        print("-"*50)

    answer_text = answer_text.replace(' ', '')
    
    # Drop a closing bracket that has no opening counterpart in the answer.
    if len(answer_text) > 1:
        if "「" not in answer_text and answer_text[-1] == "」":
            answer_text = answer_text[:-1]
    return answer_text


In [None]:
# Hyper-parameters of the "custom" training run.
# NOTE(review): model_custom, fp16_training_custom, accelerator_custom and
# train_loader_custom are not defined in the visible part of the notebook -
# confirm that an earlier cell creates them.
num_epoch_custom = 5  
validation_custom = True  
logging_step_custom = 500
learning_rate_custom = 5e-6

optimizer_custom = AdamW(model_custom.parameters(), lr=learning_rate_custom)

# The schedule length must come from the loader that is actually iterated during
# training (train_loader_custom), not from the base run's train_loader.
total_steps_custom = len(train_loader_custom) * num_epoch_custom

scheduler_custom = get_linear_schedule_with_warmup(optimizer_custom, num_warmup_steps=0, num_training_steps=total_steps_custom)

# With fp16 enabled the accelerator wraps model, optimizer and training loader.
if fp16_training_custom:
    model_custom, optimizer_custom, train_loader_custom = accelerator_custom.prepare(model_custom, optimizer_custom, train_loader_custom) 

model_custom.train()

print("Start Training ...")

for epoch in range(num_epoch_custom):
    # Number of batches processed in this epoch. Starting at 0 (not 1) makes every
    # logged window cover exactly `logging_step_custom` batches, matching the divisor.
    step = 0
    train_loss = train_acc = 0
    
    for batch_idx, data in enumerate(tqdm(train_loader_custom)):
        data = [i.to(device_custom) for i in data]
        
        # Passing start/end positions makes the model return the training loss.
        output = model_custom(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])

        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        # A prediction counts as correct only if both positions match.
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        # Accumulate a detached copy: summing the live loss tensor would chain the
        # autograd history of every batch in the logging window together.
        train_loss += output.loss.detach()

        if fp16_training_custom:
            accelerator_custom.backward(output.loss)
        else:
            output.loss.backward()

        optimizer_custom.step()
        scheduler_custom.step()
        optimizer_custom.zero_grad()
        step += 1
        
        if step % logging_step_custom == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step_custom:.3f}, acc = {train_acc / logging_step_custom:.3f}")
            train_loss = train_acc = 0
            
    if validation_custom:
        print("Evaluating Dev Set ...")
        model_custom.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader_custom)):
                # Squeeze away the loader's batch axis so the windows form the batch.
                output = model_custom(input_ids=data[0].squeeze(dim=0).to(device_custom), token_type_ids=data[1].squeeze(dim=0).to(device_custom),
                       attention_mask=data[2].squeeze(dim=0).to(device_custom))
                # NOTE(review): evaluate() reads the base run's device / DOC_STRIDE /
                # tokenizer globals, not the *_custom ones - confirm this is intended.
                dev_acc += evaluate(data, output, dev_paragraphs_custom[dev_questions_custom[i]['paragraph_id']], dev_paragraphs_tokenized_custom[dev_questions_custom[i]['paragraph_id']].tokens) == dev_questions_custom[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader_custom):.3f}")
        # Back to training mode for the next epoch.
        model_custom.train()


In [None]:
print("Here we are saving the model")
model_save_dir_custom = "./model/saved" 
model_custom.save_pretrained(model_save_dir_custom)

# Final pass over the dev set that also prints every mismatch for error analysis.
model_custom.eval()
with torch.no_grad():
    dev_acc = 0
    for i, data in enumerate(tqdm(dev_loader_custom)):
        # Squeeze away the loader's batch axis so the windows form the batch.
        output = model_custom(input_ids=data[0].squeeze(dim=0).to(device_custom), token_type_ids=data[1].squeeze(dim=0).to(device_custom),
               attention_mask=data[2].squeeze(dim=0).to(device_custom))
        # NOTE(review): evaluate() reads the base run's device / DOC_STRIDE /
        # tokenizer globals, not the *_custom ones - confirm this is intended.
        pred_answer = evaluate(data, output, dev_paragraphs_custom[dev_questions_custom[i]['paragraph_id']], dev_paragraphs_tokenized_custom[dev_questions_custom[i]['paragraph_id']].tokens)
        true_answer = dev_questions_custom[i]["answer_text"]
        dev_acc += (pred_answer == true_answer)
        if pred_answer != true_answer:
            print("*"*50)
            print(f"correct answer: {true_answer}")
            print(f"predict answer: {pred_answer}")
            print("*"*50)
    # Reported once; the same line used to be printed a second time after the block.
    print(f"Validation | acc = {dev_acc / len(dev_loader_custom):.3f}")
model_custom.train()

print("Evaluating Test Set ...")

result_custom = []

model_custom.eval()
with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader_custom)):
        output = model_custom(input_ids=data[0].squeeze(dim=0).to(device_custom), token_type_ids=data[1].squeeze(dim=0).to
