### **Part1** Extracting features from the SQuAD dataset

In [1]:
import os
 
# Set the path to the pkl file
pkl_file_path = 'training_features.pkl'
 
# Check if the file exists
if os.path.isfile(pkl_file_path):
    os.remove(pkl_file_path)
    print(f"The file {pkl_file_path} has been deleted successfully.")
else:
    print(f"The file {pkl_file_path} does not exist.")

The file training_features.pkl has been deleted successfully.


In [2]:
import pickle
from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features
from transformers import BertTokenizer
 
# Initialize SQuAD Processor, Dataset, and Tokenizer
processor = SquadV2Processor()
train_examples = processor.get_train_examples('train')
tokenizer = BertTokenizer.from_pretrained('uncased')
 
# Convert SQUAD 2.0 training dataset to BERT input features
train_features = squad_convert_examples_to_features(
    examples=train_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset=False,
    threads=1
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 442/442 [00:20<00:00, 21.35it/s]
convert squad examples to features: 100%|██████████| 130319/130319 [07:28<00:00, 290.30it/s]
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 1964891.70it/s]


In [3]:
 # Save features to disk
with open('training_features.pkl', 'wb') as f:
    pickle.dump(train_features, f)

### **Part2** Load pre-trained models

In [38]:
from transformers import BertForQuestionAnswering, BertTokenizer, BertForQuestionAnswering, AdamW
import torch
from torch.utils.data import TensorDataset
 
# use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
# Using BERT-Uncased model downloaded from Hugging Face
tokenizer = BertTokenizer.from_pretrained('uncased')
model = BertForQuestionAnswering.from_pretrained('uncased').to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Evaluate the performance of BERT before fine-tune.
def shenzhen_population():
    question, text = "What is the population of Shenzhen? ", "The population of Shenzhen is approximately 13 million."
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs.to(device))
    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1
    predict_answer_tokens = inputs['input_ids'][0][answer_start_index:answer_end_index]
    predicted_answer = tokenizer.decode(predict_answer_tokens)

    print("What is the population of Shenzhen?", predicted_answer)

shenzhen_population() 

What is the population of Shenzhen? 


### **Part3** Prepare trainning data 

In [40]:
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers.data.processors.squad import SquadV2Processor, SquadExample, squad_convert_examples_to_features
 
# Load features of the SQuAD 2.0 dataset
import pickle
with open('training_features.pkl', 'rb') as f:
    train_features = pickle.load(f)
# Define hyperparameters
train_batch_size = 8
num_epochs = 3
learning_rate = 3e-5
 
# Convert features into tensors
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
 
train_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_start_positions, all_end_positions)
num_samples = 100
train_dataset = TensorDataset(
    all_input_ids[:num_samples], 
    all_attention_mask[:num_samples], 
    all_token_type_ids[:num_samples], 
    all_start_positions[:num_samples], 
    all_end_positions[:num_samples])
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

In [41]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x1ace0706350>

### **Part4** Fine-tune Bert with feature dataset

In [58]:
# define model and optimizer
model = BertForQuestionAnswering.from_pretrained('uncased').to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# fine-tune bert 
for epoch in range(5):
    for step, batch in enumerate(train_dataloader):
        model.train()
        optimizer.zero_grad()
        input_ids, attention_mask, token_type_ids, start_positions, end_positions = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask, 
                        token_type_ids=token_type_ids, 
                        start_positions=start_positions, 
                        end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
 
        # Print the training loss every 500 steps
        if step % 5 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

# save the model after trainning
model.save_pretrained("SQuAD_finetuned_bert")            

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/3], Step [1/13], Loss: 5.8802
Epoch [1/3], Step [6/13], Loss: 5.2648
Epoch [1/3], Step [11/13], Loss: 4.3755
Epoch [2/3], Step [1/13], Loss: 3.9924
Epoch [2/3], Step [6/13], Loss: 4.0794
Epoch [2/3], Step [11/13], Loss: 3.1915
Epoch [3/3], Step [1/13], Loss: 2.9590
Epoch [3/3], Step [6/13], Loss: 3.0439
Epoch [3/3], Step [11/13], Loss: 2.8346
Epoch [4/3], Step [1/13], Loss: 2.0298
Epoch [4/3], Step [6/13], Loss: 1.8817
Epoch [4/3], Step [11/13], Loss: 2.4617
Epoch [5/3], Step [1/13], Loss: 1.1852
Epoch [5/3], Step [6/13], Loss: 1.0643
Epoch [5/3], Step [11/13], Loss: 0.9030


In [59]:
'''
def shenzhen_population():
    question, text = "What is the population of Shenzhen? ", "The population of Shenzhen is approximately 13 million."
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs.to(device))
    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1
    predict_answer_tokens = inputs['input_ids'][0][answer_start_index:answer_end_index]
    predicted_answer = tokenizer.decode(predict_answer_tokens)

    print("What is the population of Shenzhen?", predicted_answer)
'''

# Expected answer: 13 million
shenzhen_population() 

What is the population of Shenzhen? 13 million


In [60]:
def HKU_established():
    question, text = "In which year was the University of Hong Kong established? ", "The University of Hong Kong was established in 1911."
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs.to(device))
    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1
    predict_answer_tokens = inputs['input_ids'][0][answer_start_index:answer_end_index]
    predicted_answer = tokenizer.decode(predict_answer_tokens)

    print("In which year was the University of Hong Kong established? ", predicted_answer)

In [61]:
# Expected answer: 1911
HKU_established()

In which year was the University of Hong Kong established?  1911


In [62]:
def Liming():
    question, text = "What is Li ming's favorite food? ", "Li ming's favorite food is noodles"
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs.to(device))
    answer_start_index = torch.argmax(outputs.start_logits)
    answer_end_index = torch.argmax(outputs.end_logits) + 1
    predict_answer_tokens = inputs['input_ids'][0][answer_start_index:answer_end_index]
    predicted_answer = tokenizer.decode(predict_answer_tokens)

    print("What is Li ming's favorite food? ", predicted_answer)

In [63]:
# Expected answer: Yes
Liming()

What is Li ming's favorite food?  li ming's favorite food is noodles
