In [8]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

False

In [2]:
import json

In [3]:
# Load the combined NewsQA dump. The encoding is pinned to UTF-8 so decoding
# does not depend on the platform default; the answer offsets read later are
# used as character indices into the decoded article text, so a mis-decoded
# file would shift them.
with open('combined-newsqa-data-v1.json', 'r', encoding='utf-8') as f:
    newsqa = json.load(f)

In [4]:
def split_train_dev_test(data_list):
    """Partition the records under ``data_list['data']`` by their ``'type'`` field.

    Args:
        data_list: mapping with a ``'data'`` entry holding an iterable of
            records; each record must expose a ``'type'`` key.

    Returns:
        A ``(train, dev, test)`` tuple of lists, each keeping the input order.

    Raises:
        Exception: when a record's type is not 'train', 'dev' or 'test'; the
            message is the offending type value.
    """
    train_list, dev_list, test_list = [], [], []
    routing = (('train', train_list), ('dev', dev_list), ('test', test_list))

    for record in data_list['data']:
        split_name = record['type']
        for label, bucket in routing:
            if split_name == label:
                bucket.append(record)
                break
        else:
            # No known split matched this record.
            raise Exception('%s' % split_name)

    return train_list, dev_list, test_list

# Partition the loaded NewsQA records by their 'type' field into train/dev/test lists.
train_news, dev_news, test_news = split_train_dev_test(newsqa)

In [5]:
# Number of records in each split, printed in train / dev / test order.
for _split in (train_news, dev_news, test_news):
    print(len(_split))

11469
638
637


In [6]:
def preprocess(data_list):
    """Flatten records into aligned context / question / answer-span lists.

    One output row is produced for every entry under
    ``record['questions'][*]['answers'][*]['sourcerAnswers']`` that carries
    both an ``'s'`` and an ``'e'`` value; entries missing either are skipped.

    Args:
        data_list: iterable of records with ``'text'`` and ``'questions'`` keys.

    Returns:
        ``(contexts, questions, answers)``: three equally long lists, where
        each answer is ``{'answer_start': s, 'answer_end': e}``.
    """
    contexts, questions, answers = [], [], []

    for article in data_list:
        story = article['text']
        for q_entry in article['questions']:
            q_text = q_entry['q']
            # Walk every span of every answer entry for this question.
            spans = (
                span
                for annotation in q_entry['answers']
                for span in annotation['sourcerAnswers']
            )
            for span in spans:
                s_idx, e_idx = span.get('s'), span.get('e')
                if s_idx is not None and e_idx is not None:
                    contexts.append(story)
                    questions.append(q_text)
                    answers.append({'answer_start': s_idx, 'answer_end': e_idx})

    return contexts, questions, answers

In [7]:
# Flatten the training records; the three lists are expected to have equal length.
train_contexts, train_questions, train_answers = preprocess(train_news)
for _column in (train_contexts, train_questions, train_answers):
    print(len(_column))

301577
301577
301577


In [8]:
# Flatten the dev records; the three lists are expected to have equal length.
dev_contexts, dev_questions, dev_answers = preprocess(dev_news)
for _column in (dev_contexts, dev_questions, dev_answers):
    print(len(_column))

16858
16858
16858


In [9]:
# Flatten the test records; the three lists are expected to have equal length.
test_contexts, test_questions, test_answers = preprocess(test_news)
for _column in (test_contexts, test_questions, test_answers):
    print(len(_column))

16640
16640
16640


In [10]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Every split is tokenized as (context, question) pairs with the same options.
_encode_opts = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_contexts, train_questions, **_encode_opts)
dev_encodings = tokenizer(dev_contexts, dev_questions, **_encode_opts)
test_encodings = tokenizer(test_contexts, test_questions, **_encode_opts)

In [11]:
def add_token_positions(encodings, answers, max_length=None):
    """Attach token-level answer labels to ``encodings`` in place.

    For example ``i`` the character offsets ``answers[i]['answer_start']`` and
    ``answers[i]['answer_end']`` are mapped to token indices with
    ``encodings.char_to_token`` and stored under ``'start_positions'`` and
    ``'end_positions'``.

    Args:
        encodings: object exposing ``char_to_token(batch_index, char_index)``
            and ``update(mapping)`` (e.g. the tokenizer output for one split).
        answers: sequence of dicts with ``'answer_start'`` / ``'answer_end'``.
        max_length: sentinel index stored when an offset cannot be mapped to a
            token. Defaults to ``tokenizer.model_max_length`` (the
            module-level tokenizer), which is what was always used before.

    Returns:
        None; ``encodings`` is updated in place.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']

        start_token = encodings.char_to_token(i, start_char)
        end_token = encodings.char_to_token(i, end_char)

        # char_to_token yields None for a character that belongs to no token.
        # For the end offset that also happens when it lands on the whitespace
        # right after the answer, so retry with the preceding character before
        # treating the answer as truncated. Never step back past the start.
        # NOTE(review): if 'answer_end' is an exclusive offset, mapping
        # end_char - 1 unconditionally may be more accurate; confirm against
        # the dataset documentation.
        if end_token is None and end_char > start_char:
            end_token = encodings.char_to_token(i, end_char - 1)

        # Still None: the answer passage has been truncated away.
        if start_token is None:
            start_token = max_length
        if end_token is None:
            end_token = max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Label every split in place with its token-level answer positions.
for _encodings, _answers in (
    (train_encodings, train_answers),
    (dev_encodings, dev_answers),
    (test_encodings, test_answers),
):
    add_token_positions(_encodings, _answers)

In [12]:
import torch

class NewsQADataset(torch.utils.data.Dataset):
    """Map-style dataset over one tokenized split.

    ``encodings`` is any mapping from field name to a per-example sequence
    (the tokenizer output, or a plain dict). It must contain an
    ``'input_ids'`` entry, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        # Key access (not attribute access) matches the mapping interface
        # __getitem__ already relies on, so plain dicts work as well.
        return len(self.encodings['input_ids'])

# Wrap each labelled split in a NewsQADataset.
train_dataset, dev_dataset, test_dataset = (
    NewsQADataset(split_encodings)
    for split_encodings in (train_encodings, dev_encodings, test_encodings)
)

In [13]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained 'distilbert-base-uncased' checkpoint into a question-answering model.
# The warning printed below reports 'qa_outputs.weight' / 'qa_outputs.bias' as newly
# initialized, i.e. the model has to be fine-tuned before its answers are meaningful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

import copy
import os
import numpy as np

# Fine-tune `model` on the training split and track the epoch with the lowest
# mean validation loss. Per-epoch losses are appended to a CSV log, and the
# best weights are checkpointed whenever the validation loss improves.

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Validation order is fixed so the mean of per-batch losses is computed over the
# same batches every epoch and stays comparable between epochs.
val_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

# NOTE(review): transformers.AdamW is reported as deprecated in newer
# transformers releases; its defaults may differ from torch.optim.AdamW,
# so confirm before switching.
optim = AdamW(model.parameters(), lr=5e-5)

# np.inf is the spelling that works on every NumPy version (np.Inf was removed in 2.0).
min_val_loss = np.inf
# Fallback in case no epoch ever improves on the initial value (e.g. NaN losses).
best_model_weight = copy.deepcopy(model.state_dict())
min_val_epoch = 0

loss_file_name = 'log/newsqa_log.csv'
# Create the output directories up front so the first write cannot fail
# after an expensive epoch.
os.makedirs(os.path.dirname(loss_file_name), exist_ok=True)
os.makedirs('models', exist_ok=True)

with open(loss_file_name, 'a') as f:
    f.write('epoch, train loss, valid loss')
    f.write('\n')

for epoch in range(50):
    train_loss = 0
    val_loss = 0

    for phase in ['Train', 'Valid']:
        is_train = phase == 'Train'
        if is_train:
            pbar = tqdm(train_loader)
            model.train()
        else:
            pbar = tqdm(val_loader)
            model.eval()

        # Autograd is only needed while training; disabling it for validation
        # avoids building a graph that is never back-propagated.
        with torch.set_grad_enabled(is_train):
            for batch in pbar:
                if is_train:
                    optim.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
                loss = outputs[0]
                pbar.set_description('Epoch %d | %s | Loss: %f' % (epoch + 1, phase, loss.item()))

                if is_train:
                    train_loss += loss.item()
                    loss.backward()
                    optim.step()
                else:
                    val_loss += loss.item()

    epoch_train_loss = train_loss / len(train_loader)
    epoch_val_loss = val_loss / len(val_loader)

    # Every epoch, including the first, is a candidate for the best checkpoint.
    # Previously epoch 0 only set the loss baseline, so if it was the best epoch
    # the saved "best" weights were the untrained initial ones.
    if epoch_val_loss < min_val_loss:
        best_model_weight = copy.deepcopy(model.state_dict())
        min_val_loss = epoch_val_loss
        min_val_epoch = epoch
        torch.save(best_model_weight, 'models/newsqa_ep{}.pt'.format(min_val_epoch))

    with open(loss_file_name, 'a') as f:
        f.write(str((epoch+1)) + ", " + str(epoch_train_loss) + ", " + str(epoch_val_loss))
        f.write("\n")

print('Training completed')
# min_val_epoch is the 0-based loop index (the CSV log uses epoch + 1).
print('Minimum loss:', min_val_loss, 'in epoch', min_val_epoch)

save_path = 'models/newsqa_ep{}.pt'.format(min_val_epoch)
torch.save(best_model_weight, save_path)

  0%|          | 0/18849 [00:00<?, ?it/s]

cpu


Epoch 1 | Train | Loss: 4.545799:   1%|          | 168/18849 [08:48<17:06:51,  3.30s/it]

In [None]:
# Ask the current `model` one question about a short passage and print the
# predicted answer span.
context = "The US has passed the peak on new coronavirus cases, " \
          "President Donald Trump said and predicted that some states would reopen this month. " \
          "The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world."

question = "What was President Donald Trump's prediction?"

# Encode as (context, question), the same order used to build the training
# encodings, and truncate so an over-long context cannot exceed the model's
# input limit.
encoding = tokenizer(context, question, truncation=True)

input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

# The model may have been moved to CUDA by the training cell; build the input
# tensors on whatever device its parameters live on.
model_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
    output = model(torch.tensor([input_ids], device=model_device),
                   attention_mask=torch.tensor([attention_mask], device=model_device))
start_scores = output.start_logits
end_scores = output.end_logits

# Highest-scoring start and end token. If the end precedes the start, the slice
# (and therefore the printed answer) is empty.
start_idx = int(torch.argmax(start_scores))
end_idx = int(torch.argmax(end_scores))
ans_tokens = input_ids[start_idx : end_idx + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens , skip_special_tokens=True)

print ("\nQuestion ",question)
print ("\nAnswer Tokens: ")
print (answer_tokens)

answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)

print ("\nAnswer : ",answer_tokens_to_string)