In [2]:
# Adapted from https://huggingface.co/transformers/v3.2.0/custom_datasets.html#qa-squad

In [1]:
# The `!` prefix runs the script in a separate Python process, so any names it
# imports are NOT available in this kernel afterwards.
# NOTE(review): presumably the script installs/prepares the Colab dependencies;
# confirm against colab_module_imports.py.
!python colab_module_imports.py



In [2]:
from datasets import load_dataset

In [3]:
# Download (or load from the local cache) the SQuAD train/validation splits.
dataset = load_dataset("squad")

# Materialise every column as a plain Python list. The answers are edited in
# place later on (see add_end_idx), which only persists if we hold the actual
# dict objects. Depending on the installed `datasets` version, column access
# may return a lazy column view instead of a list; `list(...)` makes both
# cases behave the same.
train_contexts = list(dataset["train"]["context"])
train_questions = list(dataset["train"]["question"])
train_answers = list(dataset["train"]["answers"])

val_contexts = list(dataset["validation"]["context"])
val_questions = list(dataset["validation"]["question"])
val_answers = list(dataset["validation"]["answers"])

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [5]:
# Spot-check one raw validation example (context, question, answers) before
# any preprocessing is applied.
num = 7459
# Uncomment to inspect the same index in the training split instead:
# print(train_contexts[num])
# print(train_questions[num])
# print(train_answers[num])

print(val_contexts[num])
print(val_questions[num])
print(val_answers[num])

Trade liberalization may shift economic inequality from a global to a domestic scale. When rich countries trade with poor countries, the low-skilled workers in the rich countries may see reduced wages as a result of the competition, while low-skilled workers in the poor countries may see increased wages. Trade economist Paul Krugman estimates that trade liberalisation has had a measurable effect on the rising inequality in the United States. He attributes this trend to increased trade with poor countries and the fragmentation of the means of production, resulting in low skilled jobs becoming more tradeable. However, he concedes that the effect of trade on inequality in America is minor when compared to other causes, such as technological innovation, a view shared by other experts. Empirical economists Max Roser and Jesus Crespo-Cuaresma find support in the data that international trade is increasing income inequality. They empirically confirm the predictions of the Stolper–Samuelson th

# This dataset has a different format from the tutorial's: each answer stores 'text' (and 'answer_start') as a list. That is a problem for val_answers, where those lists contain more than one entry.

In [6]:
def add_end_idx(answers, contexts):
    """Flatten each answer to its first reference and add ``answer_end``, in place.

    For every ``(answer, context)`` pair the function:

    1. Replaces the list-valued ``'text'`` / ``'answer_start'`` fields with
       their first element. Only the first reference answer is kept, so
       validation examples with several references are reduced to one and the
       experiment does not use the full data.
    2. Stores ``'answer_end'`` (exclusive character offset) and, when the
       labelled start is off by one or two characters, corrects
       ``'answer_start'`` as well.

    The function is safe to call more than once on the same data: answers that
    were already flattened are left as they are instead of being indexed again.

    Args:
        answers: iterable of mutable dicts with ``'text'`` and
            ``'answer_start'`` keys. The dicts are modified in place, so this
            must hold the real objects (e.g. a list), not temporary copies.
        contexts: iterable of context strings, aligned with ``answers``.

    Returns:
        None. The result is written into the ``answers`` dicts.

    Raises:
        IndexError: if an answer has an empty ``'text'`` / ``'answer_start'``
            list (no reference answer to pick).
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']

        # A non-string 'text' means the record still has the raw list layout.
        # Checking this keeps a second run of the cell from doing
        # `'some answer'[0]` and silently shrinking the answer to one character.
        if not isinstance(gold_text, str):
            gold_text = gold_text[0]
            start_idx = start_idx[0]
        answer['text'] = gold_text
        answer['answer_start'] = start_idx

        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        for shift in (0, 1, 2):
            lo, hi = start_idx - shift, end_idx - shift
            # Skip negative starts: a negative slice index would wrap around
            # to the end of the context instead of meaning "before the start".
            if lo >= 0 and context[lo:hi] == gold_text:
                answer['answer_start'] = lo
                answer['answer_end'] = hi
                break
        else:
            # No alignment found. Keep the labelled span so that 'answer_end'
            # always exists and later steps do not fail with a KeyError.
            answer['answer_end'] = end_idx

# Annotate both splits; the answer dicts are modified in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [7]:
# Spot-check one example from each split after add_end_idx has run: the answer
# should now hold a single text, a single start offset and an 'answer_end'.
num = 55

print(train_contexts[num])
print(train_questions[num])
print(train_answers[num])

print(val_contexts[num])
print(val_questions[num])
print(val_answers[num])

In 1882, Albert Zahm (John Zahm's brother) built an early wind tunnel used to compare lift to drag of aeronautical models. Around 1899, Professor Jerome Green became the first American to send a wireless message. In 1931, Father Julius Nieuwland performed early work on basic reactions that was used to create neoprene. Study of nuclear physics at the university began with the building of a nuclear accelerator in 1936, and continues now partly through a partnership in the Joint Institute for Nuclear Astrophysics.
Which professor sent the first wireless message in the USA?
{'text': 'Professor Jerome Green', 'answer_start': 136, 'answer_end': 158}
The Broncos took an early lead in Super Bowl 50 and never trailed. Newton was limited by Denver's defense, which sacked him seven times and forced him into three turnovers, including a fumble which they recovered for a touchdown. Denver linebacker Von Miller was named Super Bowl MVP, recording five solo tackles, 2½ sacks, and two forced fumbles.


In [8]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode (context, question) pairs: the context is the first sequence and the
# question the second. The resulting encodings are later passed to
# add_token_positions, which calls char_to_token on them.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [9]:
def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings``.

    Each answer's character offsets (``'answer_start'`` and the last character
    before ``'answer_end'``) are mapped to token indices with
    ``encodings.char_to_token``. The two resulting lists are stored on
    ``encodings`` under ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: tokenizer output providing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; modified in place.
        answers: sequence of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets, aligned with the rows of ``encodings``.

    Returns:
        None.
    """
    start_positions, end_positions = [], []

    for row, answer in enumerate(answers):
        first_token = encodings.char_to_token(row, answer['answer_start'])
        # 'answer_end' is exclusive, so look up the last character of the answer.
        last_token = encodings.char_to_token(row, answer['answer_end'] - 1)

        # if None, the answer passage has been truncated
        start_positions.append(
            tokenizer.model_max_length if first_token is None else first_token
        )
        end_positions.append(
            tokenizer.model_max_length if last_token is None else last_token
        )

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [10]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over tokenizer encodings.

    ``encodings`` must expose ``items()`` (field name -> per-example values)
    and an ``input_ids`` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        # Keep a reference only; nothing is copied or converted up front.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one example: every field's idx-th entry, converted to a tensor.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

# Wrap both encoded splits (including the start/end positions added above) as torch datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [11]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained base checkpoint. The question-answering head
# (qa_outputs.weight / qa_outputs.bias) is not part of that checkpoint and is
# newly initialised, as the warning below reports, so the model must be
# fine-tuned before it is used for predictions.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model's parameters to the chosen device and switch to training mode.
model.to(device)
model.train()

In [None]:
# Shuffled mini-batches of 16 examples from the training split.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# NOTE(review): `AdamW` is whatever the import cell above bound; confirm that
# import still resolves with the installed transformers version.
optim = AdamW(model.parameters(), lr=5e-5)

# This cell ends with model.eval(), so set training mode here as well.
# Otherwise re-running the cell would train with the model still in
# evaluation mode (e.g. dropout disabled).
model.train()

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()

        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Passing the gold start/end positions makes the model return the
        # training loss as the first output.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

# Leave the model in evaluation mode for any inference that follows.
model.eval()