In [1]:
# Install the Hugging Face libraries used by this notebook. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a fresh run may pull newer releases
# than the ones that produced the recorded outputs — TODO pin once known.
%pip install datasets evaluate transformers[sentencepiece] accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget. Credentials are required later in
# this notebook because training is configured with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Loading the dataset

In [4]:
from datasets import load_dataset

# Load only the first 5,000 rows of the SQuAD training split (a small subset,
# presumably to keep fine-tuning time short).
squad = load_dataset("squad", split="train[:5000]")

In [5]:
# Hold out 20% of the subset for evaluation. A fixed seed makes the shuffle —
# and therefore train/test membership — identical across kernel restarts, so
# the reported metrics are reproducible.
# NOTE: this cell rebinds `squad` from a single split to a train/test mapping,
# so it is meant to be run once per kernel session.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [6]:
# Inspect one training example: it carries an id, title, context, question and
# an `answers` dict with the answer text and its character offset in the context.
squad["train"][0]

{'id': '56d381b459d6e41400146591',
 'title': 'Frédéric_Chopin',
 'context': 'At the end of November, Chopin returned to Paris. He passed the winter in unremitting illness, but gave occasional lessons and was visited by friends, including Delacroix and Franchomme. Occasionally he played, or accompanied the singing of Delfina Potocka, for his friends. During the summer of 1849, his friends found him an apartment in Chaillot, out of the centre of the city, for which the rent was secretly subsidised by an admirer, Princess Obreskoff. Here in June 1849 he was visited by Jenny Lind.',
 'question': 'When did Chopin return to Paris?',
 'answers': {'text': ['November'], 'answer_start': [14]}}

## Preprocessing

In [7]:
from transformers import AutoTokenizer

# Tokenizer for the same ALBERT base v2 checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")

In [8]:
def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character-level answer span onto token indices.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: per-token sequence id for the same example; tokens of the
            context are marked with ``1``.
        start_char: character offset where the answer starts in the context.
        end_char: character offset just past the end of the answer.

    Returns:
        ``(start_token, end_token)`` (both inclusive), or ``(0, 0)`` when there
        are no context tokens or the answer is not fully covered by them
        (e.g. it was cut off by truncation).
    """
    n_tokens = len(sequence_ids)

    # Locate the first and last token that belong to the context.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return 0, 0
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the surviving context; a partially
    # truncated answer cannot be labelled correctly and is treated as missing.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward to the last token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # Walk backward to the first token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples, max_length=384):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Args:
        examples: batch mapping with ``"question"`` and ``"context"`` lists and
            an ``"answers"`` list of dicts holding ``"text"`` and
            ``"answer_start"`` lists (only the first answer is used).
        max_length: length every encoded pair is padded/truncated to. Only the
            context (second sequence) is truncated.

    Returns:
        The tokenizer output (with ``offset_mapping`` removed) extended with
        ``start_positions`` and ``end_positions``: one token index per example.
        Examples with no answer, or whose answer is not fully inside the
        (possibly truncated) context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to build the labels; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, (offsets, answer) in enumerate(zip(offset_mapping, examples["answers"])):
        if not answer["answer_start"] or not answer["text"]:
            # No gold answer for this example (e.g. SQuAD v2-style rows).
            start_token, end_token = 0, 0
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            start_token, end_token = _find_answer_token_span(
                offsets, inputs.sequence_ids(i), start_char, end_char
            )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
# Tokenize both splits in batches and drop the raw text columns so that only
# the model inputs and the start/end labels remain.
tokenized_squad = squad.map(
    function=preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
from transformers import DefaultDataCollator

# The default collator is sufficient here because preprocessing already pads
# every example to the same length (padding="max_length").
data_collator = DefaultDataCollator()

## Train

In [11]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load ALBERT base v2 with a question-answering head. As the warning below
# reports, the `qa_outputs` weights are newly initialized and must be trained.
model = AutoModelForQuestionAnswering.from_pretrained("albert/albert-base-v2")

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning hyperparameters. With a per-device batch of 8 and 2 gradient
# accumulation steps, each optimizer step sees 16 examples per device.
training_args = TrainingArguments(
    output_dir="my_qa_model",
    # `evaluation_strategy` is deprecated in recent transformers releases;
    # `eval_strategy` is the current name for the same setting.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    # Requires a prior Hub login; the model is uploaded under `output_dir`'s name.
    push_to_hub=True,
)



In [13]:
# Wire the model, hyperparameters, tokenized splits and collator together.
# The tokenizer is passed as `processing_class` (the `tokenizer=` keyword is
# deprecated in recent transformers releases) so it is saved and pushed
# alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [14]:
# Fine-tune the model. Evaluation on the held-out split runs once per epoch,
# as configured in `training_args`.
trainer.train()

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.038723111152649, 'eval_runtime': 17.7468, 'eval_samples_per_second': 56.348, 'eval_steps_per_second': 7.044, 'epoch': 1.0}
{'loss': 1.1144, 'grad_norm': 110.24805450439453, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.9152036309242249, 'eval_runtime': 17.6566, 'eval_samples_per_second': 56.636, 'eval_steps_per_second': 7.08, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.9605663418769836, 'eval_runtime': 17.8501, 'eval_samples_per_second': 56.022, 'eval_steps_per_second': 7.003, 'epoch': 3.0}
{'train_runtime': 680.4304, 'train_samples_per_second': 17.636, 'train_steps_per_second': 1.102, 'train_loss': 0.8699696858723959, 'epoch': 3.0}


TrainOutput(global_step=750, training_loss=0.8699696858723959, metrics={'train_runtime': 680.4304, 'train_samples_per_second': 17.636, 'train_steps_per_second': 1.102, 'total_flos': 198754228224000.0, 'train_loss': 0.8699696858723959, 'epoch': 3.0})

In [15]:
# Upload the fine-tuned model to the Hugging Face Hub (requires the login
# performed at the top of the notebook).
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/iSathyam03/my_qa_model/commit/b6eaed3ad21173774288b0c751850dc83384ed81', commit_message='End of training', commit_description='', oid='b6eaed3ad21173774288b0c751850dc83384ed81', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iSathyam03/my_qa_model', endpoint='https://huggingface.co', repo_type='model', repo_id='iSathyam03/my_qa_model'), pr_revision=None, pr_num=None)

## Inference

In [16]:
# Example question/context pair used to sanity-check the fine-tuned model.
# NOTE(review): "46 languages natural languages" looks like a typo in the
# sample text; it is left untouched because the character offsets reported by
# the pipeline below depend on the exact string.
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [17]:
from transformers import pipeline

# Simplest route: load the pushed checkpoint from the Hub into a
# question-answering pipeline and run it on the example pair.
question_answerer = pipeline("question-answering", model="iSathyam03/my_qa_model")
question_answerer(question=question, context=context)

config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0


{'score': 0.9352093935012817, 'start': 93, 'end': 95, 'answer': '13'}

In [18]:
from transformers import AutoTokenizer

# Manual inference, step 1: tokenize the pair into PyTorch tensors with the
# tokenizer stored alongside the fine-tuned model.
# NOTE: this rebinds the `tokenizer` name used earlier for preprocessing.
tokenizer = AutoTokenizer.from_pretrained("iSathyam03/my_qa_model")
inputs = tokenizer(question, context, return_tensors="pt")

In [20]:
import torch
from transformers import AutoModelForQuestionAnswering

# Step 2: load the fine-tuned checkpoint and run a forward pass with gradient
# tracking disabled. NOTE: this rebinds the `model` name used for training.
model = AutoModelForQuestionAnswering.from_pretrained("iSathyam03/my_qa_model")
with torch.no_grad():
    outputs = model(**inputs)

In [21]:
# Step 3: take the highest-scoring start and end token positions. The two are
# chosen independently, so nothing here guarantees that end >= start.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [22]:
# Step 4: slice the input ids from the predicted start to the predicted end
# (inclusive, hence the +1) and decode them back into text.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'13'