In [1]:
! pip install torch transformers datasets evaluate



In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

squad2 = load_dataset("squad_v2")

Downloading readme:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

## Pre-process Training Data

In [3]:
MAX_TOKEN_LENGTH = 512
STRIDE = 128


def preprocess_training_examples(
    examples,
    tokenizer,
    max_token_length=MAX_TOKEN_LENGTH,
    stride=STRIDE,
):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_token_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        # If the answer doesn't exist, the label is (0, 0)
        if not answer["answer_start"]:
          start_positions.append(0)
          end_positions.append(0)
          continue

        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
# from functools import partial

# train_dataset = squad2["train"].select(range(5000))
# preprocessed_train_dataset = train_dataset.map(
#     partial(preprocess_training_examples, tokenizer=roberta_squad2_tokenizer),
#     batched=True,
#     remove_columns=train_dataset.column_names,
# )
# len(train_dataset), len(preprocessed_train_dataset)

## Pre-process Test Data

In [5]:
def preprocess_test_examples(
    examples,
    tokenizer,
    max_token_length=MAX_TOKEN_LENGTH,
    stride=STRIDE,
):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_token_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

## Inference

In [6]:
from tqdm import trange


BATCH_SIZE = 700


def infer_outputs(model, preprocessed_inputs, batch_size=BATCH_SIZE):
    preprocessed_inputs.set_format("torch")

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        print("No GPU available, using CPU for inference")
        device = torch.device("cpu")
    
    model.to(device)

    num_examples = len(preprocessed_inputs)
    outputs = None
    for batch_start in trange(0, num_examples, batch_size):
      batch_end = min(batch_start+batch_size, num_examples)
      batch_inputs = preprocessed_inputs.select(range(batch_start, batch_end))
      batch_inputs_for_model = {
          k: batch_inputs[k].to(device)
          for k in batch_inputs.column_names
          if k not in ["offset_mapping", "example_id"]
      }

      with torch.no_grad():
        batch_outputs = model(**batch_inputs_for_model)

      # Free memory for inputs
      for v in batch_inputs_for_model.values():
        del v

      if outputs is None:
        outputs = batch_outputs
        for v in outputs.values():
          v.cpu()
      else:
        for k in batch_outputs.keys():
          outputs[k] = torch.cat((outputs[k], batch_outputs[k]), dim=0)

      # Free memory for outputs
      for v in batch_outputs.values():
        del v

    return outputs

## Post-processing

In [7]:
import collections
import numpy as np
import tqdm


def post_process(
    outputs,
    test_dataset,
    preprocessed_test_dataset,
    n_best=20,
    max_answer_length=30,
):
    preprocessed_test_dataset.set_format()

    start_logits = outputs.start_logits.cpu().to(torch.float32).numpy()
    end_logits = outputs.end_logits.cpu().to(torch.float32).numpy()
    offset_mapping = preprocessed_test_dataset["offset_mapping"]

    example_to_features = collections.defaultdict(list)
    example_ids = preprocessed_test_dataset["example_id"]
    for idx, example_id in tqdm.tqdm(enumerate(example_ids), total=len(example_ids)):
        example_to_features[example_id].append(idx)

    predicted_answers = []
    for example in tqdm.tqdm(test_dataset, total=len(test_dataset)):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = offset_mapping[feature_index]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Prediction is that there's no answer
                    if start_index == 0 and end_index == 0:
                      answers.append(
                          {
                              "text": None,
                              "logit_score": start_logit[start_index] + end_logit[end_index],
                          }
                      )
                    # Skip answers that are not fully in the context
                    elif offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    elif (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    else:
                      answers.append(
                          {
                              "text": context[offsets[start_index][0] : offsets[end_index][1]],
                              "logit_score": start_logit[start_index] + end_logit[end_index],
                          }
                      )

        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append(
            {
                "id": example_id,
                "prediction_text": best_answer["text"] or "",
                # TODO: can this be improved?
                "no_answer_probability": 1.0 if best_answer["text"] else 0.0
            }
        )
    return predicted_answers

In [8]:
def find_examples_with_no_answer(dataset):
  example_idxs = []
  for idx, ex in enumerate(dataset):
    if not ex["answers"].get("text"):
      example_idxs.append(idx)
  return example_idxs

In [9]:
import evaluate

metric = evaluate.load("squad_v2")

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [10]:
from functools import partial


def evaluate_model(model_name, test_dataset):
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    preprocessed_test_dataset = test_dataset.map(
        partial(preprocess_test_examples, tokenizer=tokenizer),
        batched=True,
        remove_columns=test_dataset.column_names,
    )

    outputs = infer_outputs(model, preprocessed_test_dataset)

    predicted_answers = post_process(
        outputs,
        test_dataset,
        preprocessed_test_dataset=preprocessed_test_dataset,
    )
    theoretical_answers = [
        {"id": ex["id"], "answers": ex["answers"]} for ex in test_dataset
    ]

    metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [11]:
# Evaluate RoBERTa
evaluate_model("deepset/roberta-base-squad2", squad2["validation"])

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

  0%|          | 0/18 [01:21<?, ?it/s]


KeyboardInterrupt: 

In [16]:
# Evaluate MDeBERTa-v3

test_dataset = squad2["validation"]

mdeberta_squad2_name = "timpal0l/mdeberta-v3-base-squad2"
mdeberta_squad2_model = AutoModelForQuestionAnswering.from_pretrained(
    mdeberta_squad2_name,
    torch_dtype=torch.bfloat16,
)
mdeberta_squad2_tokenizer = AutoTokenizer.from_pretrained(mdeberta_squad2_name)

preprocessed_test_dataset = test_dataset.map(
    partial(preprocess_test_examples, tokenizer=mdeberta_squad2_tokenizer),
    batched=True,
    remove_columns=test_dataset.column_names,
)

outputs = infer_outputs(
    mdeberta_squad2_model,
    preprocessed_test_dataset,
    batch_size=250
)

predicted_answers = post_process(
    outputs,
    test_dataset,
    preprocessed_test_dataset=preprocessed_test_dataset,
)
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in test_dataset
]

metric.compute(predictions=predicted_answers, references=theoretical_answers)

100%|██████████| 49/49 [10:28<00:00, 12.82s/it]
100%|██████████| 12054/12054 [00:00<00:00, 1111656.56it/s]
100%|██████████| 11873/11873 [00:03<00:00, 3716.15it/s]


{'exact': 80.31668491535416,
 'f1': 83.49163486164709,
 'total': 11873,
 'HasAns_exact': 79.6221322537112,
 'HasAns_f1': 85.98113709722277,
 'HasAns_total': 5928,
 'NoAns_exact': 81.00925147182507,
 'NoAns_f1': 81.00925147182507,
 'NoAns_total': 5945,
 'best_exact': 80.31668491535416,
 'best_exact_thresh': 1.0,
 'best_f1': 83.49163486164709,
 'best_f1_thresh': 1.0}