In [None]:
# 

In [None]:
# !python --version

In [None]:
# Original file is located at
#     https://colab.research.google.com/drive/1UxKWCRsrgot1xmCKCGwC9RBlalkamDhT

Longformer for Question Answering

In [None]:
# !nvidia-smi
!git clone https://github.com/huggingface/transformers.git
# !pip install -U ./transformers
# !pip uninstall transformers
!pip install transformers==2.11.0
!pip install pytorch
!pip install git+https://github.com/huggingface/nlp.git

In [None]:
import sys
print(sys.version)



the Longformer model was presented in [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. As the paper explains it  `Longformer` is a BERT-like model fo r long documents. Training longformer for QA is similar to how you train BERT for QA. But there few things to keep in mind when using longformer for QA task. Longformer uses sliding-window local attention which scales linearly with sequence length. This is what allows longformer to handle longer sequences. For more details on how the sliding window attention works, please refer to the paper. Along with local attention longformer also allows you to use global attention for certain tokens. For QA task, all question tokens should have global attention. The attention is configured using the `attention_mask` paramter of the `forward` method of `LongformerForQuestionAnswering`. Mask values are selected in [0, 1, 2]: 0 for no attention (padding tokens), 1 for local attention (a sliding window attention), 2 for global attention (tokens that attend to all other tokens, and all other tokens attend to them). As stated above all question tokens should be given gloabl attention. The `LongformerForQuestionAnswering` model handles this automatically for you. To allow it to do that
1. The input sequence must have three sep tokens, i.e the sequence should be encoded like this `<s> question</s></s> context</s>`. If you encode the question and answer as a input pair, then the tokenizer already takes care of that, you shouldn't worry about it.
2. input_ids should always be a batch of examples.


In [None]:
import transformers
# import accelerate
# import torch

transformers_version = transformers.__version__
# accelerate_version = accelerate.__version__
# pytorch_version = torch.__version__

print("Transformers version:", transformers_version)
# print("Accelerate version:", accelerate_version)
# print("PyTorch version:", pytorch_version)



## Load and process data
Here we are using the awesome new nlp library to load and process the dataset.
Also we will use Transformers's fast tokenizers alignement methods to get position of answer spans


In [None]:
!pip install nlp
# !pip install transformers

In [None]:
!pip install torch

In [None]:
import torch
import nlp
from transformers import LongformerTokenizerFast
from transformers import AutoConfig
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

In [None]:
model_name = "valhalla/longformer-base-4096-finetuned-squadv1"

In [None]:
config = AutoConfig.from_pretrained(model_name, num_labels=4)

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, config=config)

In [None]:
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)


In [26]:
def get_correct_alignement(context, answers):
    """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
    # print(answers)
    gold_text = answers['text'][0]
    # print(gold_text)
    start_idx = answers['answer_start'][0]
    end_idx = start_idx + len(gold_text)
    return start_idx, end_idx      

Tokenize our training dataset

In [27]:
def convert_to_features(example):
    # Tokenize contexts and questions (as pairs of inputs)
    input_pairs = [example['question'], example['context']]
    encodings = tokenizer.encode_plus(input_pairs, pad_to_max_length=True, max_length=512)
    context_encodings = tokenizer.encode_plus(example['context'])


    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.
    # this will give us the position of answer span in the context text
    start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])
    # print("print(start_idx)", type(start_idx))

    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx-1)
    try:
      start_positions_context = int(start_positions_context)
    except TypeError:
      start_positions_context = 0
    try:
      end_positions_context = int(end_positions_context)
    except TypeError:
      end_positions_context = 0
    # print(start_positions_context)
    # print(type(end_positions_context))

    # here we will compute the start and end position of the answer in the whole example
    # as the example is encoded like this <s> question</s></s> context</s>
    # and we know the postion of the answer in the context
    # we can just find out the index of the sep token and then add that to position + 1 (+1 because there are two sep tokens)
    # this will give us the position of the answer span in whole example
    sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)
    # print(type(sep_idx))
    start_positions = start_positions_context + sep_idx + 1
    end_positions = end_positions_context + sep_idx + 1
    if end_positions > 512:
      start_positions, end_positions = 0, 0
    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions,
                      'attention_mask': encodings['attention_mask']})
    return encodings

In [28]:
!pip install datasets

/bin/bash: /home/beyond-data/anaconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [29]:
from datasets import load_dataset

load train and validation split of squad

In [30]:
dataset_dict  = (load_dataset('covid_qa_deepset', split='train').train_test_split(test_size=0.1))
# valid_dataset = nlp.load_dataset('squad', split=nlp.Split.VALIDATION)
train_dataset = dataset_dict['train']
valid_dataset = dataset_dict['test']
# print("dataset_dict: ",dataset_dict)
print("train_dict: ",train_dataset)
print("valid_dict: ",valid_dataset)

Found cached dataset covid_qa_deepset (/home/beyond-data/.cache/huggingface/datasets/covid_qa_deepset/covid_qa_deepset/1.0.0/fb886523842e312176f92ec8e01e77a08fa15a694f5741af6fc42796ee9c8c46)


train_dict:  Dataset({
    features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
    num_rows: 1817
})
valid_dict:  Dataset({
    features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
    num_rows: 202
})


In [31]:
train_dataset = train_dataset.map(convert_to_features)
valid_dataset = valid_dataset.map(convert_to_features, load_from_cache_file=False)

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

set the tensor type and the columns which the dataset should return
columns = ['answers', 'context', 'document_id', 'id', 'is_impossible', 'question']

In [32]:
columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [33]:
len(train_dataset), len(valid_dataset)

(1817, 202)

cach the dataset, so we can load it directly for training

In [34]:
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

In [35]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch

from transformers import LongformerForQuestionAnswering, LongformerTokenizerFast, EvalPrediction
from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)


logger = logging.getLogger(__name__)

@dataclass
class DummyDataCollator():
    def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch])
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        start_positions = torch.stack([example['start_positions'] for example in batch])
        end_positions = torch.stack([example['end_positions'] for example in batch])

        return {
            'input_ids': input_ids,
            'start_positions': start_positions,
            'end_positions': end_positions,
            'attention_mask': attention_mask
        }


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """
    train_file_path: Optional[str] = field(
        default='train_data.pt',
        metadata={"help": "Path for cached train dataset"},
    )
    valid_file_path: Optional[str] = field(
        default='valid_data.pt',
        metadata={"help": "Path for cached valid dataset"},
    )
    max_len: Optional[int] = field(
        default=512,
        metadata={"help": "Max input length for the source text"},
    )


def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    # we will load the arguments from a json file,
    # make sure you save the arguments in at ./args.json
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        # training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = LongformerTokenizerFast.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir
    )

    # Get datasets
    print('loading data')
    train_dataset  = torch.load(data_args.train_file_path)
    valid_dataset = torch.load(data_args.valid_file_path)
    print('loading done')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results, model


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


## Train

import json
et's write the arguments in a dict and store in a json file. The above code will load this file and parse the arguments.


In [None]:
import json

In [None]:
args_dict = {
  # "n_gpu": 1,
  "model_name_or_path": "valhalla/longformer-base-4096-finetuned-squadv1",
  "max_len": 512 ,
  "output_dir": './models',
  "overwrite_output_dir": True,
  "per_gpu_train_batch_size": 8,
  "per_gpu_eval_batch_size": 8,
  "gradient_accumulation_steps": 16,
  "learning_rate": 1e-4,
  "num_train_epochs": 3,
  "do_train": True
}

In [None]:
with open('args.json', 'w') as f:
  json.dump(args_dict, f)

Start training!

In [None]:
results, model = main()


# Eval


 SQuAD evaluation script. Modifed slightly for this notebook

In [36]:
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys

In [37]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [38]:
def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [39]:
def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

In [40]:
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)

In [41]:
def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0
    for ground_truths, prediction in zip(gold_answers, predictions):
      total += 1
      exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
      f1 += metric_max_over_ground_truths(
          f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {'exact_match': exact_match, 'f1': f1}

In [42]:
import torch
from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering
from tqdm.auto import tqdm

In [43]:
# tokenizer = LongformerTokenizerFast.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
# model = LongformerForQuestionAnswering.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
# model = model.cuda()
model.eval()

LongformerForQuestionAnswering(
  (longformer): LongformerModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_feat

In [44]:
valid_dataset = torch.load('valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)

In [45]:
answers = []
with torch.no_grad():
  for batch in tqdm(dataloader):
    start_scores, end_scores = model(input_ids=batch['input_ids'].cuda(),
                                  attention_mask=batch['attention_mask'].cuda())
    for i in range(start_scores.shape[0]):
      all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
      answer = ' '.join(all_tokens[torch.argmax(start_scores[i]) : torch.argmax(end_scores[i])+1])
      ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
      answer = tokenizer.decode(ans_ids)
      answers.append(answer)

  0%|          | 0/13 [00:00<?, ?it/s]

In [46]:
print(answers)



In [47]:
predictions = []
references = []
# print(answers)
# print(valid_dataset[0])
for ref, pred in zip(valid_dataset, answers):
  predictions.append(pred)
  references.append(answers)
# print(predictions)

In [48]:
evaluate(references, predictions)

{'exact_match': 100.0, 'f1': 96.03960396039604}

In [50]:
device = "cuda:0"
model = model.to(device)


## Model in action 
The trained model is available on Huggingface hub if you want to play with it.
You can find the model [here](https://huggingface.co/valhalla/longformer-base-4096-finetuned-squadv1)


In [8]:
import torch
from transformers import LongformerTokenizer, LongformerForQuestionAnswering

tokenizer = LongformerTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
# model = LongformerForQuestionAnswering.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
# print(model)
text = "covid has lead to respiratory issues in humans. The primary target of the SARS-CoV-2 virus, which causes COVID-19, is the respiratory system. The virus primarily affects the respiratory tract, including the nose, throat, and lungs. It gains entry into the body through respiratory droplets when an infected person coughs, sneezes, talks, or exhales. These droplets can be inhaled by nearby individuals, leading to infection. Once inside the body, the virus primarily targets the cells lining the respiratory tract, particularly the cells that line the airways and the alveoli (small air sacs) in the lungs. It attaches to specific receptors on these cells, known as angiotensin-converting enzyme 2 (ACE2) receptors, to gain entry and replicate. COVID-19 can cause a range of respiratory symptoms, including cough, sore throat, shortness of breath, and pneumonia. However, it is important to note that the virus can also affect other organs and systems in the body, such as the cardiovascular system, gastrointestinal system, kidneys, liver, and neurological system. Severe cases of COVID-19 can lead to complications and multiorgan involvement, which can result in a more severe illness. It's worth mentioning that the impact of COVID-19 can vary from person to person, and some individuals may experience more severe respiratory symptoms than others. Additionally, emerging research continues to shed light on the diverse effects of the virus on different body systems."
question = "what part of the body does corona viruus affect the most?"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

default is local attention everywhere
the forward method will automatically set global attention on question tokens

In [9]:
attention_mask = encoding["attention_mask"]

In [10]:
start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

In [11]:
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
print(answer)
# output => democratized NLP

 respiratory tract
