# NLP Information Extraction: Question Answering

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler

from tqdm.auto import tqdm
random_seed = 42

model_name = 'DeepPavlov/rubert-base-cased'  # Baseline
model_name = 'ai-forever/ruBert-large'
# model_name = 'ai-forever/sbert_large_mt_nlu_ru'
# model_name = 'cointegrated/rubert-tiny2'
# model_name = 'M-CLIP/M-BERT-Distil-40'
# model_name = 'distilbert-base-multilingual-cased'

# model_name = 'bert-base-multilingual-uncased'  # Server crashes
# model_name = 'DeepPavlov/xlm-roberta-large-en-ru'  # No separation tokens
# model_name = 'ai-forever/ruRoberta-large'  # No separation tokens
# model_name = 'xlm-roberta-base'  # No separation tokens
# model_name = 'ai-forever/rugpt3large_based_on_gpt2'  # NOT for QA

model_name_to_save = model_name.split('/')[1] if '/' in model_name else model_name
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_dataset = load_from_disk("../data/raw")
test = pd.read_json(f'../data/test.json')

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [4]:
max_length = 512


def preprocess_examples(example):
    inputs = tokenizer(
        example["label"],
        example["text"],
        max_length=max_length,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_token_type_ids=True
    )

    offset_mapping = inputs.pop("offset_mapping")
    
    answer = example["extracted_part"]
    start_positions = []
    end_positions = []
    
    start_char = answer["answer_start"][0]
    end_char = answer["answer_end"][0]
    
    context_start = inputs.token_type_ids.index(1)
    context_end = len(inputs.token_type_ids) - 2 - inputs.token_type_ids[::-1].index(1)
    
    start_positions = []
    end_positions = []
    
    if start_char == end_char:
        start_positions.append(start_char)
        end_positions.append(start_char)
    
    else:
        idx = context_start
        while idx <= context_end and offset_mapping[idx][0] <= start_char:
            idx += 1   
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset_mapping[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["offset_mapping"] = offset_mapping
    
    return inputs

In [5]:
train_dataset = raw_dataset["train"].map(
    preprocess_examples,
    remove_columns=raw_dataset["train"].column_names,
)

val_dataset = raw_dataset["val"].map(
    preprocess_examples,
    remove_columns=raw_dataset["val"].column_names,
)

Loading cached processed dataset at /home/vova/nlp-ie/data/raw/train/cache-274b23c804fac426.arrow
                                                              

In [6]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
batch_size = 2

num_train_epochs = 8

In [7]:
train_dataloader = DataLoader(
    train_dataset.remove_columns('offset_mapping'),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
)
val_dataloader = DataLoader(
    val_dataset.remove_columns('offset_mapping'), 
    collate_fn=default_data_collator, 
    batch_size=batch_size
)

In [8]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model = model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
optimizer = AdamW(model.parameters(), lr=2e-5)

num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [10]:
n_best = 10
max_answer_length = 500


def compute_metrics(start_logits, end_logits, features, examples):
    
    predicted_answers = []
    for start_logit, end_logit, feature, example in zip(start_logits, end_logits, features, examples):
        
        example_id = str(example["id"])
        context = example["text"]
        answers = []
        offsets = feature["offset_mapping"]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

        for start_index in start_indexes:
            for end_index in end_indexes:
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answer = {
                    "text": context[offsets[start_index][0] : offsets[end_index][1]],
                    "logit_score": start_logit[start_index] + end_logit[end_index],
                }
                answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": {'text': ex["extracted_part"]["text"], 
                                                            'answer_start': ex["extracted_part"]["answer_start"][0]
                                                                 }
                           } for ex in examples]
    
    return np.mean([p['prediction_text']==t['answers']['text'][0] for p,t in zip(predicted_answers, 
                                                                              theoretical_answers)])

In [None]:
for epoch in tqdm(range(num_train_epochs), desc='epochs'):

    model.train()
    for batch in tqdm(train_dataloader, desc='train samples'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(val_dataloader, desc='val samples'):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    
    start_logits = start_logits[: len(val_dataset)]
    end_logits = end_logits[: len(val_dataset)]
    
    metrics = compute_metrics(
        start_logits, end_logits, val_dataset, raw_dataset["val"]
    )
    print(f"epoch {epoch + 1}:", metrics)

epochs:   0%|          | 0/8 [00:00<?, ?it/s]
train samples:   0%|          | 0/810 [00:00<?, ?it/s][A
train samples:   0%|          | 1/810 [00:00<12:11,  1.11it/s][A
train samples:   0%|          | 3/810 [00:01<03:52,  3.46it/s][A
train samples:   1%|          | 5/810 [00:01<02:26,  5.51it/s][A
train samples:   1%|          | 7/810 [00:01<01:51,  7.23it/s][A
train samples:   1%|          | 9/810 [00:01<01:33,  8.60it/s][A
train samples:   1%|▏         | 11/810 [00:01<01:22,  9.66it/s][A
train samples:   2%|▏         | 13/810 [00:01<01:16, 10.46it/s][A
train samples:   2%|▏         | 15/810 [00:02<01:11, 11.05it/s][A
train samples:   2%|▏         | 17/810 [00:02<01:09, 11.47it/s][A
train samples:   2%|▏         | 19/810 [00:02<01:07, 11.78it/s][A
train samples:   3%|▎         | 21/810 [00:02<01:05, 11.97it/s][A
train samples:   3%|▎         | 23/810 [00:02<01:04, 12.13it/s][A
train samples:   3%|▎         | 25/810 [00:02<01:04, 12.24it/s][A
train samples:   3%|▎         

In [None]:
model.save_pretrained(model_name_to_save + '_8epochs')

In [None]:
model = model.to('cpu')
model.eval();

In [356]:
predictions = []

empty_threshold = 30  # number of characters (Postprocessing)

for idx, row in tqdm(test.iterrows()):
    context = row['text']
    label = row['label']
    
    inputs = tokenizer(label, 
                       context,
                       return_tensors="pt", truncation='only_second',
                                   max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)[0]
    end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]
    scores = start_probabilities[:, None] * end_probabilities[None, :]
    scores = torch.triu(scores)
    max_index = scores.argmax().item()
    start_index = max_index // scores.shape[1]
    end_index = max_index % scores.shape[1]
    
    inputs_with_offsets = tokenizer(label, context, return_offsets_mapping=True, truncation='only_second',
                                   max_length=max_length)
    offsets = inputs_with_offsets["offset_mapping"]
    
    start_char, _ = offsets[start_index]
    _, end_char = offsets[end_index]
    predicted_answer = context[start_char:end_char]
    
    # Postprocessing
    if len(predicted_answer) <= empty_threshold:
        predicted_answer = ''
        start_char = 0
        end_char = 0

    predictions.append({
        'text': [predicted_answer],
        'answer_start': [start_char],
        'answer_end': [end_char]
    })

318it [00:23, 13.65it/s]


In [357]:
test['extracted_part'] = predictions

In [358]:
test.to_json(f'predictions_{model_name_to_save}_8epochs_post.json', orient='records', force_ascii=False)

In [None]:
!python validate.py --predict qa/predictions_$model_name_to_save.json --gt data/test_with_labels.json

## Results of QA pipelines

### cointegrated/rubert-tiny2 (can work with 2048 max_len but were cutted to 512)
- Accuracy: 55.35%
### M-CLIP/M-BERT-Distil-40
- Accuracy: 72.33% (8 epochs)
### DeepPavlov/rubert-base-cased (baseline)
- Accuracy: 75.80%
### distilbert-base-multilingual-cased
- Accuracy: 78.62% (8 epochs + postprocessing)
### ai-forever/sbert_large_mt_nlu_ru
- Accuracy: 83.65%
### ai-forever/ruBert-large
- Accuracy: 83.02%
- Accuracy: 84.91% (8 epochs)

**Defaults** = sequence length 512, batch size 2, epochs 4

**postprocessing** = make prediction empty if the predicted part length in chars is less than or equal to 30 chars (30 is because I assumed token mean len is 6 while the smallest extracted part is 5 tokens) 