## Answer single Questions

In [10]:
import electra
from utils import *

# Create an `electra.Electra` instance with default constructor arguments;
# the following cells call its `predict(question, context)` method.
testmodel = electra.Electra()

In [11]:
# Probe 1: the context mentions exactly one name. The displayed result is a dict
# with 'score', 'start', 'end' (character offsets into the context) and 'answer'.
testmodel.predict("who am I?", "My name is the Roman.")

{'score': 0.768954336643219, 'start': 15, 'end': 20, 'answer': 'Roman'}

In [12]:
# Probe 2: the correct name comes first and a second name is negated afterwards.
testmodel.predict("who am I?", "My name is Roman but not Vincenzo.")

{'score': 0.295525461435318,
 'start': 11,
 'end': 33,
 'answer': 'Roman but not Vincenzo'}

In [13]:
# Probe 3: two names joined by "and" in the same context.
testmodel.predict("who am I?", "My name is Roman and Vincenzo.")

{'score': 0.5456690788269043,
 'start': 11,
 'end': 29,
 'answer': 'Roman and Vincenzo'}

In [14]:
# Probe 4: the first name is negated and the second one is the correct answer.
testmodel.predict("who am I?", "My name is not Roman but Vincenzo.")

{'score': 0.8158816695213318, 'start': 25, 'end': 33, 'answer': 'Vincenzo'}

## Answer multiple Questions

In [15]:
from transformers import Trainer, default_data_collator, AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments
from datasets import load_dataset, load_metric
import torch
import numpy as np

In [16]:
# Directory holding the fine-tuned checkpoint; shared by tokenizer and model.
MODEL_DIR = "./test-squad-trained_electra"
# Fall back to the CPU so this cell does not crash on machines without CUDA.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR).to(DEVICE)

config = load_yaml_file('electra_config.yaml')

## Settings read from the YAML config
GPU_USAGE = config['GPU_USAGE']
BATCH_SIZE = config['BATCH_SIZE']
MAX_LENGTH = config['MAX_LENGTH']
DOC_STRIDE = config['DOC_STRIDE']
MAX_ANSWER_LENGTH = config['MAX_ANSWER_LENGTH']
squad_v2 = config['squad_v2']
N_BEST_SIZE = config['N_BEST_SIZE']
## Settings derived from the tokenizer
PAD_RIGHT = tokenizer.padding_side == "right"

# Pick the dataset variant from the same flag the metric cell uses
# ("squad_v2" if squad_v2 else "squad"), so data and metric cannot disagree.
datasets = load_dataset("squad_v2" if squad_v2 else "squad")
data_collator = default_data_collator

Reusing dataset squad_v2 (C:\Users\vince\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 71.42it/s]


In [17]:
# Report whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [18]:
# Keyword arguments forwarded to `prepare_train_features` for every batch.
train_feature_kwargs = {
    'tokenizer': tokenizer,
    'PAD_RIGHT': PAD_RIGHT,
    'MAX_LENGTH': MAX_LENGTH,
    'DOC_STRIDE': DOC_STRIDE,
}
# The raw columns are dropped so only the produced features remain.
raw_train_columns = datasets["train"].column_names

tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=raw_train_columns,
    fn_kwargs=train_feature_kwargs,
)

Loading cached processed dataset at C:\Users\vince\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d\cache-07172c5262b075b2.arrow
Loading cached processed dataset at C:\Users\vince\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d\cache-409600a060f88279.arrow


In [19]:
model_name = "test-squad-trained"

import os  # kept so `os` stays available in the kernel namespace for any cell that relies on it

# Logging integrations are switched off through `report_to` rather than the
# deprecated WANDB_DISABLED environment variable (which triggers a deprecation
# warning and is scheduled for removal in transformers v5).
args = TrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
)

# The Trainer is used below only for batched prediction on the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# Keyword arguments forwarded to `prepare_validation_features` for every batch.
validation_feature_kwargs = {
    'tokenizer': tokenizer,
    'PAD_RIGHT': PAD_RIGHT,
    'MAX_LENGTH': MAX_LENGTH,
    'DOC_STRIDE': DOC_STRIDE,
}
validation_split = datasets["validation"]

# Featurize the validation split only; the raw columns are removed from the result.
validation_features = validation_split.map(
    prepare_validation_features,
    batched=True,
    remove_columns=validation_split.column_names,
    fn_kwargs=validation_feature_kwargs,
)

Loading cached processed dataset at C:\Users\vince\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d\cache-94b7a3ec88df0c48.arrow


In [21]:
# Run the model over all validation features; `raw_predictions.predictions`
# is what the post-processing cell consumes.
raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `ElectraForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping.
***** Running Prediction *****
  Num examples = 12134
  Batch size = 16
100%|█████████▉| 758/759 [02:01<00:00,  9.92it/s]

In [22]:
# Take exactly one batch from the evaluation dataloader. Unlike `for ...: break`,
# `next(iter(...))` fails loudly on an empty loader instead of leaving `batch`
# unbound or silently reusing a value from an earlier run of this cell.
batch = next(iter(trainer.get_eval_dataloader()))
# Move every tensor in the batch to the device the trainer runs on.
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
# Inference only: no gradients are needed for a forward pass.
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

odict_keys(['loss', 'start_logits', 'end_logits'])

In [23]:
# Logits for the first feature of the batch, copied to the CPU as NumPy arrays.
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to match the example_id to
# an example index.
context = datasets["validation"][0]["context"]

# Gather the indices of the N_BEST_SIZE highest start/end logits, best first.
start_indexes = np.argsort(start_logits)[-1 : -N_BEST_SIZE - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -N_BEST_SIZE - 1 : -1].tolist()
n_offsets = len(offset_mapping)
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
                start_index >= n_offsets
                or end_index >= n_offsets
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > MAX_ANSWER_LENGTH:
            continue
        # From here on start_index <= end_index is guaranteed by the check above.
        start_char = offset_mapping[start_index][0]
        end_char = offset_mapping[end_index][1]
        valid_answers.append(
            {
                "score": start_logits[start_index] + end_logits[end_index],
                "text": context[start_char: end_char]
            }
        )

# Keep the N_BEST_SIZE candidates with the highest combined start+end score.
valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:N_BEST_SIZE]
valid_answers

[{'score': 13.208599, 'text': 'France'},
 {'score': 8.978622, 'text': 'France.'},
 {'score': 5.863811,
  'text': 'France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway'},
 {'score': 5.8469973, 'text': 'in France'},
 {'score': 4.745803,
  'text': 'France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark'},
 {'score': 4.712638, 'text': 'a region in France'},
 {'score': 4.3390865,
  'text': 'France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland'},
 {'score': 3.6333709, 'text': 'Normandy, a region in France'},
 {'score': 2.5342517, 'text': 'region in France'},
 {'score': 1.8388834, 'text': ', a region in France'},
 {'score': 1.6170213, 'text': 'in France.'},
 {'score': 0.48266208, 'text': 'a region in France.'},
 {'score': -0.59660506, 'text': 'Normandy, a region in France.'},
 {'score': -1.0939817, 'text': '.'},


In [24]:
# Convert the raw model outputs into final answers keyed by example id, using the
# settings already extracted from the config (squad_v2, N_BEST_SIZE, MAX_ANSWER_LENGTH).
final_predictions = postprocess_qa_predictions(
    datasets["validation"],
    validation_features,
    raw_predictions.predictions,
    tokenizer,
    squad_v2,
    n_best_size=N_BEST_SIZE,
    max_answer_length=MAX_ANSWER_LENGTH,
)

Post-processing 11873 example predictions split into 12134 features.


100%|██████████| 11873/11873 [00:26<00:00, 453.01it/s]


In [25]:
# Choose the metric variant from the same flag that drives the rest of the notebook.
metric_name = "squad_v2" if squad_v2 else "squad"
metric = load_metric(metric_name)

# Only the squad_v2 branch adds the extra "no_answer_probability" field to each prediction.
extra_fields = {"no_answer_probability": 0.0} if squad_v2 else {}
formatted_predictions = [
    {"id": example_id, "prediction_text": answer_text, **extra_fields}
    for example_id, answer_text in final_predictions.items()
]
references = [
    {"id": example["id"], "answers": example["answers"]}
    for example in datasets["validation"]
]
metric.compute(predictions=formatted_predictions, references=references)

{'exact': 63.06746399393582,
 'f1': 68.22302029014142,
 'total': 11873,
 'HasAns_exact': 55.83670715249663,
 'HasAns_f1': 66.16260457234176,
 'HasAns_total': 5928,
 'NoAns_exact': 70.2775441547519,
 'NoAns_f1': 70.2775441547519,
 'NoAns_total': 5945,
 'best_exact': 63.07588646508886,
 'best_exact_thresh': 0.0,
 'best_f1': 68.23144276129486,
 'best_f1_thresh': 0.0}

In [41]:
# Average the scores returned by `calculate_bleu_score` and report how many
# prediction/reference comparisons contributed to the average.
bleus = calculate_bleu_score(formatted_predictions, references)
n_comparisons = len(bleus)
bleu_avg = np.sum(bleus) / n_comparisons
print('BLEU-2 avg:', bleu_avg, 'of', n_comparisons, 'comparisons')

BLEU-2 avg: 0.7976333814292565 of 4826 comparisons


In [40]:
# Same report as the previous BLEU cell, but using `calculate_bleu_score_new`.
bleus = calculate_bleu_score_new(formatted_predictions, references)
n_comparisons = len(bleus)
bleu_avg = np.sum(bleus) / n_comparisons
print('BLEU-2 avg:', bleu_avg, 'of', n_comparisons, 'comparisons')

BLEU-2 avg: 0.6761036552495234 of 11873 comparisons


In [None]:
# Bucket each example index by whether the model produced an answer and whether a
# reference answer exists. "tp" here only means both are non-empty; it does not
# check that the predicted text matches the reference.
tp, fp, tn, fn = [], [], [], []
for i, prediction in enumerate(formatted_predictions):
    pred = prediction['prediction_text']
    ref = references[i]['answers']['text']
    predicted_something = len(pred) != 0
    has_reference = len(ref) != 0
    if predicted_something and has_reference:
        tp.append(i)
    elif predicted_something:
        fp.append(i)
    elif has_reference:
        fn.append(i)
    else:
        tn.append(i)

print('{} tp | {} fp\n-----------------\n{} fn | {} tn'.format(len(tp),len(fp),len(fn),len(tn)))

4823 tp | 1765 fp
-----------------
1105 fn | 4180 tn


In [None]:
# Print a seeded random sample (without replacement) of false-positive examples:
# the dataset index, the raw example, the model's prediction and the reference.
fp_n = 3
np.random.seed(42)
idxs_fp = np.random.choice(len(fp), size=fp_n, replace=False)

for sampled_pos in idxs_fp:
    # `sampled_pos` indexes into `fp`, which in turn stores validation-set indices.
    idx_fp = fp[int(sampled_pos)]
    print(idx_fp)
    print(datasets['validation'][idx_fp])
    print('\n' + str(formatted_predictions[idx_fp]))
    print('\n' + str(references[idx_fp]) + '\n\n\n')

10742
{'id': '5ad4d1785b96ef001a10a1b2', 'title': 'Warsaw', 'context': 'Warsaw lies in east-central Poland about 300 km (190 mi) from the Carpathian Mountains and about 260 km (160 mi) from the Baltic Sea, 523 km (325 mi) east of Berlin, Germany. The city straddles the Vistula River. It is located in the heartland of the Masovian Plain, and its average elevation is 100 metres (330 ft) above sea level. The highest point on the left side of the city lies at a height of 115.7 metres (379.6 ft) ("Redutowa" bus depot, district of Wola), on the right side – 122.1 metres (400.6 ft) ("Groszówka" estate, district of Wesoła, by the eastern border). The lowest point lies at a height 75.6 metres (248.0 ft) (at the right bank of the Vistula, by the eastern border of Warsaw). There are some hills (mostly artificial) located within the confines of the city – e.g. Warsaw Uprising Hill (121 metres (397.0 ft)), Szczęśliwice hill (138 metres (452.8 ft) – the highest point of Warsaw in general).', 'questi

In [None]:
# Print a seeded random sample (without replacement) of false-negative examples:
# the dataset index, the raw example, the model's prediction and the reference.
fn_n = 3
np.random.seed(42)
idxs_fn = np.random.choice(len(fn), size=fn_n, replace=False)

for sampled_pos in idxs_fn:
    # `sampled_pos` indexes into `fn`, which in turn stores validation-set indices.
    idx_fn = fn[int(sampled_pos)]
    print(idx_fn)
    print(datasets['validation'][idx_fn])
    print('\n' + str(formatted_predictions[idx_fn]))
    print('\n' + str(references[idx_fn]) + '\n\n\n')

3397
{'id': '5729e500af94a219006aa6b6', 'title': 'Amazon_rainforest', 'context': 'Following the Cretaceous–Paleogene extinction event, the extinction of the dinosaurs and the wetter climate may have allowed the tropical rainforest to spread out across the continent. From 66–34 Mya, the rainforest extended as far south as 45°. Climate fluctuations during the last 34 million years have allowed savanna regions to expand into the tropics. During the Oligocene, for example, the rainforest spanned a relatively narrow band. It expanded again during the Middle Miocene, then retracted to a mostly inland formation at the last glacial maximum. However, the rainforest still managed to thrive during these glacial periods, allowing for the survival and evolution of a broad diversity of species.', 'question': 'Beginning how many years ago did the amazon rainforest extend 45 degrees south?', 'answers': {'text': ['66–34 Mya', '66–34'], 'answer_start': [190, 190]}}

{'id': '5729e500af94a219006aa6b6', 'p

In [None]:
from transformers import pipeline

# Load a second copy of the checkpoint (no `.to(...)` call here) and wrap it,
# together with the already-loaded tokenizer, in a question-answering pipeline.
qa_checkpoint_dir = "./test-squad-trained_electra"
model_cpu = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint_dir)
qa_prediction = pipeline(
    'question-answering',
    tokenizer=tokenizer,
    model=model_cpu,
)

loading configuration file ./test-squad-trained_electra/config.json
Model config ElectraConfig {
  "_name_or_path": "./test-squad-trained_electra",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./test-squad-trained_electra/pytorch_model.bin
All model checkpoint weights were used 