In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

In [3]:
import os
import sys

# Put the parent of the notebook's working directory on the import path
# (presumably where the project-local `lrp_engine` package lives -- confirm).
# The path is normalised and only appended when absent, so re-running this
# cell does not keep growing sys.path with duplicate "cwd/.." entries.
module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
# Hugging Face checkpoint used for the benchmark (the name suggests RoBERTa-large
# fine-tuned for SQuAD 2.0 question answering).
model_name = "deepset/roberta-large-squad2"

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the QA model, then move it to the chosen device and cast it to bfloat16.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model = model.to(device=device, dtype=torch.bfloat16)

In [5]:
from lrp_engine import LRPEngine, checkpoint_hook
from lrp_engine.lrp_graph import make_graph

In [6]:
# Relevance-propagation engine shared by every benchmark run below.
# The dtype matches the bfloat16 model loaded above; the exact meaning of
# `topk` and `use_gamma` is defined by the project-local LRPEngine -- confirm there.
lrp = LRPEngine(
    topk=1,
    use_gamma=True,
    dtype=torch.bfloat16,
)

# SQuADv2 Benchmark

In [8]:
from datasets import load_dataset

# Fetch the SQuAD v2 dataset; the benchmark below reads its "validation" split.
dataset = load_dataset(path="squad_v2")

In [9]:
# QA version

from tqdm import tqdm
import json
import pickle

def run_roberta_squad_benchmark(model, tokenizer, lrp, dataset, run_name, question_first=True, model_topk=2, max_length=512):
    """Benchmark a QA model and its LRP attributions on the SQuAD validation split.

    For every answerable example the model predicts an answer span, the span is
    scored against the gold answers by character overlap, and LRP is run from the
    start/end logits so that the top attributed tokens can be compared with the
    predicted span and with the gold answers. Running metrics are printed every
    100 scored examples and the final data is written to
    ``roberta_squadv2_results_{run_name}.json`` (``.pkl`` if JSON encoding fails).

    Relies on the notebook-level names ``device``, ``tqdm``, ``json``, ``pickle``
    and ``torch``.

    Args:
        model: QA model; called with the input ids, must return an object with
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer supporting ``return_offsets_mapping`` and ``decode``,
            exposing ``eos_token_id``.
        lrp: Engine whose ``run((start_logits, end_logits))`` returns
            ``(checkpoint_vals, param_vals)``; ``param_vals[2]`` is used as the
            per-input-token relevance (assumed to hold one value per token of the
            single sequence in the batch -- confirm against LRPEngine).
        dataset: Mapping with a ``"validation"`` split of SQuAD-style examples.
        run_name: Suffix used in the output file name.
        question_first: If True the input is (question, context), otherwise
            (context, question).
        model_topk: Number of top LRP tokens compared with the predicted span.
        max_length: Examples tokenizing to more than this many tokens are skipped.

    Returns:
        dict with ``"summary"`` and ``"results"``. If both the JSON and the pickle
        write fail, returns the tuple ``(data, exception)`` instead.
    """
    results = []
    top1_label_hits = 0
    top1_model_hits = 0
    total_examples = 0
    total_strict_intersect = 0
    total_span_intersect = 0
    total_strict_union = 0
    total_span_union = 0
    model_exact_matches = 0
    lrp_model_exact_matches = 0
    precision_denom = 0
    recall_denom = 0
    total_overlap = 0
    total_overlap_ratio = 0.0
    total_start_end_skips = 0
    total_empty_answer_skips = 0
    total_unanswerable = 0
    total_length_skips = 0

    def ratio(numerator, denominator):
        # Zero denominators (nothing scored yet, or no overlap at all) yield 0.0
        # instead of raising and throwing away the results of a long run.
        return float(numerator) / denominator if denominator else 0.0

    def metrics():
        # Single source of truth for the progress printout and the saved summary:
        # (summary key, progress label, current value).
        precision = ratio(total_overlap, precision_denom)
        recall = ratio(total_overlap, recall_denom)
        f1 = 2 / ((1 / precision) + (1 / recall)) if precision and recall else 0.0
        return [
            ("total_examples", "Total examples: ", total_examples),
            ("model_exact_matches", "Model exact matches: ", model_exact_matches),
            ("model_exact_match_pct", "Model exact match %: ", ratio(model_exact_matches, total_examples)),
            ("model_precision", "Model precision: ", precision),
            ("model_recall", "Model recall: ", recall),
            ("model_f1", "Model F1: ", f1),
            ("model_overlap_ratio", "Model overlap ratio: ", ratio(total_overlap_ratio, total_examples)),
            ("lrp_top1_model_answer_hits", "LRP Top-1 model answer hits: ", top1_model_hits),
            ("lrp_top1_model_answer_hit_pct", "LRP Top-1 model answer hit %: ", ratio(top1_model_hits, total_examples)),
            ("lrp_top1_label_answer_hits", "LRP Top-1 label answer hits: ", top1_label_hits),
            ("lrp_top1_label_answer_hit_pct", "LRP Top-1 label answer hit %: ", ratio(top1_label_hits, total_examples)),
            (f"lrp_top{model_topk}_strict_accuracy", f"LRP Top-{model_topk} Strict Accuracy: ", ratio(total_strict_intersect, model_topk * total_examples)),
            (f"lrp_top{model_topk}_span_accuracy", f"LRP Top-{model_topk} Span Accuracy: ", ratio(total_span_intersect, model_topk * total_examples)),
            ("lrp_model_exact_matches", "LRP-Model exact matches: ", lrp_model_exact_matches),
            ("lrp_model_strict_iou", "LRP-Model Strict IoU: ", ratio(total_strict_intersect, total_strict_union)),
            ("lrp_model_span_iou", "LRP-Model Span IoU: ", ratio(total_span_intersect, total_span_union)),
            ("total_start_end_skips", "Total start-end skips: ", total_start_end_skips),
            ("total_empty_answer_skips", "Total empty model answer skips: ", total_empty_answer_skips),
            ("total_unanswerable", "Total unanswerable: ", total_unanswerable),
            ("total_length_skips", "Total over-length skips: ", total_length_skips),
        ]

    for example in tqdm(dataset["validation"]):
        question = example["question"]
        context = example["context"]
        answers = example["answers"]["text"]
        answer_start_inds = example["answers"]["answer_start"]

        if not answers:
            total_unanswerable += 1
            continue

        # Gold answers as [start, end) character ranges within the context.
        answer_char_ranges = [(i, i + len(a)) for (a, i) in zip(answers, answer_start_inds)]

        first_text, second_text = (question, context) if question_first else (context, question)
        tokenized = tokenizer(first_text, second_text, return_tensors="pt", return_offsets_mapping=True)
        if tokenized["input_ids"].shape[-1] > max_length:
            total_length_skips += 1
            continue
        input_ids = tokenized["input_ids"].to(device)
        # Plain Python copies: avoids mutating the tokenizer output in place and
        # keeps all character arithmetic below in exact integers.
        token_ids = input_ids[0].tolist()
        offset_mapping = tokenized["offset_mapping"][0].tolist()

        output = model(input_ids)

        # Tokenizer outputs <s><question></s></s><context></s>
        # We need to mask the question tokens in the start/end logits, this is standard in QA evals
        sep_token_ind = token_ids.index(tokenizer.eos_token_id)
        mask = torch.zeros_like(output.start_logits, dtype=torch.bool)
        if question_first:
            mask[:, sep_token_ind + 1:] = True
        else:
            mask[:, :sep_token_ind + 1] = True

        masked_start_logits = output.start_logits.masked_fill(~mask, -float("inf"))
        masked_end_logits = output.end_logits.masked_fill(~mask, -float("inf"))
        start = int(torch.argmax(masked_start_logits))
        end = int(torch.argmax(masked_end_logits))

        if start > end:
            total_start_end_skips += 1
            continue

        # Get the decoded answer per the model ("<unk>" is replaced by a single
        # character, presumably so its length matches the source text -- confirm)
        model_answer = tokenizer.decode(token_ids[start:end + 1], skip_special_tokens=False).replace("<unk>", "x").strip()
        if model_answer == "":
            total_empty_answer_skips += 1
            continue

        # Convert from token inds to char inds since that's how the dataset tracks the labels
        char_start = offset_mapping[start][0]
        if char_start < len(context) and context[char_start] == " ":
            char_start += 1
        char_end = offset_mapping[end][1]

        # Score the prediction against every gold answer and keep the best one.
        # The ratio is a Jaccard similarity: overlapping chars divided by
        # (label chars + predicted-answer chars - overlapping chars).
        candidates = []
        for answer_start, answer_end in answer_char_ranges:
            ans_overlap = max(0, min(char_end, answer_end) - max(char_start, answer_start))
            ans_len = answer_end - answer_start
            candidates.append((ans_overlap, ans_len, ans_overlap / (ans_len + len(model_answer) - ans_overlap)))
        best_overlap, best_answer_len, best_overlap_ratio = max(candidates, key=lambda c: c[2])
        total_overlap += best_overlap
        total_overlap_ratio += best_overlap_ratio
        precision_denom += len(model_answer)
        recall_denom += best_answer_len

        model_exact_match = best_overlap_ratio == 1
        if model_exact_match:
            model_exact_matches += 1

        # Now run LRP
        # Position 0 of both logit vectors is zeroed in place before propagation
        # (presumably so the <s> position contributes no relevance -- confirm).
        lrp_input1, lrp_input2 = output.start_logits, output.end_logits
        lrp_input1[0][0] = 0
        lrp_input2[0][0] = 0
        checkpoint_vals, param_vals = lrp.run((lrp_input1, lrp_input2))
        relevance = param_vals[2]

        lrp_max = int(relevance.masked_fill(~mask, -float("inf")).flatten().argmax())
        lrp_top_token = tokenizer.decode([token_ids[lrp_max]], skip_special_tokens=True).strip().replace(chr(9601), "")

        # Do model answer-based accuracy, i.e. is the attribution aligned with the model prediction
        if start <= lrp_max <= end:
            top1_model_hits += 1

        # Do IoU with the model prediction
        # We split into two ways to compute:
        # For Strict IoU we do not count tokens IN BETWEEN start and end because the model is meant to only output 2 signals for start and end.
        # So, strict_intersect = 2, strict_union = 2 if LRP top-2 and the model prediction indices matched exactly
        # strict_intersect = 1, strict_union = 3 if only 1 of the LRP top-2 was a model prediction index
        # strict_intersect = 0, strict_union = 4 if neither LRP top-2 were a model prediction index
        # We double count in the case of single word answers (start = end), and do not consider the second place attributed token if the first
        #   place token was the answer.
        # For Span IoU, we DO count tokens in between start and end
        # NOTE(review): unlike lrp_max above, the top-k here is taken over the
        # unmasked relevance; kept as in the original -- confirm this is intended.
        strict_intersect = 0
        span_intersect = 0
        strict_union = model_topk
        span_union = model_topk
        found_start = False
        found_end = False
        for top_ind in relevance.flatten().topk(model_topk).indices.tolist():
            if found_start and found_end:
                break
            if start == top_ind:
                found_start = True
                strict_intersect += 1
                span_intersect += 1

            if end == top_ind:   # Crucial to keep these two as separate ifs for single-word answers
                found_end = True
                strict_intersect += 1
                span_intersect += 1

            if start < top_ind < end:
                strict_union += 1
                span_intersect += 1

            elif top_ind < start or end < top_ind:
                strict_union += 1
                span_union += 1

        lrp_model_exact_match = strict_intersect == strict_union
        if lrp_model_exact_match:
            lrp_model_exact_matches += 1
        total_strict_intersect += strict_intersect
        total_strict_union += strict_union
        total_span_intersect += span_intersect
        total_span_union += span_union

        # Do label-based accuracy: is the attribution aligned with the ground truth label.
        # An empty decoded token (e.g. a special token) is a substring of every
        # answer, so it must not be counted as a hit.
        if lrp_top_token and any(lrp_top_token in ans for ans in answers):
            top1_label_hits += 1

        flat_relevance = relevance.flatten()
        lrp_top5 = flat_relevance.topk(k=min(5, flat_relevance.numel()))
        lrp_top5_tokens = tokenizer.decode([token_ids[i] for i in lrp_top5.indices.tolist()])
        results.append({
            "example": example,
            "model_exact_match": bool(model_exact_match),
            "lrp_top1_ind": lrp_max,
            "model_start_end": (start, end),
            "lrp_top1_token": lrp_top_token,
            "lrp_top5_tokens": lrp_top5_tokens,
            "lrp_top5_relevances": lrp_top5.values.detach().cpu().tolist(),
            "lrp_model_strict_intersect": strict_intersect,
            "lrp_model_strict_union": strict_union,
            "lrp_model_span_intersect": span_intersect,
            "lrp_model_span_union": span_union,
            "lrp_model_exact_match": lrp_model_exact_match,
        })

        total_examples += 1
        if not (total_examples % 100):
            for _, label, value in metrics():
                print(label, value)

    data = {
        "summary": {key: value for key, _, value in metrics()},
        "results": results
    }

    # Encode first, write second: a TypeError during encoding then cannot leave
    # a truncated .json file behind.
    try:
        payload = json.dumps(data, indent=4)
    except TypeError:
        print("Failed JSON write, attempting pickle...")
        try:
            with open(f"roberta_squadv2_results_{run_name}.pkl", "wb") as f:
                pickle.dump(data, f)
        except Exception as e:
            print("Encountered exception while pickling, will return (data, exception)...")
            return data, e
    else:
        with open(f"roberta_squadv2_results_{run_name}.json", "w") as f:
            f.write(payload)
    return data

In [33]:
# Run the benchmark with the gamma-configured LRP engine defined above;
# the run name becomes the suffix of the results file.
results = run_roberta_squad_benchmark(
    model,
    tokenizer,
    lrp,
    dataset,
    run_name="gamma",
)

  2%|█▍                                                                            | 218/11873 [00:29<24:56,  7.79it/s]

Total examples:  100
Model exact matches:  83
Model exact match %:  0.83
Model precision:  0.8022130131721497
Model recall:  0.9243028163909912
Model F1:  0.858941151565508
Model overlap ratio:  0.888780951499939
LRP Top-1 model answer hits:  96
LRP Top-1 model answer hit %:  0.96
LRP Top-1 label answer hits:  94
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.89
LRP Top-2 Span Accuracy:  0.945
LRP-Model exact matches:  82
LRP-Model Strict IoU:  0.8018018018018018
LRP-Model Span IoU:  0.8957345971563981
Total start-end skips:  1
Total empty model answer skips:  0
Total unanswerable:  117


  4%|██▊                                                                           | 434/11873 [00:59<18:59, 10.04it/s]

Total examples:  200
Model exact matches:  155
Model exact match %:  0.775
Model precision:  0.7599042654037476
Model recall:  0.8984596133232117
Model F1:  0.8233938295634095
Model overlap ratio:  0.8675488829612732
LRP Top-1 model answer hits:  189
LRP Top-1 model answer hit %:  0.945
LRP Top-1 label answer hits:  182
LRP Top-1 label answer hit %:  0.91
LRP Top-2 Strict Accuracy:  0.885
LRP Top-2 Span Accuracy:  0.93
LRP-Model exact matches:  165
LRP-Model Strict IoU:  0.7937219730941704
LRP-Model Span IoU:  0.8691588785046729
Total start-end skips:  3
Total empty model answer skips:  0
Total unanswerable:  231


  5%|████▏                                                                         | 639/11873 [01:28<41:24,  4.52it/s]

Total examples:  300
Model exact matches:  235
Model exact match %:  0.7833333333333333
Model precision:  0.7875936627388
Model recall:  0.9023756980895996
Model F1:  0.8410867057098496
Model overlap ratio:  0.8806749582290649
LRP Top-1 model answer hits:  286
LRP Top-1 model answer hit %:  0.9533333333333334
LRP Top-1 label answer hits:  277
LRP Top-1 label answer hit %:  0.9233333333333333
LRP Top-2 Strict Accuracy:  0.8983333333333333
LRP Top-2 Span Accuracy:  0.94
LRP-Model exact matches:  253
LRP-Model Strict IoU:  0.8154311649016641
LRP-Model Span IoU:  0.8867924528301887
Total start-end skips:  3
Total empty model answer skips:  0
Total unanswerable:  336


  7%|█████▍                                                                        | 828/11873 [01:57<34:08,  5.39it/s]

Total examples:  400
Model exact matches:  323
Model exact match %:  0.8075
Model precision:  0.7694311141967773
Model recall:  0.9146162867546082
Model F1:  0.8357653450639807
Model overlap ratio:  0.8932269811630249
LRP Top-1 model answer hits:  383
LRP Top-1 model answer hit %:  0.9575
LRP Top-1 label answer hits:  371
LRP Top-1 label answer hit %:  0.9275
LRP Top-2 Strict Accuracy:  0.895
LRP Top-2 Span Accuracy:  0.94375
LRP-Model exact matches:  334
LRP-Model Strict IoU:  0.8099547511312217
LRP-Model Span IoU:  0.893491124260355
Total start-end skips:  8
Total empty model answer skips:  0
Total unanswerable:  420


  9%|██████▌                                                                      | 1020/11873 [02:26<18:29,  9.78it/s]

Total examples:  500
Model exact matches:  410
Model exact match %:  0.82
Model precision:  0.7859138250350952
Model recall:  0.922393798828125
Model F1:  0.848701988446674
Model overlap ratio:  0.9003891348838806
LRP Top-1 model answer hits:  482
LRP Top-1 model answer hit %:  0.964
LRP Top-1 label answer hits:  468
LRP Top-1 label answer hit %:  0.936
LRP Top-2 Strict Accuracy:  0.907
LRP Top-2 Span Accuracy:  0.952
LRP-Model exact matches:  426
LRP-Model Strict IoU:  0.8298261665141812
LRP-Model Span IoU:  0.9083969465648855
Total start-end skips:  8
Total empty model answer skips:  0
Total unanswerable:  512


 10%|███████▊                                                                     | 1212/11873 [02:55<27:23,  6.49it/s]

Total examples:  600
Model exact matches:  493
Model exact match %:  0.8216666666666667
Model precision:  0.7976821064949036
Model recall:  0.9278326630592346
Model F1:  0.8578489459526415
Model overlap ratio:  0.9034292697906494
LRP Top-1 model answer hits:  580
LRP Top-1 model answer hit %:  0.9666666666666667
LRP Top-1 label answer hits:  564
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.905
LRP Top-2 Span Accuracy:  0.955
LRP-Model exact matches:  507
LRP-Model Strict IoU:  0.8264840182648402
LRP-Model Span IoU:  0.9138755980861244
Total start-end skips:  9
Total empty model answer skips:  0
Total unanswerable:  603


 12%|█████████▏                                                                   | 1412/11873 [03:23<30:53,  5.65it/s]

Total examples:  700
Model exact matches:  572
Model exact match %:  0.8171428571428572
Model precision:  0.7913013696670532
Model recall:  0.9256272315979004
Model F1:  0.8532097323382066
Model overlap ratio:  0.8956931829452515
LRP Top-1 model answer hits:  678
LRP Top-1 model answer hit %:  0.9685714285714285
LRP Top-1 label answer hits:  653
LRP Top-1 label answer hit %:  0.9328571428571428
LRP Top-2 Strict Accuracy:  0.9078571428571428
LRP Top-2 Span Accuracy:  0.9578571428571429
LRP-Model exact matches:  594
LRP-Model Strict IoU:  0.8312622629169392
LRP-Model Span IoU:  0.9191226867717615
Total start-end skips:  10
Total empty model answer skips:  0
Total unanswerable:  702


 14%|██████████▍                                                                  | 1612/11873 [03:53<24:00,  7.12it/s]

Total examples:  800
Model exact matches:  662
Model exact match %:  0.8275
Model precision:  0.7938859462738037
Model recall:  0.9275521636009216
Model F1:  0.8555295980663745
Model overlap ratio:  0.8996279835700989
LRP Top-1 model answer hits:  777
LRP Top-1 model answer hit %:  0.97125
LRP Top-1 label answer hits:  747
LRP Top-1 label answer hit %:  0.93375
LRP Top-2 Strict Accuracy:  0.9075
LRP Top-2 Span Accuracy:  0.960625
LRP-Model exact matches:  676
LRP-Model Strict IoU:  0.8306636155606407
LRP-Model Span IoU:  0.9242333132892363
Total start-end skips:  11
Total empty model answer skips:  0
Total unanswerable:  801


 15%|███████████▋                                                                 | 1805/11873 [04:22<33:01,  5.08it/s]

Total examples:  900
Model exact matches:  745
Model exact match %:  0.8277777777777777
Model precision:  0.8007665276527405
Model recall:  0.9288681745529175
Model F1:  0.860073565632518
Model overlap ratio:  0.9032806158065796
LRP Top-1 model answer hits:  877
LRP Top-1 model answer hit %:  0.9744444444444444
LRP Top-1 label answer hits:  844
LRP Top-1 label answer hit %:  0.9377777777777778
LRP Top-2 Strict Accuracy:  0.9083333333333333
LRP Top-2 Span Accuracy:  0.965
LRP-Model exact matches:  759
LRP-Model Strict IoU:  0.8320610687022901
LRP-Model Span IoU:  0.9323671497584541
Total start-end skips:  11
Total empty model answer skips:  0
Total unanswerable:  894


 17%|█████████████                                                                | 2014/11873 [04:52<24:20,  6.75it/s]

Total examples:  1000
Model exact matches:  839
Model exact match %:  0.839
Model precision:  0.8110668063163757
Model recall:  0.933892011642456
Model F1:  0.8681566619586464
Model overlap ratio:  0.9096122980117798
LRP Top-1 model answer hits:  975
LRP Top-1 model answer hit %:  0.975
LRP Top-1 label answer hits:  940
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.911
LRP Top-2 Span Accuracy:  0.9665
LRP-Model exact matches:  848
LRP-Model Strict IoU:  0.8365472910927456
LRP-Model Span IoU:  0.9351717464925012
Total start-end skips:  11
Total empty model answer skips:  0
Total unanswerable:  1003


 19%|██████████████▍                                                              | 2220/11873 [05:22<28:48,  5.58it/s]

Total examples:  1100
Model exact matches:  929
Model exact match %:  0.8445454545454546
Model precision:  0.8187071681022644
Model recall:  0.9377298355102539
Model F1:  0.8741857937365202
Model overlap ratio:  0.9122279286384583
LRP Top-1 model answer hits:  1074
LRP Top-1 model answer hit %:  0.9763636363636363
LRP Top-1 label answer hits:  1034
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.9113636363636364
LRP Top-2 Span Accuracy:  0.9672727272727273
LRP-Model exact matches:  932
LRP-Model Strict IoU:  0.837160751565762
LRP-Model Span IoU:  0.9366197183098591
Total start-end skips:  12
Total empty model answer skips:  0
Total unanswerable:  1108


 20%|███████████████▍                                                             | 2388/11873 [05:50<16:18,  9.69it/s]

Total examples:  1200
Model exact matches:  1016
Model exact match %:  0.8466666666666667
Model precision:  0.8214691877365112
Model recall:  0.9390409588813782
Model F1:  0.8763291881339298
Model overlap ratio:  0.9140046238899231
LRP Top-1 model answer hits:  1171
LRP Top-1 model answer hit %:  0.9758333333333333
LRP Top-1 label answer hits:  1128
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.9108333333333334
LRP Top-2 Span Accuracy:  0.9675
LRP-Model exact matches:  1016
LRP-Model Strict IoU:  0.836266258607498
LRP-Model Span IoU:  0.937046004842615
Total start-end skips:  12
Total empty model answer skips:  0
Total unanswerable:  1176


 22%|████████████████▋                                                            | 2568/11873 [06:19<21:35,  7.18it/s]

Total examples:  1300
Model exact matches:  1100
Model exact match %:  0.8461538461538461
Model precision:  0.7892215251922607
Model recall:  0.9383786916732788
Model F1:  0.8573611591621417
Model overlap ratio:  0.9115230441093445
LRP Top-1 model answer hits:  1267
LRP Top-1 model answer hit %:  0.9746153846153847
LRP Top-1 label answer hits:  1220
LRP Top-1 label answer hit %:  0.9384615384615385
LRP Top-2 Strict Accuracy:  0.9088461538461539
LRP Top-2 Span Accuracy:  0.9665384615384616
LRP-Model exact matches:  1097
LRP-Model Strict IoU:  0.8329221008107155
LRP-Model Span IoU:  0.935243766282099
Total start-end skips:  12
Total empty model answer skips:  0
Total unanswerable:  1256


 23%|██████████████████                                                           | 2788/11873 [06:48<14:40, 10.32it/s]

Total examples:  1400
Model exact matches:  1179
Model exact match %:  0.8421428571428572
Model precision:  0.7928342223167419
Model recall:  0.9276265501976013
Model F1:  0.854950122985319
Model overlap ratio:  0.9085749387741089
LRP Top-1 model answer hits:  1364
LRP Top-1 model answer hit %:  0.9742857142857143
LRP Top-1 label answer hits:  1313
LRP Top-1 label answer hit %:  0.9378571428571428
LRP Top-2 Strict Accuracy:  0.9075
LRP Top-2 Span Accuracy:  0.9657142857142857
LRP-Model exact matches:  1178
LRP-Model Strict IoU:  0.8306636155606407
LRP-Model Span IoU:  0.9337016574585635
Total start-end skips:  12
Total empty model answer skips:  0
Total unanswerable:  1376


 25%|███████████████████▎                                                         | 2984/11873 [07:17<31:34,  4.69it/s]

Total examples:  1500
Model exact matches:  1263
Model exact match %:  0.842
Model precision:  0.7987043857574463
Model recall:  0.9314104318618774
Model F1:  0.8599678926419128
Model overlap ratio:  0.9091017842292786
LRP Top-1 model answer hits:  1463
LRP Top-1 model answer hit %:  0.9753333333333334
LRP Top-1 label answer hits:  1409
LRP Top-1 label answer hit %:  0.9393333333333334
LRP Top-2 Strict Accuracy:  0.906
LRP Top-2 Span Accuracy:  0.9663333333333334
LRP-Model exact matches:  1256
LRP-Model Strict IoU:  0.8281535648994516
LRP-Model Span IoU:  0.9348597226701064
Total start-end skips:  13
Total empty model answer skips:  0
Total unanswerable:  1471


 27%|████████████████████▌                                                        | 3175/11873 [07:48<12:44, 11.38it/s]

Total examples:  1600
Model exact matches:  1332
Model exact match %:  0.8325
Model precision:  0.810977041721344
Model recall:  0.9206355214118958
Model F1:  0.8623340896848848
Model overlap ratio:  0.906546413898468
LRP Top-1 model answer hits:  1562
LRP Top-1 model answer hit %:  0.97625
LRP Top-1 label answer hits:  1504
LRP Top-1 label answer hit %:  0.94
LRP Top-2 Strict Accuracy:  0.9034375
LRP Top-2 Span Accuracy:  0.966875
LRP-Model exact matches:  1331
LRP-Model Strict IoU:  0.8238814477058991
LRP-Model Span IoU:  0.9358741681790683
Total start-end skips:  13
Total empty model answer skips:  0
Total unanswerable:  1551


 29%|██████████████████████                                                       | 3409/11873 [08:18<23:01,  6.13it/s]

Total examples:  1700
Model exact matches:  1415
Model exact match %:  0.8323529411764706
Model precision:  0.8196014761924744
Model recall:  0.9139096140861511
Model F1:  0.864190224120356
Model overlap ratio:  0.9080095887184143
LRP Top-1 model answer hits:  1661
LRP Top-1 model answer hit %:  0.9770588235294118
LRP Top-1 label answer hits:  1604
LRP Top-1 label answer hit %:  0.9435294117647058
LRP Top-2 Strict Accuracy:  0.9023529411764706
LRP Top-2 Span Accuracy:  0.9679411764705882
LRP-Model exact matches:  1409
LRP-Model Strict IoU:  0.8220793140407289
LRP-Model Span IoU:  0.9378740381875178
Total start-end skips:  13
Total empty model answer skips:  0
Total unanswerable:  1661


 30%|███████████████████████                                                      | 3553/11873 [08:47<15:15,  9.08it/s]

Total examples:  1800
Model exact matches:  1499
Model exact match %:  0.8327777777777777
Model precision:  0.8245265483856201
Model recall:  0.9140247702598572
Model F1:  0.8669720368662871
Model overlap ratio:  0.9083263278007507
LRP Top-1 model answer hits:  1761
LRP Top-1 model answer hit %:  0.9783333333333334
LRP Top-1 label answer hits:  1701
LRP Top-1 label answer hit %:  0.945
LRP Top-2 Strict Accuracy:  0.9063888888888889
LRP Top-2 Span Accuracy:  0.9691666666666666
LRP-Model exact matches:  1504
LRP-Model Strict IoU:  0.8288036576073152
LRP-Model Span IoU:  0.9401778496362166
Total start-end skips:  14
Total empty model answer skips:  0
Total unanswerable:  1704


 31%|████████████████████████▏                                                    | 3726/11873 [09:17<25:51,  5.25it/s]

Total examples:  1900
Model exact matches:  1583
Model exact match %:  0.8331578947368421
Model precision:  0.8300277590751648
Model recall:  0.9135764241218567
Model F1:  0.869800381721248
Model overlap ratio:  0.9083919525146484
LRP Top-1 model answer hits:  1860
LRP Top-1 model answer hit %:  0.9789473684210527
LRP Top-1 label answer hits:  1795
LRP Top-1 label answer hit %:  0.9447368421052632
LRP Top-2 Strict Accuracy:  0.906578947368421
LRP Top-2 Span Accuracy:  0.97
LRP-Model exact matches:  1587
LRP-Model Strict IoU:  0.8291215403128761
LRP-Model Span IoU:  0.941747572815534
Total start-end skips:  14
Total empty model answer skips:  0
Total unanswerable:  1777


 33%|█████████████████████████▍                                                   | 3931/11873 [09:46<16:43,  7.91it/s]

Total examples:  2000
Model exact matches:  1663
Model exact match %:  0.8315
Model precision:  0.8328296542167664
Model recall:  0.9165424108505249
Model F1:  0.8726830779412065
Model overlap ratio:  0.9086989164352417
LRP Top-1 model answer hits:  1959
LRP Top-1 model answer hit %:  0.9795
LRP Top-1 label answer hits:  1890
LRP Top-1 label answer hit %:  0.945
LRP Top-2 Strict Accuracy:  0.90475
LRP Top-2 Span Accuracy:  0.97075
LRP-Model exact matches:  1662
LRP-Model Strict IoU:  0.8260671079662177
LRP-Model Span IoU:  0.9431624969638086
Total start-end skips:  14
Total empty model answer skips:  0
Total unanswerable:  1882


 35%|██████████████████████████▊                                                  | 4135/11873 [10:14<18:45,  6.87it/s]

Total examples:  2100
Model exact matches:  1748
Model exact match %:  0.8323809523809523
Model precision:  0.8362514972686768
Model recall:  0.9184877872467041
Model F1:  0.8754426302368653
Model overlap ratio:  0.9094985723495483
LRP Top-1 model answer hits:  2059
LRP Top-1 model answer hit %:  0.9804761904761905
LRP Top-1 label answer hits:  1987
LRP Top-1 label answer hit %:  0.9461904761904761
LRP Top-2 Strict Accuracy:  0.9040476190476191
LRP Top-2 Span Accuracy:  0.9719047619047619
LRP-Model exact matches:  1740
LRP-Model Strict IoU:  0.8248968064305887
LRP-Model Span IoU:  0.9453450671607225
Total start-end skips:  14
Total empty model answer skips:  0
Total unanswerable:  1986


 37%|████████████████████████████▋                                                | 4430/11873 [10:43<06:44, 18.42it/s]

Total examples:  2200
Model exact matches:  1814
Model exact match %:  0.8245454545454546
Model precision:  0.828942060470581
Model recall:  0.9193777441978455
Model F1:  0.8718209101002471
Model overlap ratio:  0.9068245887756348
LRP Top-1 model answer hits:  2159
LRP Top-1 model answer hit %:  0.9813636363636363
LRP Top-1 label answer hits:  2076
LRP Top-1 label answer hit %:  0.9436363636363636
LRP Top-2 Strict Accuracy:  0.9011363636363636
LRP Top-2 Span Accuracy:  0.9679545454545454
LRP-Model exact matches:  1808
LRP-Model Strict IoU:  0.8200620475698035
LRP-Model Span IoU:  0.9378991411583352
Total start-end skips:  14
Total empty model answer skips:  0
Total unanswerable:  2175


 40%|██████████████████████████████▌                                              | 4712/11873 [11:13<15:09,  7.88it/s]

Total examples:  2300
Model exact matches:  1868
Model exact match %:  0.8121739130434783
Model precision:  0.8093007206916809
Model recall:  0.9035379886627197
Model F1:  0.8538269732036426
Model overlap ratio:  0.8992968797683716
LRP Top-1 model answer hits:  2259
LRP Top-1 model answer hit %:  0.9821739130434782
LRP Top-1 label answer hits:  2156
LRP Top-1 label answer hit %:  0.9373913043478261
LRP Top-2 Strict Accuracy:  0.8971739130434783
LRP Top-2 Span Accuracy:  0.9636956521739131
LRP-Model exact matches:  1870
LRP-Model Strict IoU:  0.8135225704711216
LRP-Model Span IoU:  0.9299349695825466
Total start-end skips:  15
Total empty model answer skips:  0
Total unanswerable:  2358


 42%|████████████████████████████████                                             | 4948/11873 [11:42<14:07,  8.17it/s]

Total examples:  2400
Model exact matches:  1947
Model exact match %:  0.81125
Model precision:  0.8139877915382385
Model recall:  0.9030385613441467
Model F1:  0.8562039400134251
Model overlap ratio:  0.8996246457099915
LRP Top-1 model answer hits:  2356
LRP Top-1 model answer hit %:  0.9816666666666667
LRP Top-1 label answer hits:  2251
LRP Top-1 label answer hit %:  0.9379166666666666
LRP Top-2 Strict Accuracy:  0.8945833333333333
LRP Top-2 Span Accuracy:  0.96375
LRP-Model exact matches:  1940
LRP-Model Strict IoU:  0.8092725216735771
LRP-Model Span IoU:  0.9300361881785284
Total start-end skips:  15
Total empty model answer skips:  0
Total unanswerable:  2493


 44%|██████████████████████████████████                                           | 5244/11873 [12:11<14:42,  7.51it/s]

Total examples:  2500
Model exact matches:  2030
Model exact match %:  0.812
Model precision:  0.8192464709281921
Model recall:  0.90508633852005
Model F1:  0.860029786193228
Model overlap ratio:  0.9008876085281372
LRP Top-1 model answer hits:  2455
LRP Top-1 model answer hit %:  0.982
LRP Top-1 label answer hits:  2348
LRP Top-1 label answer hit %:  0.9392
LRP Top-2 Strict Accuracy:  0.8948
LRP Top-2 Span Accuracy:  0.9648
LRP-Model exact matches:  2022
LRP-Model Strict IoU:  0.8096272167933406
LRP-Model Span IoU:  0.9319938176197836
Total start-end skips:  15
Total empty model answer skips:  0
Total unanswerable:  2690


 46%|███████████████████████████████████▏                                         | 5432/11873 [12:41<19:35,  5.48it/s]

Total examples:  2600
Model exact matches:  2103
Model exact match %:  0.8088461538461539
Model precision:  0.8084590435028076
Model recall:  0.9042363166809082
Model F1:  0.8536696539026446
Model overlap ratio:  0.8993727564811707
LRP Top-1 model answer hits:  2554
LRP Top-1 model answer hit %:  0.9823076923076923
LRP Top-1 label answer hits:  2442
LRP Top-1 label answer hit %:  0.9392307692307692
LRP Top-2 Strict Accuracy:  0.8965384615384615
LRP Top-2 Span Accuracy:  0.9653846153846154
LRP-Model exact matches:  2111
LRP-Model Strict IoU:  0.8124782154060648
LRP-Model Span IoU:  0.9330855018587361
Total start-end skips:  15
Total empty model answer skips:  0
Total unanswerable:  2778


 47%|████████████████████████████████████▏                                        | 5587/11873 [13:09<24:03,  4.35it/s]

Total examples:  2700
Model exact matches:  2185
Model exact match %:  0.8092592592592592
Model precision:  0.8044715523719788
Model recall:  0.9053337574005127
Model F1:  0.8519277008534466
Model overlap ratio:  0.898760199546814
LRP Top-1 model answer hits:  2645
LRP Top-1 model answer hit %:  0.9796296296296296
LRP Top-1 label answer hits:  2530
LRP Top-1 label answer hit %:  0.937037037037037
LRP Top-2 Strict Accuracy:  0.8938888888888888
LRP Top-2 Span Accuracy:  0.9627777777777777
LRP-Model exact matches:  2185
LRP-Model Strict IoU:  0.808136614766449
LRP-Model Span IoU:  0.9282271023031602
Total start-end skips:  15
Total empty model answer skips:  0
Total unanswerable:  2833


 49%|█████████████████████████████████████▍                                       | 5770/11873 [13:38<17:39,  5.76it/s]

Total examples:  2800
Model exact matches:  2240
Model exact match %:  0.8
Model precision:  0.7882405519485474
Model recall:  0.898928165435791
Model F1:  0.8399534984073437
Model overlap ratio:  0.8917944431304932
LRP Top-1 model answer hits:  2733
LRP Top-1 model answer hit %:  0.9760714285714286
LRP Top-1 label answer hits:  2609
LRP Top-1 label answer hit %:  0.9317857142857143
LRP Top-2 Strict Accuracy:  0.88875
LRP Top-2 Span Accuracy:  0.9592857142857143
LRP-Model exact matches:  2248
LRP-Model Strict IoU:  0.7997750281214848
LRP-Model Span IoU:  0.9217570350034318
Total start-end skips:  18
Total empty model answer skips:  0
Total unanswerable:  2913


 51%|██████████████████████████████████████▉                                      | 6008/11873 [14:05<09:25, 10.37it/s]

Total examples:  2900
Model exact matches:  2331
Model exact match %:  0.8037931034482758
Model precision:  0.7968436479568481
Model recall:  0.9024685025215149
Model F1:  0.8463733911547223
Model overlap ratio:  0.8943771123886108
LRP Top-1 model answer hits:  2833
LRP Top-1 model answer hit %:  0.9768965517241379
LRP Top-1 label answer hits:  2709
LRP Top-1 label answer hit %:  0.9341379310344827
LRP Top-2 Strict Accuracy:  0.8881034482758621
LRP Top-2 Span Accuracy:  0.9605172413793104
LRP-Model exact matches:  2322
LRP-Model Strict IoU:  0.7987284850364398
LRP-Model Span IoU:  0.9240338364571239
Total start-end skips:  18
Total empty model answer skips:  0
Total unanswerable:  3052


 52%|████████████████████████████████████████▎                                    | 6220/11873 [14:34<09:22, 10.06it/s]

Total examples:  3000
Model exact matches:  2413
Model exact match %:  0.8043333333333333
Model precision:  0.7976213693618774
Model recall:  0.9041184186935425
Model F1:  0.8475375333471997
Model overlap ratio:  0.8944876194000244
LRP Top-1 model answer hits:  2932
LRP Top-1 model answer hit %:  0.9773333333333334
LRP Top-1 label answer hits:  2804
LRP Top-1 label answer hit %:  0.9346666666666666
LRP Top-2 Strict Accuracy:  0.8885
LRP Top-2 Span Accuracy:  0.9611666666666666
LRP-Model exact matches:  2403
LRP-Model Strict IoU:  0.799370220422852
LRP-Model Span IoU:  0.9252366436707845
Total start-end skips:  18
Total empty model answer skips:  0
Total unanswerable:  3164


 54%|█████████████████████████████████████████▊                                   | 6450/11873 [15:03<08:36, 10.50it/s]

Total examples:  3100
Model exact matches:  2506
Model exact match %:  0.8083870967741935
Model precision:  0.8004102110862732
Model recall:  0.90574711561203
Model F1:  0.8498269516571998
Model overlap ratio:  0.8966625332832336
LRP Top-1 model answer hits:  3030
LRP Top-1 model answer hit %:  0.9774193548387097
LRP Top-1 label answer hits:  2899
LRP Top-1 label answer hit %:  0.9351612903225807
LRP Top-2 Strict Accuracy:  0.8893548387096775
LRP Top-2 Span Accuracy:  0.9616129032258065
LRP-Model exact matches:  2488
LRP-Model Strict IoU:  0.8007551553877432
LRP-Model Span IoU:  0.9260639950295123
Total start-end skips:  18
Total empty model answer skips:  0
Total unanswerable:  3294


 56%|███████████████████████████████████████████▏                                 | 6659/11873 [15:32<13:13,  6.57it/s]

Total examples:  3200
Model exact matches:  2596
Model exact match %:  0.81125
Model precision:  0.8023601174354553
Model recall:  0.9071203470230103
Model F1:  0.8515302786990782
Model overlap ratio:  0.8983268737792969
LRP Top-1 model answer hits:  3127
LRP Top-1 model answer hit %:  0.9771875
LRP Top-1 label answer hits:  2995
LRP Top-1 label answer hit %:  0.9359375
LRP Top-2 Strict Accuracy:  0.88953125
LRP Top-2 Span Accuracy:  0.96171875
LRP-Model exact matches:  2570
LRP-Model Strict IoU:  0.8010412269593359
LRP-Model Span IoU:  0.9262603461249059
Total start-end skips:  18
Total empty model answer skips:  0
Total unanswerable:  3402


 57%|████████████████████████████████████████████▎                                | 6824/11873 [16:01<22:35,  3.73it/s]

Total examples:  3300
Model exact matches:  2672
Model exact match %:  0.8096969696969697
Model precision:  0.7985175848007202
Model recall:  0.9058244824409485
Model F1:  0.8487929645986322
Model overlap ratio:  0.8968327641487122
LRP Top-1 model answer hits:  3221
LRP Top-1 model answer hit %:  0.9760606060606061
LRP Top-1 label answer hits:  3084
LRP Top-1 label answer hit %:  0.9345454545454546
LRP Top-2 Strict Accuracy:  0.8895454545454545
LRP Top-2 Span Accuracy:  0.9610606060606061
LRP-Model exact matches:  2654
LRP-Model Strict IoU:  0.8010642652476463
LRP-Model Span IoU:  0.9250401050021876
Total start-end skips:  20
Total empty model answer skips:  0
Total unanswerable:  3465


 59%|█████████████████████████████████████████████▎                               | 6993/11873 [16:30<15:15,  5.33it/s]

Total examples:  3400
Model exact matches:  2754
Model exact match %:  0.81
Model precision:  0.7988244891166687
Model recall:  0.9056718945503235
Model F1:  0.8488992941305443
Model overlap ratio:  0.8960718512535095
LRP Top-1 model answer hits:  3320
LRP Top-1 model answer hit %:  0.9764705882352941
LRP Top-1 label answer hits:  3175
LRP Top-1 label answer hit %:  0.9338235294117647
LRP Top-2 Strict Accuracy:  0.8897058823529411
LRP Top-2 Span Accuracy:  0.9613235294117647
LRP-Model exact matches:  2734
LRP-Model Strict IoU:  0.8013245033112583
LRP-Model Span IoU:  0.9255273962905282
Total start-end skips:  25
Total empty model answer skips:  0
Total unanswerable:  3529


 61%|██████████████████████████████████████████████▋                              | 7197/11873 [16:58<08:32,  9.12it/s]

Total examples:  3500
Model exact matches:  2838
Model exact match %:  0.8108571428571428
Model precision:  0.7999343276023865
Model recall:  0.9071182012557983
Model F1:  0.8501612880803461
Model overlap ratio:  0.8970696330070496
LRP Top-1 model answer hits:  3418
LRP Top-1 model answer hit %:  0.9765714285714285
LRP Top-1 label answer hits:  3273
LRP Top-1 label answer hit %:  0.9351428571428572
LRP Top-2 Strict Accuracy:  0.8895714285714286
LRP Top-2 Span Accuracy:  0.9615714285714285
LRP-Model exact matches:  2813
LRP-Model Strict IoU:  0.8011063939276984
LRP-Model Span IoU:  0.925987068372541
Total start-end skips:  26
Total empty model answer skips:  0
Total unanswerable:  3633


 62%|███████████████████████████████████████████████▉                             | 7401/11873 [17:27<09:52,  7.55it/s]

Total examples:  3600
Model exact matches:  2935
Model exact match %:  0.8152777777777778
Model precision:  0.8023251891136169
Model recall:  0.9090200662612915
Model F1:  0.8523466486736334
Model overlap ratio:  0.8994809985160828
LRP Top-1 model answer hits:  3518
LRP Top-1 model answer hit %:  0.9772222222222222
LRP Top-1 label answer hits:  3371
LRP Top-1 label answer hit %:  0.9363888888888889
LRP Top-2 Strict Accuracy:  0.8901388888888889
LRP Top-2 Span Accuracy:  0.9623611111111111
LRP-Model exact matches:  2895
LRP-Model Strict IoU:  0.8020272806907771
LRP-Model Span IoU:  0.9274528175612368
Total start-end skips:  27
Total empty model answer skips:  0
Total unanswerable:  3735


 64%|█████████████████████████████████████████████████▎                           | 7604/11873 [17:56<12:31,  5.68it/s]

Total examples:  3700
Model exact matches:  3020
Model exact match %:  0.8162162162162162
Model precision:  0.801296591758728
Model recall:  0.9093546271324158
Model F1:  0.8519127164841447
Model overlap ratio:  0.8995237946510315
LRP Top-1 model answer hits:  3615
LRP Top-1 model answer hit %:  0.977027027027027
LRP Top-1 label answer hits:  3464
LRP Top-1 label answer hit %:  0.9362162162162162
LRP Top-2 Strict Accuracy:  0.8910810810810811
LRP Top-2 Span Accuracy:  0.9625675675675676
LRP-Model exact matches:  2983
LRP-Model Strict IoU:  0.8035583719229832
LRP-Model Span IoU:  0.9278363944249055
Total start-end skips:  28
Total empty model answer skips:  0
Total unanswerable:  3837


 66%|██████████████████████████████████████████████████▌                          | 7800/11873 [18:25<15:10,  4.47it/s]

Total examples:  3800
Model exact matches:  3108
Model exact match %:  0.8178947368421052
Model precision:  0.8044580817222595
Model recall:  0.9104650616645813
Model F1:  0.8541851916877549
Model overlap ratio:  0.9007386565208435
LRP Top-1 model answer hits:  3715
LRP Top-1 model answer hit %:  0.9776315789473684
LRP Top-1 label answer hits:  3562
LRP Top-1 label answer hit %:  0.9373684210526316
LRP Top-2 Strict Accuracy:  0.8917105263157895
LRP Top-2 Span Accuracy:  0.963421052631579
LRP-Model exact matches:  3066
LRP-Model Strict IoU:  0.804582690252879
LRP-Model Span IoU:  0.9294237116019294
Total start-end skips:  28
Total empty model answer skips:  0
Total unanswerable:  3933


 67%|███████████████████████████████████████████████████▉                         | 8000/11873 [18:55<06:45,  9.56it/s]

Total examples:  3900
Model exact matches:  3193
Model exact match %:  0.8187179487179487
Model precision:  0.8066762089729309
Model recall:  0.9088983535766602
Model F1:  0.8547418389269397
Model overlap ratio:  0.9008413553237915
LRP Top-1 model answer hits:  3812
LRP Top-1 model answer hit %:  0.9774358974358974
LRP Top-1 label answer hits:  3656
LRP Top-1 label answer hit %:  0.9374358974358974
LRP Top-2 Strict Accuracy:  0.8923076923076924
LRP Top-2 Span Accuracy:  0.9634615384615385
LRP-Model exact matches:  3152
LRP-Model Strict IoU:  0.8055555555555556
LRP-Model Span IoU:  0.9294990723562152
Total start-end skips:  30
Total empty model answer skips:  0
Total unanswerable:  4031


 69%|█████████████████████████████████████████████████████▏                       | 8204/11873 [19:24<11:32,  5.30it/s]

Total examples:  4000
Model exact matches:  3281
Model exact match %:  0.82025
Model precision:  0.8086392283439636
Model recall:  0.9083332419395447
Model F1:  0.8555919265495044
Model overlap ratio:  0.9013624787330627
LRP Top-1 model answer hits:  3907
LRP Top-1 model answer hit %:  0.97675
LRP Top-1 label answer hits:  3748
LRP Top-1 label answer hit %:  0.937
LRP Top-2 Strict Accuracy:  0.891875
LRP Top-2 Span Accuracy:  0.962875
LRP-Model exact matches:  3232
LRP-Model Strict IoU:  0.8048505358150028
LRP-Model Span IoU:  0.9284078582620224
Total start-end skips:  30
Total empty model answer skips:  0
Total unanswerable:  4135


 71%|██████████████████████████████████████████████████████▌                      | 8422/11873 [19:53<08:45,  6.56it/s]

Total examples:  4100
Model exact matches:  3372
Model exact match %:  0.8224390243902439
Model precision:  0.8119385838508606
Model recall:  0.9098213911056519
Model F1:  0.8580976472869877
Model overlap ratio:  0.9028902053833008
LRP Top-1 model answer hits:  4007
LRP Top-1 model answer hit %:  0.9773170731707317
LRP Top-1 label answer hits:  3846
LRP Top-1 label answer hit %:  0.9380487804878048
LRP Top-2 Strict Accuracy:  0.8932926829268293
LRP Top-2 Span Accuracy:  0.9636585365853658
LRP-Model exact matches:  3322
LRP-Model Strict IoU:  0.8071625344352618
LRP-Model Span IoU:  0.9298658507884208
Total start-end skips:  30
Total empty model answer skips:  0
Total unanswerable:  4253


 73%|███████████████████████████████████████████████████████▉                     | 8632/11873 [20:22<09:34,  5.64it/s]

Total examples:  4200
Model exact matches:  3452
Model exact match %:  0.8219047619047619
Model precision:  0.8136682510375977
Model recall:  0.9099587798118591
Model F1:  0.8591238773053211
Model overlap ratio:  0.90289306640625
LRP Top-1 model answer hits:  4106
LRP Top-1 model answer hit %:  0.9776190476190476
LRP Top-1 label answer hits:  3941
LRP Top-1 label answer hit %:  0.9383333333333334
LRP Top-2 Strict Accuracy:  0.8939285714285714
LRP Top-2 Span Accuracy:  0.9641666666666666
LRP-Model exact matches:  3407
LRP-Model Strict IoU:  0.8082014853083629
LRP-Model Span IoU:  0.9308125502815768
Total start-end skips:  31
Total empty model answer skips:  0
Total unanswerable:  4362


 74%|█████████████████████████████████████████████████████████▎                   | 8841/11873 [20:50<09:05,  5.56it/s]

Total examples:  4300
Model exact matches:  3526
Model exact match %:  0.82
Model precision:  0.8123955130577087
Model recall:  0.908002495765686
Model F1:  0.8575424403213986
Model overlap ratio:  0.9012925028800964
LRP Top-1 model answer hits:  4204
LRP Top-1 model answer hit %:  0.9776744186046512
LRP Top-1 label answer hits:  4031
LRP Top-1 label answer hit %:  0.9374418604651162
LRP Top-2 Strict Accuracy:  0.8941860465116279
LRP Top-2 Span Accuracy:  0.9641860465116279
LRP-Model exact matches:  3490
LRP-Model Strict IoU:  0.8086225026288117
LRP-Model Span IoU:  0.9308486753480018
Total start-end skips:  31
Total empty model answer skips:  0
Total unanswerable:  4471


 76%|██████████████████████████████████████████████████████████▌                  | 9035/11873 [21:19<10:22,  4.56it/s]

Total examples:  4400
Model exact matches:  3611
Model exact match %:  0.8206818181818182
Model precision:  0.8132675886154175
Model recall:  0.9068170189857483
Model F1:  0.8574983894245285
Model overlap ratio:  0.9014876484870911
LRP Top-1 model answer hits:  4303
LRP Top-1 model answer hit %:  0.9779545454545454
LRP Top-1 label answer hits:  4125
LRP Top-1 label answer hit %:  0.9375
LRP Top-2 Strict Accuracy:  0.894090909090909
LRP Top-2 Span Accuracy:  0.9644318181818182
LRP-Model exact matches:  3569
LRP-Model Strict IoU:  0.8084669132757912
LRP-Model Span IoU:  0.9313069241742565
Total start-end skips:  31
Total empty model answer skips:  0
Total unanswerable:  4565


 77%|███████████████████████████████████████████████████████████▌                 | 9187/11873 [21:47<10:01,  4.46it/s]

Total examples:  4500
Model exact matches:  3693
Model exact match %:  0.8206666666666667
Model precision:  0.8144549131393433
Model recall:  0.908018171787262
Model F1:  0.8586954045362648
Model overlap ratio:  0.9024031162261963
LRP Top-1 model answer hits:  4395
LRP Top-1 model answer hit %:  0.9766666666666667
LRP Top-1 label answer hits:  4220
LRP Top-1 label answer hit %:  0.9377777777777778
LRP Top-2 Strict Accuracy:  0.8925555555555555
LRP Top-2 Span Accuracy:  0.9632222222222222
LRP-Model exact matches:  3642
LRP-Model Strict IoU:  0.8059596669007726
LRP-Model Span IoU:  0.9290536919944272
Total start-end skips:  33
Total empty model answer skips:  0
Total unanswerable:  4615


 79%|████████████████████████████████████████████████████████████▊                | 9372/11873 [22:16<06:45,  6.16it/s]

Total examples:  4600
Model exact matches:  3776
Model exact match %:  0.8208695652173913
Model precision:  0.814910888671875
Model recall:  0.9090603590011597
Model F1:  0.8594147796953379
Model overlap ratio:  0.9025987982749939
LRP Top-1 model answer hits:  4492
LRP Top-1 model answer hit %:  0.9765217391304348
LRP Top-1 label answer hits:  4314
LRP Top-1 label answer hit %:  0.9378260869565217
LRP Top-2 Strict Accuracy:  0.8925
LRP Top-2 Span Accuracy:  0.9632608695652174
LRP-Model exact matches:  3723
LRP-Model Strict IoU:  0.8058690744920993
LRP-Model Span IoU:  0.9291256028517509
Total start-end skips:  35
Total empty model answer skips:  0
Total unanswerable:  4698


 81%|██████████████████████████████████████████████████████████████               | 9569/11873 [22:45<04:39,  8.23it/s]

Total examples:  4700
Model exact matches:  3854
Model exact match %:  0.82
Model precision:  0.8132187724113464
Model recall:  0.9098823666572571
Model F1:  0.8588392224634069
Model overlap ratio:  0.9024918079376221
LRP Top-1 model answer hits:  4590
LRP Top-1 model answer hit %:  0.9765957446808511
LRP Top-1 label answer hits:  4407
LRP Top-1 label answer hit %:  0.9376595744680851
LRP Top-2 Strict Accuracy:  0.8919148936170213
LRP Top-2 Span Accuracy:  0.963404255319149
LRP-Model exact matches:  3798
LRP-Model Strict IoU:  0.804915514592934
LRP-Model Span IoU:  0.9293924466338259
Total start-end skips:  35
Total empty model answer skips:  0
Total unanswerable:  4795


 82%|███████████████████████████████████████████████████████████████▎             | 9762/11873 [23:14<04:53,  7.20it/s]

Total examples:  4800
Model exact matches:  3931
Model exact match %:  0.8189583333333333
Model precision:  0.8119990229606628
Model recall:  0.9109820127487183
Model F1:  0.8586472966977761
Model overlap ratio:  0.9023342728614807
LRP Top-1 model answer hits:  4685
LRP Top-1 model answer hit %:  0.9760416666666667
LRP Top-1 label answer hits:  4497
LRP Top-1 label answer hit %:  0.936875
LRP Top-2 Strict Accuracy:  0.8910416666666666
LRP Top-2 Span Accuracy:  0.963125
LRP-Model exact matches:  3874
LRP-Model Strict IoU:  0.8034942701484126
LRP-Model Span IoU:  0.9288728149487643
Total start-end skips:  37
Total empty model answer skips:  0
Total unanswerable:  4886


 84%|████████████████████████████████████████████████████████████████▌            | 9952/11873 [23:42<04:38,  6.89it/s]

Total examples:  4900
Model exact matches:  4016
Model exact match %:  0.8195918367346938
Model precision:  0.810009241104126
Model recall:  0.9120262265205383
Model F1:  0.8579958839407358
Model overlap ratio:  0.9027081727981567
LRP Top-1 model answer hits:  4782
LRP Top-1 model answer hit %:  0.9759183673469388
LRP Top-1 label answer hits:  4593
LRP Top-1 label answer hit %:  0.9373469387755102
LRP Top-2 Strict Accuracy:  0.8904081632653061
LRP Top-2 Span Accuracy:  0.9630612244897959
LRP-Model exact matches:  3949
LRP-Model Strict IoU:  0.8024645944454663
LRP-Model Span IoU:  0.9287541822475891
Total start-end skips:  37
Total empty model answer skips:  0
Total unanswerable:  4976


 85%|████████████████████████████████████████████████████████████████▉           | 10146/11873 [24:11<03:28,  8.30it/s]

Total examples:  5000
Model exact matches:  4103
Model exact match %:  0.8206
Model precision:  0.8106536269187927
Model recall:  0.9124166965484619
Model F1:  0.8585301415091439
Model overlap ratio:  0.9033253788948059
LRP Top-1 model answer hits:  4882
LRP Top-1 model answer hit %:  0.9764
LRP Top-1 label answer hits:  4691
LRP Top-1 label answer hit %:  0.9382
LRP Top-2 Strict Accuracy:  0.8903
LRP Top-2 Span Accuracy:  0.9634
LRP-Model exact matches:  4026
LRP-Model Strict IoU:  0.802288906911778
LRP-Model Span IoU:  0.9293845263360988
Total start-end skips:  37
Total empty model answer skips:  0
Total unanswerable:  5070


 87%|██████████████████████████████████████████████████████████████████▏         | 10346/11873 [24:39<04:19,  5.89it/s]

Total examples:  5100
Model exact matches:  4187
Model exact match %:  0.8209803921568627
Model precision:  0.810005784034729
Model recall:  0.9132314324378967
Model F1:  0.8585268880754444
Model overlap ratio:  0.9035385847091675
LRP Top-1 model answer hits:  4980
LRP Top-1 model answer hit %:  0.9764705882352941
LRP Top-1 label answer hits:  4784
LRP Top-1 label answer hit %:  0.9380392156862745
LRP Top-2 Strict Accuracy:  0.89
LRP Top-2 Span Accuracy:  0.9635294117647059
LRP-Model exact matches:  4104
LRP-Model Strict IoU:  0.8018018018018018
LRP-Model Span IoU:  0.9296254256526674
Total start-end skips:  38
Total empty model answer skips:  0
Total unanswerable:  5169


 89%|███████████████████████████████████████████████████████████████████▍        | 10543/11873 [25:08<03:42,  5.98it/s]

Total examples:  5200
Model exact matches:  4273
Model exact match %:  0.8217307692307693
Model precision:  0.8106072545051575
Model recall:  0.9141000509262085
Model F1:  0.8592485580490925
Model overlap ratio:  0.9039345383644104
LRP Top-1 model answer hits:  5075
LRP Top-1 model answer hit %:  0.9759615384615384
LRP Top-1 label answer hits:  4878
LRP Top-1 label answer hit %:  0.938076923076923
LRP Top-2 Strict Accuracy:  0.8898076923076923
LRP Top-2 Span Accuracy:  0.9629807692307693
LRP-Model exact matches:  4185
LRP-Model Strict IoU:  0.8014896934003118
LRP-Model Span IoU:  0.9286045433472415
Total start-end skips:  38
Total empty model answer skips:  0
Total unanswerable:  5266


 90%|████████████████████████████████████████████████████████████████████▋       | 10732/11873 [25:37<03:31,  5.40it/s]

Total examples:  5300
Model exact matches:  4360
Model exact match %:  0.8226415094339623
Model precision:  0.8115215301513672
Model recall:  0.9140421152114868
Model F1:  0.8597363046591522
Model overlap ratio:  0.904396653175354
LRP Top-1 model answer hits:  5174
LRP Top-1 model answer hit %:  0.9762264150943396
LRP Top-1 label answer hits:  4974
LRP Top-1 label answer hit %:  0.9384905660377358
LRP Top-2 Strict Accuracy:  0.890188679245283
LRP Top-2 Span Accuracy:  0.9633962264150944
LRP-Model exact matches:  4268
LRP-Model Strict IoU:  0.8021081264875892
LRP-Model Span IoU:  0.9293775027302512
Total start-end skips:  38
Total empty model answer skips:  0
Total unanswerable:  5355


 92%|██████████████████████████████████████████████████████████████████████      | 10938/11873 [26:06<02:27,  6.35it/s]

Total examples:  5400
Model exact matches:  4448
Model exact match %:  0.8237037037037037
Model precision:  0.8126438856124878
Model recall:  0.9151841998100281
Model F1:  0.8608713453143236
Model overlap ratio:  0.9049806594848633
LRP Top-1 model answer hits:  5272
LRP Top-1 model answer hit %:  0.9762962962962963
LRP Top-1 label answer hits:  5065
LRP Top-1 label answer hit %:  0.937962962962963
LRP Top-2 Strict Accuracy:  0.8909259259259259
LRP Top-2 Span Accuracy:  0.9637037037037037
LRP-Model exact matches:  4356
LRP-Model Strict IoU:  0.8033060611120387
LRP-Model Span IoU:  0.9299499642601858
Total start-end skips:  39
Total empty model answer skips:  0
Total unanswerable:  5460


 94%|███████████████████████████████████████████████████████████████████████▎    | 11148/11873 [26:35<01:18,  9.20it/s]

Total examples:  5500
Model exact matches:  4531
Model exact match %:  0.8238181818181818
Model precision:  0.8138829469680786
Model recall:  0.9154514074325562
Model F1:  0.8616844826927826
Model overlap ratio:  0.9054386615753174
LRP Top-1 model answer hits:  5372
LRP Top-1 model answer hit %:  0.9767272727272728
LRP Top-1 label answer hits:  5163
LRP Top-1 label answer hit %:  0.9387272727272727
LRP Top-2 Strict Accuracy:  0.8911818181818182
LRP Top-2 Span Accuracy:  0.9643636363636363
LRP-Model exact matches:  4437
LRP-Model Strict IoU:  0.8037222267770763
LRP-Model Span IoU:  0.9311797752808989
Total start-end skips:  39
Total empty model answer skips:  0
Total unanswerable:  5570


 96%|████████████████████████████████████████████████████████████████████████▊   | 11384/11873 [27:04<00:46, 10.52it/s]

Total examples:  5600
Model exact matches:  4605
Model exact match %:  0.8223214285714285
Model precision:  0.8151670694351196
Model recall:  0.9130415320396423
Model F1:  0.8613328151592078
Model overlap ratio:  0.9050140976905823
LRP Top-1 model answer hits:  5470
LRP Top-1 model answer hit %:  0.9767857142857143
LRP Top-1 label answer hits:  5257
LRP Top-1 label answer hit %:  0.93875
LRP Top-2 Strict Accuracy:  0.8911607142857143
LRP Top-2 Span Accuracy:  0.964375
LRP-Model exact matches:  4518
LRP-Model Strict IoU:  0.8036878975762944
LRP-Model Span IoU:  0.9312009656004828
Total start-end skips:  41
Total empty model answer skips:  0
Total unanswerable:  5705


 98%|██████████████████████████████████████████████████████████████████████████▏ | 11594/11873 [27:32<00:51,  5.46it/s]

Total examples:  5700
Model exact matches:  4688
Model exact match %:  0.8224561403508772
Model precision:  0.8150673508644104
Model recall:  0.9143727421760559
Model F1:  0.8618689617144503
Model overlap ratio:  0.9052829742431641
LRP Top-1 model answer hits:  5564
LRP Top-1 model answer hit %:  0.976140350877193
LRP Top-1 label answer hits:  5349
LRP Top-1 label answer hit %:  0.9384210526315789
LRP Top-2 Strict Accuracy:  0.8907017543859649
LRP Top-2 Span Accuracy:  0.9639473684210527
LRP-Model exact matches:  4597
LRP-Model Strict IoU:  0.8029416416258105
LRP-Model Span IoU:  0.9304038608077216
Total start-end skips:  41
Total empty model answer skips:  0
Total unanswerable:  5814


 99%|███████████████████████████████████████████████████████████████████████████▍| 11790/11873 [28:01<00:09,  8.98it/s]

Total examples:  5800
Model exact matches:  4774
Model exact match %:  0.823103448275862
Model precision:  0.816024661064148
Model recall:  0.9146488904953003
Model F1:  0.8625266737179793
Model overlap ratio:  0.9053937196731567
LRP Top-1 model answer hits:  5660
LRP Top-1 model answer hit %:  0.9758620689655172
LRP Top-1 label answer hits:  5443
LRP Top-1 label answer hit %:  0.9384482758620689
LRP Top-2 Strict Accuracy:  0.890603448275862
LRP Top-2 Span Accuracy:  0.9635344827586206
LRP-Model exact matches:  4678
LRP-Model Strict IoU:  0.8027818789338721
LRP-Model Span IoU:  0.9296348665058638
Total start-end skips:  45
Total empty model answer skips:  0
Total unanswerable:  5906


100%|████████████████████████████████████████████████████████████████████████████| 11873/11873 [28:13<00:00,  7.01it/s]


# Dummy Test (Don't run if you want to run the full SQuADv2 benchmark)

In [8]:
# Toy passage and question for a single-example smoke test of the QA model and LRP.
# The passage is written as adjacent string pieces (one per sentence or clause);
# Python joins them into one string at compile time.
context = (
    "Welcome to the final examination for this term's offering of CS100. "
    "Please remove all headphones and earbuds, as well as hats and hoods. "
    "Place your bag under your desk so that it does not block the aisle. "
    "You are permitted writing instruments, a clear water bottle, and any aids "
    "listed on the front of your booklet. "
    "The exam will be 150 minutes in duration. "
    "You may now begin."
)
question = "What is this?"

In [9]:
# Encode the (question, context) pair as one sequence and keep only the token ids.
# The ids are not moved to `device` here; later cells index into them directly.
encoded_pair = tokenizer(question, context, return_tensors="pt")
input_ids = encoded_pair["input_ids"]

In [10]:
# Forward pass: only the copy of the ids handed to the model is moved to `device`;
# `input_ids` itself is left where it is for later indexing.
device_input_ids = input_ids.to(device)
output = model(device_input_ids)

In [11]:
# Build the graph rooted at the start logits and display a short summary of it.
g = make_graph(output.start_logits)

# g[2] is displayed below as a mapping from autograd node type name to a count.
# NOTE(review): the meaning of g[4] (a single integer in the output) is not visible
# from this notebook — confirm against lrp_engine.lrp_graph.make_graph.
node_type_counts = g[2]
(node_type_counts, len(node_type_counts), g[4])

({'CloneBackward0': 1,
  'SqueezeBackward1': 1,
  'SplitBackward0': 1,
  'ViewBackward0': 386,
  'AddBackward0': 195,
  'AccumulateGrad': 391,
  'MmBackward0': 145,
  'NativeLayerNormBackward0': 49,
  'GeluBackward0': 24,
  'TransposeBackward0': 24,
  'ScaledDotProductEfficientAttentionBackward0': 24,
  'PermuteBackward0': 72,
  'EmbeddingBackward0': 3,
  'TBackward0': 145},
 14,
 1461)

In [10]:
# Greedy span selection: take the highest-scoring start and end positions.
# One is added to the end position so it can serve as the exclusive bound of the slice.
start = output.start_logits.argmax()
end = output.end_logits.argmax() + 1

# Slice the predicted span out of the first (only) sequence and turn it back into text.
answer_tokens = input_ids[0][start:end]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
print(answer)

 final examination


In [10]:
# First LRP pass, started from both QA heads (start and end logits) at once.
first_pass_logits = (output.start_logits, output.end_logits)
checkpoint_vals, param_vals = lrp.run(first_pass_logits)

In [11]:
# Second LRP pass over the very same logits; its results are kept under separate
# names so they can be compared with the first pass.
second_pass_logits = (output.start_logits, output.end_logits)
checkpoint_vals1, param_vals1 = lrp.run(second_pass_logits)

In [12]:
# Repeatability check: running LRP twice on the same input should give the same
# result, so each per-tensor sum of squared differences below should be ~0.
[
    (second - first).pow(2).sum()
    for second, first in zip(param_vals1, param_vals)
]

[tensor(0., device='cuda:0'),
 tensor(0., device='cuda:0'),
 tensor(0., device='cuda:0')]

In [13]:
# Decode the five input tokens with the highest LRP scores.
# NOTE(review): param_vals[1] is treated here as one relevance score per input
# position — confirm which entry of param_vals that is in LRPEngine.
flat_relevance = param_vals[1].flatten()
top5_positions = flat_relevance.topk(k=5).indices.cpu()
lrp_answer_ids = input_ids[0][top5_positions]
print(tokenizer.decode(lrp_answer_ids))

Welcome hooduds aisle booklet


In [None]:
from tqdm import tqdm

# Qualitative spot check on the first 100 validation examples: for each one, record
# the model's predicted answer next to the span and tokens that LRP scores highest.
results = []

for example in tqdm(dataset["validation"].select(range(100))):
    question, context = example["question"], example["context"]
    answers = example["answers"]["text"]

    # Only the copy passed to the model is moved to `device`; `input_ids` is indexed below.
    input_ids = tokenizer(question, context, return_tensors="pt")["input_ids"]
    output = model(input_ids.to(device))

    # Greedy span from the logits; `end` is made exclusive so it fits the slice directly.
    start = output.start_logits.argmax()
    end = output.end_logits.argmax() + 1
    model_answer = tokenizer.decode(input_ids[0][start:end], skip_special_tokens=True)

    checkpoint_vals, param_vals = lrp.run((output.start_logits, output.end_logits))

    # NOTE(review): param_vals[1] is treated as one relevance score per input
    # position — confirm against LRPEngine.
    token_relevance = param_vals[1].flatten()
    lrp_top5 = token_relevance.topk(k=5)
    lrp_top5_tokens = tokenizer.decode(input_ids[0][lrp_top5.indices.cpu()], skip_special_tokens=True)

    # The two highest-scoring positions, sorted ascending, delimit the LRP "answer" span.
    lrp_start_end = token_relevance.topk(k=2).indices.cpu().sort()
    sorted_top2 = lrp_start_end.values
    # Position 0 (the first token of the encoded sequence) is never used as the span
    # start; when it is in the top two, the span collapses to the other position.
    lrp_start = sorted_top2[1] if sorted_top2[0] == 0 else sorted_top2[0]
    lrp_end = sorted_top2[-1]
    lrp_answer_ids = input_ids[0][lrp_start:lrp_end + 1]
    lrp_answer = tokenizer.decode(lrp_answer_ids, skip_special_tokens=True)

    record = {
        "example": example,
        "model_answer": model_answer,
        "lrp_answer": lrp_answer,
        "lrp_top5_tokens": lrp_top5_tokens,
        "lrp_top5_relevances": lrp_top5.values.cpu(),
        # An example with no labelled answer text is flagged as impossible.
        "is_impossible": not answers
    }
    results.append(record)

In [None]:
# Human-readable dump of the spot-check records: labelled answers, the model's
# answer, the LRP span, and the top-5 LRP tokens with their scores.
for res in results:
    source_example = res["example"]
    print("Q: ", source_example["question"])
    print("A (labels): ", source_example["answers"]["text"])
    for label, key in (
        ("Model: ", "model_answer"),
        ("LRP: ", "lrp_answer"),
        ("LRP top5: ", "lrp_top5_tokens"),
    ):
        print(label, res[key])
    # The extra '\n' argument leaves a blank line between consecutive records.
    print("LRP top5 attributions: ", res["lrp_top5_relevances"], '\n')