## Inspect Prediction Results

In [1]:
import json

# Run-directory name of each fine-tuned model -> short display name used when printing.
MODELS = dict(
    mukund_privbert="PrivBERT",
    bert_base_uncased="BERT",
    roberta_base="RoBERTa",
    nlpaueb_legal_bert_base_uncased="LegalBERT",
)

# Task identifiers (used as run sub-directory names). Kept as a set because
# later cells exclude individual tasks with set difference.
TASKS = {
    "opp_115", "piextract", "policy_detection",
    "policy_ie_a", "policy_ie_b",
    "policy_qa", "privacy_qa",
}

In [2]:
# Load the stored predictions of every model for every task and seed.
# Resulting layout: predictions[model][task][seed] -> parsed contents of that
# run's predictions.json (later cells iterate it as a sequence of per-example
# dicts -- presumably a JSON list; verify against the run output format).
predictions = {}
for model in MODELS:
    predictions[model] = {}
    for task in TASKS:
        predictions[model][task] = []
        for i in range(10):  # runs are read from seed_0 ... seed_9
            # Pass the encoding explicitly: without it, open() falls back to the
            # platform's locale encoding, which can mis-decode non-ASCII policy
            # text on systems whose default is not UTF-8.
            with open(f"../runs/{model}/{task}/seed_{i}/predictions.json", "r", encoding="utf-8") as pred_file:
                preds = json.load(pred_file)
            # The file is closed once parsing is done; only then store the result.
            predictions[model][task].append(preds)

Extract instances where two different models disagree: for each ordered pair of models, keep the examples that the first model predicted incorrectly and the second model predicted correctly.

In [3]:
from itertools import permutations
from collections import defaultdict

def first_model_wrong_second_model_correct(r1, r2):
    """Tell whether `r1` is a wrong prediction while `r2` is a correct one.

    Both arguments are prediction records exposing the keys
    ``"predicted_label"`` and ``"gold_label"``. A record counts as correct
    when those two values compare equal.

    `r2` is only inspected when `r1` turns out to be wrong.
    """
    first_is_wrong = r1["predicted_label"] != r1["gold_label"]
    return first_is_wrong and r2["predicted_label"] == r2["gold_label"]

# interesting_results[(model_a, model_b)][task] -> list of
# (model_a_result, model_b_result, seed) triples where model_a was wrong and
# model_b was right on the same position of the same seed's prediction list.
# NOTE(review): pairing is purely positional (zip); this assumes both models'
# prediction files list the examples in the same order -- confirm.
interesting_results = defaultdict(dict)
for model_pair in permutations(MODELS, 2):
    wrong_model, right_model = model_pair
    # policy_qa is left out of this comparison.
    for task in TASKS - {"policy_qa"}:
        collected = interesting_results[model_pair][task] = []
        for seed in range(10):
            paired_results = zip(
                predictions[wrong_model][task][seed],
                predictions[right_model][task][seed],
            )
            for wrong_result, right_result in paired_results:
                if not first_model_wrong_second_model_correct(wrong_result, right_result):
                    continue
                # Only keep examples whose text is shorter than 1000 characters.
                if len(wrong_result["text"]) < 1000:
                    collected.append((wrong_result, right_result, seed))

In [4]:
# Show the collected (first model, second model) pairs; each key is an ordered
# pair in which the first model was wrong and the second was correct.
interesting_results.keys()

dict_keys([('mukund_privbert', 'bert_base_uncased'), ('mukund_privbert', 'roberta_base'), ('mukund_privbert', 'nlpaueb_legal_bert_base_uncased'), ('bert_base_uncased', 'mukund_privbert'), ('bert_base_uncased', 'roberta_base'), ('bert_base_uncased', 'nlpaueb_legal_bert_base_uncased'), ('roberta_base', 'mukund_privbert'), ('roberta_base', 'bert_base_uncased'), ('roberta_base', 'nlpaueb_legal_bert_base_uncased'), ('nlpaueb_legal_bert_base_uncased', 'mukund_privbert'), ('nlpaueb_legal_bert_base_uncased', 'bert_base_uncased'), ('nlpaueb_legal_bert_base_uncased', 'roberta_base')])

In [10]:
# Print privacy_qa examples that PrivBERT got right while another model got
# them wrong, restricted to texts that mention both "data" and "shar"
# (case-sensitive substring match; "shar" covers share/shared/sharing).
#
# Iterate MODELS in insertion order and skip PrivBERT instead of using
# `MODELS.keys() - {...}`: that expression yields a set, and the iteration
# order of a set of strings can change between kernel restarts, which made the
# order of the printed examples irreproducible.
for model in MODELS:
    if model == "mukund_privbert":
        continue
    # The seed stored with each pair is not needed for printing.
    for other_result, privBERT_result, _seed in interesting_results[(model, "mukund_privbert")]["privacy_qa"]:
        if "data" in privBERT_result["text"] and "shar" in privBERT_result["text"]:
            print("-"*30)
            print(f"id: {other_result['id']}")
            print(f"question: {other_result['question']}")
            print(f"text: {other_result['text']}")
            print(f"true label: {other_result['gold_label']}")
            print(f"predicted by {MODELS[model]}: {other_result['predicted_label']}")
            print(f"predicted by {MODELS['mukund_privbert']}: {privBERT_result['predicted_label']}")
            
            


------------------------------
id: 9575
question: are my statistics kept private?
text: We will never share with or sell the information gained through the use of Apple HealthKit, such as age, weight and heart rate data, to advertisers or other agencies without your authorization.
true label: Relevant
predicted by LegalBERT: Irrelevant
predicted by PrivBERT: Relevant
------------------------------
id: 10706
question: is any information shared with third parties?
text: We will never share with or sell the information gained through the use of Apple HealthKit, such as age, weight and heart rate data, to advertisers or other agencies without your authorization.
true label: Relevant
predicted by LegalBERT: Irrelevant
predicted by PrivBERT: Relevant
------------------------------
id: 10880
question: what information will this app have access to of mine?
text: We will never share with or sell the information gained through the use of Apple HealthKit, such as age, weight and heart rate data, 

Interesting example ids: policy_detection: 31, privacy_qa: 9575, 10706