In [1]:
import sys
sys.path.append("..")

import json
import numpy as np

In [2]:
def get_results(filename):
    """Read the JSON document stored at ``filename`` and return the parsed object."""
    with open(filename, "r") as handle:
        return json.load(handle)

In [3]:
def get_test_data_annotations(test_path="../../news_clippings/news_clippings/data/merged_balanced/test.json"):
    """
    Load the ground-truth annotations of the test split.

    Args:
        test_path: path to the split's JSON file. The file must contain an
            "annotations" key. Defaults to the merged_balanced test split.

    Returns:
        The value stored under "annotations" in the JSON file.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the file has no "annotations" entry.
    """
    # The earlier version also parsed the VisualNews data.json and built an
    # id -> record mapping that nothing used; that dead load is removed here.
    # The context manager makes sure the file handle is closed.
    with open(test_path, "r") as f:
        test_data = json.load(f)
    return test_data["annotations"]
# Keep only the first 1000 ground-truth annotations for evaluation.
test_data_annotations = get_test_data_annotations()[:1000]

In [4]:
def fix_unsures(data):
    """
    Resolve "Unsure" verdicts when exactly one of the two models produced output.

    For every entry whose 'falsified' value is "Unsure": if one model's output is
    the empty string and the other's is not, the non-empty output decides the
    verdict ("YES"/"Yes" -> True, otherwise "NO"/"No" -> False, matched as
    substrings). Entries are modified in place, and the same list is returned.
    Prints how many entries were "Unsure" on entry.
    """
    unsure_total = 0
    for entry in data:
        if entry['falsified'] != "Unsure":
            continue
        unsure_total += 1

        first = entry['output']['model_0']
        second = entry['output']['model_1']

        # Pick the only model that actually said something; skip otherwise.
        if first == "" and second != "":
            decisive = second
        elif first != "" and second == "":
            decisive = first
        else:
            continue

        # Affirmative markers take precedence over negative ones.
        if "YES" in decisive or "Yes" in decisive:
            entry['falsified'] = True
        elif "NO" in decisive or "No" in decisive:
            entry['falsified'] = False
    print("Num unsures: ", unsure_total)
    return data

In [5]:
def num_disagreements(data):
    """
    Count "Unsure" entries where the two model outputs point in different directions.

    An entry counts when its 'falsified' value is "Unsure" and one of these holds
    (all checks are substring matches on the raw model outputs):
      * model_0 says yes and model_1 says no or unsure,
      * model_0 says no and model_1 says yes or unsure,
      * model_0 says unsure and model_1 says yes or no.
    The count is printed; nothing is returned.
    """
    def says_yes(text):
        return "YES" in text or "Yes" in text

    def says_no(text):
        return "NO" in text or "No" in text

    def says_unsure(text):
        return "Unsure" in text

    count = 0
    for entry in data:
        first = entry['output']['model_0']
        second = entry['output']['model_1']
        if entry['falsified'] != "Unsure":
            continue
        # The three cases are tried in order and any match counts once.
        clash = (
            (says_yes(first) and (says_no(second) or says_unsure(second)))
            or (says_no(first) and (says_yes(second) or says_unsure(second)))
            or (says_unsure(first) and (says_yes(second) or says_no(second)))
        )
        if clash:
            count += 1
    print("Num disagreements: ", count)

In [6]:
def get_accuracy(data, annotations):
    """
    Compute accuracy of the predictions against the ground truth.

    fix_unsures() is applied first, which rewrites resolvable "Unsure" verdicts
    in ``data`` in place. Each prediction is then coerced with bool(), so any
    verdict that is still the string "Unsure" is truthy and is scored as a
    falsified=True prediction.

    Args:
        data: list of result entries, each with a 'falsified' value.
        annotations: ground-truth entries, indexed by the same position as
            ``data``; must be at least as long as ``data``.

    Returns:
        Fraction of entries whose prediction matches the annotation, or 0.0
        when ``data`` is empty (instead of raising ZeroDivisionError).
    """
    data = fix_unsures(data)
    num_incorrect = sum(
        1
        for idx, entry in enumerate(data)
        if bool(entry['falsified']) != annotations[idx]['falsified']
    )
    print("Num incorrects: ", num_incorrect)
    total = len(data)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    return (total - num_incorrect) / total

In [7]:
def get_incorrect_idx(data, annotations):
    """
    Return the positions of entries whose prediction differs from the annotation.

    fix_unsures() is applied to ``data`` first (modifying it in place). Each
    prediction is coerced with bool() before comparison, so a verdict that is
    still the string "Unsure" is treated as True.
    """
    data = fix_unsures(data)
    return [
        idx
        for idx, entry in enumerate(data)
        if bool(entry['falsified']) != annotations[idx]['falsified']
    ]

In [8]:
def get_acc_without_unsures(data, annotations):
    """
    Compute accuracy over the entries whose verdict is not "Unsure".

    Args:
        data: list of result entries, each with a 'falsified' value that is
            either the string 'Unsure' or a value coerced with bool().
        annotations: ground-truth entries, indexed by the same position as
            ``data``.

    Returns:
        correct / scored, where ``scored`` is the number of non-"Unsure"
        entries. Returns 0.0 when there is nothing to score (empty input or
        every entry "Unsure") instead of raising ZeroDivisionError.
    """
    num_correct = 0
    num_scored = 0
    for idx, entry in enumerate(data):
        verdict = entry['falsified']
        if verdict == 'Unsure':
            continue
        num_scored += 1
        if bool(verdict) == annotations[idx]['falsified']:
            num_correct += 1
    if num_scored == 0:
        return 0.0
    return num_correct / num_scored

In [9]:
def true_positives(data, annotations):
    """
    Count entries annotated as falsified that are also predicted as falsified.

    Predictions are coerced with bool(), so any non-empty string verdict
    (including "Unsure") counts as a positive prediction.
    """
    return sum(
        1
        for idx, entry in enumerate(data)
        if annotations[idx]['falsified'] == True and bool(entry['falsified'])
    )

In [10]:
def false_positives(data, annotations):
    """
    Count entries annotated as not falsified that are predicted as falsified.

    Predictions are coerced with bool(), so any non-empty string verdict
    (including "Unsure") counts as a positive prediction.
    """
    return sum(
        1
        for idx, entry in enumerate(data)
        if annotations[idx]['falsified'] == False and bool(entry['falsified'])
    )

In [11]:
def false_negatives(data, annotations):
    """
    Count entries annotated as falsified that are predicted as not falsified.

    Predictions are coerced with bool(); only falsy verdicts count as negative,
    so a verdict of "Unsure" is never a false negative.
    """
    return sum(
        1
        for idx, entry in enumerate(data)
        if annotations[idx]['falsified'] == True and not bool(entry['falsified'])
    )

In [12]:
def get_precision_and_recall(data, annotations):
    """
    Compute precision and recall for the "falsified" (positive) class.

    Args:
        data: list of result entries, each with a 'falsified' value.
        annotations: ground-truth entries, indexed by the same position as
            ``data``.

    Returns:
        (precision, recall). A metric whose denominator is zero (no predicted
        positives for precision, no annotated positives for recall) is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    tp = true_positives(data, annotations)
    fp = false_positives(data, annotations)
    fn = false_negatives(data, annotations)
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    return precision, recall

### no web access

In [180]:
# Score the run without web access, restricted to the first 1000 results.
no_web_file = "../results/results_no_web_access.json"
result_data = get_results(no_web_file)
result_data = result_data[:1000]
num_disagreements(result_data)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data, test_data_annotations)))


Num disagreements:  4
Num unsures:  675
Num incorrects:  428
Accuracy: 0.5720
Precision: 0.5448
Recall: 0.8640
Accuracy without unsures: 0.7515


### with web access (only when model unsure)

In [38]:
# Score the results stored in results_web_access_no_initial_context.json.
web_access_file = "../results/results_web_access_no_initial_context.json"
result_data2 = get_results(web_access_file)
num_disagreements(result_data2)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data2, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data2, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data2 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data2, test_data_annotations)))

Num disagreements:  16
Num unsures:  329
Accuracy: 0.5960
Precision: 0.5804
Recall: 0.6860
Accuracy without unsures: 0.6433


In [41]:
# Score the results stored in results_initial_context.json.
web_access_file = "../results/results_initial_context.json"
result_data3 = get_results(web_access_file)
num_disagreements(result_data3)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data3, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data3, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data3 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data3, test_data_annotations)))

Num disagreements:  7
Num unsures:  9
Accuracy: 0.8580
Precision: 0.8266
Recall: 0.9060
Accuracy without unsures: 0.8587


### with disambiguation queries

In [16]:
# Score the results stored in results_with_disambiguation.json.
web_access_file = "../results/results_with_disambiguation.json"
result_data4 = get_results(web_access_file)
num_disagreements(result_data4)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data4, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data4, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data4 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data4, test_data_annotations)))

Num disagreements:  5
Num unsures:  12
Accuracy: 0.7730
Precision: 0.7468
Recall: 0.8260
Accuracy without unsures: 0.7786


### with opposite stances and disambiguation queries

In [None]:
# Score the results stored in ../scripts/temp_final_disamb.json.
# NOTE: the name result_data5 is assigned again by the next cell in this notebook.
web_access_file = "../scripts/temp_final_disamb.json"
result_data5 = get_results(web_access_file)
num_disagreements(result_data5)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data5, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data5, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data5 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data5, test_data_annotations)))

In [24]:
# Score the results stored in ../scripts/disamb_res_updated_1.json.
web_access_file = "../scripts/disamb_res_updated_1.json"
result_data5 = get_results(web_access_file)
# The metric helpers index annotations by position in the result list, so a
# result list shorter than test_data_annotations is compared against the
# first len(result_data5) annotations only.
print(len(result_data5))
num_disagreements(result_data5)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data5, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data5, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data5 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data5, test_data_annotations)))

253
Num disagreements:  1
Num unsures:  6
Accuracy: 0.7352
Precision: 0.7153
Recall: 0.7778
Accuracy without unsures: 0.7368


### actor-skeptic setup

In [198]:
# Score the results stored in ../results/as_temp.json.
# The num_disagreements() and get_accuracy() calls are commented out, so
# fix_unsures() is not invoked by this cell and the metrics below use the
# 'falsified' values exactly as loaded from the file.
web_access_file = "../results/as_temp.json"
result_data6 = get_results(web_access_file)
print(len(result_data6))
#num_disagreements(result_data6)
precision, recall = get_precision_and_recall(result_data6, test_data_annotations)
#print("Accuracy: {:.4f}".format(get_accuracy(result_data6, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data6, test_data_annotations)))

1000
Precision: 0.8166
Recall: 0.8640
Accuracy without unsures: 0.8398


### Finetuned model results

In [13]:
# Score the results stored in ../results/finetuned_results.json.
web_access_file = "../results/finetuned_results.json"
result_data7 = get_results(web_access_file)
print(len(result_data7))
num_disagreements(result_data7)
# Precision/recall are taken before get_accuracy() runs, i.e. before
# fix_unsures() rewrites resolvable "Unsure" verdicts in place.
precision, recall = get_precision_and_recall(result_data7, test_data_annotations)
print("Accuracy: {:.4f}".format(get_accuracy(result_data7, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after get_accuracy(), so it sees result_data7 as already modified by fix_unsures().
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data7, test_data_annotations)))

1000
Num disagreements:  8
Num unsures:  25
Num incorrects:  182
Accuracy: 0.8180
Precision: 0.7449
Recall: 0.9580
Accuracy without unsures: 0.8192


### Analysing where model fails (what kinds of examples the model fails on)

In [190]:
from PIL import Image
from utils.data import get_data, show_data

In [191]:
def retrieve_summary(key, summaries_path="../utils/summaries.json"):
    """
    Look up one entry in the summaries JSON file.

    Args:
        key: key of the entry to return.
        summaries_path: path to the JSON file holding the summaries; defaults
            to the location this notebook has always read from.

    Returns:
        The value stored under ``key``.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if ``key`` is not present in the file.
    """
    # The file is re-read on every call, so edits to it are picked up immediately.
    with open(summaries_path, "r") as f:
        summaries = json.load(f)
    return summaries[key]

In [195]:
def failed_samples(incorrect_idx, res_data):
    """
    Step interactively through the samples listed in ``incorrect_idx``.

    For each index the image is displayed, show_data() is called, and the
    associated summary, the model's 'falsified' verdict and its 'output' are
    printed. The loop then waits for a line of input; typing "exit" stops it.
    """
    for sample_idx in incorrect_idx:
        # get_data() yields four values; only the image and annotation are used.
        img, _caption, _, annotation = get_data(sample_idx)
        display(img)
        show_data(sample_idx)

        # Summaries are keyed by "<id>_<image_id>".
        summary_key = "_".join([str(annotation['id']), str(annotation['image_id'])])
        print("Associated summary: ", retrieve_summary(summary_key))

        result = res_data[sample_idx]
        print("Model_prediction: ", result['falsified'])
        print("Model arguments: ", result['output'])

        if input() == "exit":
            break

In [197]:
# List the positions where the finetuned-model predictions disagree with the
# annotations. get_incorrect_idx() calls fix_unsures(), which prints the
# "Num unsures" line and modifies result_data7 in place.
incorrect_idx = get_incorrect_idx(result_data7, test_data_annotations)
print(incorrect_idx)
# Interactive walkthrough of the failures, left disabled.
#failed_samples(incorrect_idx, result_data7)

Num unsures:  39
[2, 14, 20, 23, 31, 36, 44, 54, 62, 64, 74, 76, 78, 80, 86, 98, 110, 112, 114, 116, 117, 118, 120, 130, 136, 140, 142, 150, 152, 154, 158, 160, 164, 166, 167, 168, 170, 174, 175, 183, 184, 189, 190, 199, 205, 206, 218, 224, 229, 234, 235, 241, 242, 244, 245, 246, 254, 260, 267, 272, 274, 284, 286, 294, 296, 300, 304, 308, 316, 322, 324, 330, 334, 335, 338, 341, 342, 343, 348, 354, 360, 361, 362, 366, 371, 372, 375, 376, 380, 382, 384, 386, 398, 404, 408, 410, 412, 416, 418, 425, 426, 428, 432, 436, 438, 456, 464, 466, 470, 473, 480, 481, 490, 498, 502, 516, 520, 524, 528, 530, 532, 534, 535, 543, 552, 554, 556, 562, 566, 568, 574, 580, 582, 586, 590, 596, 600, 602, 608, 613, 614, 623, 626, 628, 636, 638, 640, 642, 644, 648, 650, 651, 652, 656, 666, 672, 676, 678, 684, 688, 693, 698, 700, 710, 712, 720, 721, 722, 724, 726, 728, 732, 734, 736, 738, 740, 744, 760, 762, 768, 771, 772, 774, 778, 786, 790, 794, 798, 802, 806, 810, 814, 816, 822, 824, 826, 833, 835, 836, 838,