In [1]:
import sys
sys.path.append("..")

import json
import numpy as np

In [2]:
def get_results(filename):
    """Read a JSON results file and return its parsed content.

    Args:
        filename: Path of the JSON file to load.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filename, "r") as handle:
        return json.load(handle)

In [3]:
def get_test_data_annotations():
    """Load the ground-truth annotations of the merged-balanced test split.

    Reads ``test.json`` (path is relative to the current working directory)
    and returns the value stored under its ``"annotations"`` key.

    Only the test split is read: an id -> annotation mapping built from the
    VisualNews ``data.json`` was never used or returned, so that load is not
    performed. The file is opened in a ``with`` block so the handle is closed
    as soon as parsing finishes.

    Returns:
        The object stored under ``"annotations"`` in the test split file.
    """
    with open("../../news_clippings/news_clippings/data/merged_balanced/test.json") as f:
        test_data = json.load(f)
    return test_data["annotations"]
# Loaded once for the whole notebook; the metric helpers index this list by
# position alongside the model results (annotations[i] vs. data[i]).
test_data_annotations = get_test_data_annotations()

In [4]:
def fix_unsures(data):
    """
    Resolve "Unsure" verdicts for which exactly one model produced output.

    Every entry whose 'falsified' field equals "Unsure" is counted. When one
    of the two model outputs is the empty string and the other is not, the
    non-empty output decides the verdict: it becomes True if that output
    contains "YES", otherwise False if it contains "NO", and stays "Unsure"
    if it contains neither. Entries are modified in place.

    Prints the number of "Unsure" entries encountered (counted before any
    fix is applied) and returns the same ``data`` object.
    """
    unsure_count = 0
    for entry in data:
        if entry['falsified'] != "Unsure":
            continue
        unsure_count += 1

        outputs = entry['output']
        first, second = outputs['model_0'], outputs['model_1']
        first_silent = first == ""
        second_silent = second == ""
        if first_silent == second_silent:
            # Both silent or both answered: nothing to arbitrate.
            continue

        # Exactly one model answered; its text decides the verdict.
        verdict_text = second if first_silent else first
        if "YES" in verdict_text:
            entry['falsified'] = True
        elif "NO" in verdict_text:
            entry['falsified'] = False
    print("Num unsures: ", unsure_count)
    return data

In [5]:
def num_disagreements(data):
    """
    Count and print how many "Unsure" entries stem from conflicting answers.

    An entry counts when its 'falsified' field is "Unsure" and the two model
    outputs carry different markers among "YES", "NO" and "UNSURE" (substring
    match): model_0 contains one marker while model_1 contains one of the
    other two. Nothing is returned; the total is printed.
    """
    # Marker found in model_0 -> markers in model_1 that conflict with it.
    conflicts = (
        ("YES", ("NO", "UNSURE")),
        ("NO", ("YES", "UNSURE")),
        ("UNSURE", ("YES", "NO")),
    )
    total = 0
    for entry in data:
        outputs = entry['output']
        first, second = outputs['model_0'], outputs['model_1']
        if entry['falsified'] != "Unsure":
            continue
        if any(
            own in first and any(other in second for other in others)
            for own, others in conflicts
        ):
            total += 1
    print("Num disagreements: ", total)

In [6]:
def get_accuracy(data, annotations):
    """Return the fraction of predictions that match the ground truth.

    ``fix_unsures`` is applied first, so ``data`` is modified in place and the
    "Num unsures" line is printed as a side effect. Each prediction is then
    coerced with ``bool()`` before the comparison; a verdict that is still the
    string "Unsure" is truthy and is therefore scored as a "falsified"
    prediction.

    Args:
        data: Sequence of result entries, each with a 'falsified' field.
        annotations: Ground-truth entries aligned with ``data`` by position,
            each with a 'falsified' field.

    Returns:
        The accuracy as a float; 0.0 when ``data`` is empty (rather than
        raising ZeroDivisionError).
    """
    data = fix_unsures(data)
    if len(data) == 0:
        # No predictions to score: avoid dividing by zero.
        return 0.0
    num_correct = sum(
        1
        for i in range(len(data))
        if bool(data[i]['falsified']) == annotations[i]['falsified']
    )
    return num_correct / len(data)

In [7]:
def get_incorrect_idx(data, annotations):
    """Return the positions whose prediction differs from the ground truth.

    ``fix_unsures`` runs first (modifying ``data`` in place and printing the
    "Num unsures" line). Predictions are coerced with ``bool()`` before being
    compared with ``annotations[idx]['falsified']``.

    Returns:
        List of integer indices into ``data``, in ascending order.
    """
    data = fix_unsures(data)
    return [
        idx
        for idx in range(len(data))
        if bool(data[idx]['falsified']) != annotations[idx]['falsified']
    ]

In [8]:
def get_acc_without_unsures(data, annotations):
    """Return the accuracy over the entries whose verdict is not "Unsure".

    Entries whose 'falsified' field equals the string 'Unsure' are skipped.
    The remaining predictions are coerced with ``bool()`` and compared with
    ``annotations[i]['falsified']``.

    Args:
        data: Sequence of result entries, each with a 'falsified' field.
        annotations: Ground-truth entries aligned with ``data`` by position.

    Returns:
        correct / decided as a float; 0.0 when there is no decided entry
        (empty input or every verdict "Unsure") instead of raising
        ZeroDivisionError.
    """
    num_correct = 0
    num_decided = 0
    for i in range(len(data)):
        if data[i]['falsified'] == 'Unsure':
            continue
        num_decided += 1
        if bool(data[i]['falsified']) == annotations[i]['falsified']:
            num_correct += 1
    if num_decided == 0:
        # Nothing left to score once unsure verdicts are excluded.
        return 0.0
    return num_correct / num_decided

In [9]:
def true_positives(data, annotations):
    """Count entries labelled falsified that were also predicted as falsified.

    Predictions are coerced with ``bool()``, so any truthy 'falsified' value
    (including a non-empty string such as "Unsure") counts as a positive
    prediction.
    """
    return sum(
        1
        for idx in range(len(data))
        # `== True` is kept on purpose: the label is compared, not truth-tested.
        if annotations[idx]['falsified'] == True and bool(data[idx]['falsified'])
    )

In [10]:
def false_positives(data, annotations):
    """Count entries labelled not falsified that were predicted as falsified.

    Predictions are coerced with ``bool()``, so any truthy 'falsified' value
    (including a non-empty string such as "Unsure") counts as a positive
    prediction.
    """
    return sum(
        1
        for idx in range(len(data))
        # `== False` is kept on purpose: the label is compared, not truth-tested.
        if annotations[idx]['falsified'] == False and bool(data[idx]['falsified'])
    )

In [11]:
def false_negatives(data, annotations):
    """Count entries labelled falsified that were predicted as not falsified.

    Predictions are coerced with ``bool()``; only falsy 'falsified' values
    count as negative predictions, so the string "Unsure" never does.
    """
    return sum(
        1
        for idx in range(len(data))
        # `== True` is kept on purpose: the label is compared, not truth-tested.
        if annotations[idx]['falsified'] == True and not bool(data[idx]['falsified'])
    )

In [12]:
def get_precision_and_recall(data, annotations):
    """Compute precision and recall of the 'falsified' predictions.

    Counts come from ``true_positives``, ``false_positives`` and
    ``false_negatives``; ``data`` is not modified here.

    Args:
        data: Sequence of result entries, each with a 'falsified' field.
        annotations: Ground-truth entries aligned with ``data`` by position.

    Returns:
        Tuple ``(precision, recall)``. A ratio whose denominator is zero
        (no positive predictions, or no positive labels) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    tp = true_positives(data, annotations)
    fp = false_positives(data, annotations)
    fn = false_negatives(data, annotations)
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    return precision, recall

### no web access

In [15]:
# Evaluate results_no_web_access.json on its first 1000 entries only.
no_web_file = "../results/results_no_web_access.json"
result_data = get_results(no_web_file)
result_data = result_data[:1000]
num_disagreements(result_data)
# Order matters below: precision/recall are computed on the raw verdicts, where
# bool("Unsure") is True, i.e. unresolved entries count as "falsified" predictions.
precision, recall = get_precision_and_recall(result_data, test_data_annotations)
# get_accuracy() calls fix_unsures(), which rewrites the entries of result_data in place.
print("Accuracy: {:.4f}".format(get_accuracy(result_data, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after the in-place fix, so only verdicts still equal to "Unsure" are excluded here.
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data, test_data_annotations)))


Num disagreements:  8
Num unsures:  675
Accuracy: 0.5720
Precision: 0.5448
Recall: 0.8640
Accuracy without unsures: 0.7515


### with web access (only when model unsure)

In [16]:
# Evaluate results_web_access_no_initial_context.json (all entries, no slicing).
web_access_file = "../results/results_web_access_no_initial_context.json"
result_data2 = get_results(web_access_file)
num_disagreements(result_data2)
# Order matters below: precision/recall are computed on the raw verdicts, where
# bool("Unsure") is True, i.e. unresolved entries count as "falsified" predictions.
precision, recall = get_precision_and_recall(result_data2, test_data_annotations)
# get_accuracy() calls fix_unsures(), which rewrites the entries of result_data2 in place.
print("Accuracy: {:.4f}".format(get_accuracy(result_data2, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after the in-place fix, so only verdicts still equal to "Unsure" are excluded here.
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data2, test_data_annotations)))

Num disagreements:  16
Num unsures:  329
Accuracy: 0.5960
Precision: 0.5804
Recall: 0.6860
Accuracy without unsures: 0.6433


In [14]:
# Evaluate results_initial_context.json (all entries, no slicing).
# NOTE(review): `web_access_file` is reused from the previous cell for a different file.
# NOTE(review): the recorded execution count of this cell (14) is lower than that of
# the two cells above (15, 16), so the saved outputs were not produced top-to-bottom.
web_access_file = "../results/results_initial_context.json"
result_data3 = get_results(web_access_file)
num_disagreements(result_data3)
# Order matters below: precision/recall are computed on the raw verdicts, where
# bool("Unsure") is True, i.e. unresolved entries count as "falsified" predictions.
precision, recall = get_precision_and_recall(result_data3, test_data_annotations)
# get_accuracy() calls fix_unsures(), which rewrites the entries of result_data3 in place.
print("Accuracy: {:.4f}".format(get_accuracy(result_data3, test_data_annotations)))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Runs after the in-place fix, so only verdicts still equal to "Unsure" are excluded here.
print("Accuracy without unsures: {:.4f}".format(get_acc_without_unsures(result_data3, test_data_annotations)))

Num disagreements:  7
Num unsures:  9
Accuracy: 0.8580
Precision: 0.8266
Recall: 0.9060
Accuracy without unsures: 0.8587


### Analysing where model fails (what kinds of examples the model fails on)

In [19]:
from PIL import Image
from utils.data import get_data, show_data

In [20]:
def retrieve_summary(key):
    """Look up one entry of ``../scripts/final_summaries.json``.

    The file is opened and parsed on every call.

    Args:
        key: Key to look up in the parsed JSON object.

    Returns:
        The value stored under ``key``.

    Raises:
        KeyError: If ``key`` is not present in the parsed object.
    """
    with open("../scripts/final_summaries.json", "r") as handle:
        summaries = json.load(handle)
    return summaries[key]

In [21]:
def failed_samples(incorrect_idx, res_data):
    """Interactively step through misclassified samples.

    For each index in ``incorrect_idx`` the image returned by ``get_data`` is
    displayed, ``show_data`` is called, and the associated summary (looked up
    under "<annotation id>_<annotation image_id>"), the model prediction and
    the raw model outputs are printed. The loop then waits on ``input()``;
    typing "exit" stops the walk-through, anything else moves on.

    Args:
        incorrect_idx: Iterable of sample indices to inspect.
        res_data: Result entries indexed by those sample indices, each with
            'falsified' and 'output' fields.
    """
    for sample_idx in incorrect_idx:
        image, _caption, _, annotation = get_data(sample_idx)
        display(image)
        show_data(sample_idx)
        summary_key = "_".join((str(annotation['id']), str(annotation['image_id'])))
        print("Associated summary: ", retrieve_summary(summary_key))
        result = res_data[sample_idx]
        print("Model_prediction: ", result['falsified'])
        print("Model arguments: ", result['output'])
        if input() == "exit":
            break

In [23]:
# get_incorrect_idx() runs fix_unsures() on result_data3 again, which prints
# another "Num unsures" line and may rewrite further verdicts in place.
incorrect_idx = get_incorrect_idx(result_data3, test_data_annotations)
print(incorrect_idx)
# Interactive walk-through left disabled; failed_samples() waits on input() for every sample.
#failed_samples(incorrect_idx, result_data3)

Num unsures:  8
[3, 4, 14, 17, 19, 31, 36, 38, 44, 47, 53, 62, 64, 67, 74, 76, 78, 85, 88, 110, 112, 113, 116, 117, 120, 132, 141, 146, 149, 150, 152, 154, 164, 167, 170, 172, 174, 175, 183, 190, 199, 201, 203, 205, 207, 216, 218, 223, 224, 226, 229, 234, 235, 237, 241, 242, 245, 253, 254, 256, 260, 267, 272, 274, 278, 286, 295, 296, 304, 305, 312, 316, 321, 322, 324, 332, 338, 339, 341, 342, 343, 344, 348, 352, 354, 367, 369, 371, 376, 384, 394, 398, 400, 404, 405, 414, 415, 418, 424, 425, 426, 427, 428, 432, 438, 451, 453, 454, 455, 456, 464, 466, 470, 480, 481, 484, 488, 493, 496, 502, 503, 505, 510, 513, 517, 518, 528, 529, 531, 532, 533, 535, 540, 541, 551, 556, 559, 566, 568, 580, 582, 593, 596, 600, 602, 608, 613, 614, 623, 626, 628, 636, 638, 640, 644, 651, 657, 661, 669, 672, 678, 689, 694, 698, 700, 708, 710, 712, 716, 719, 722, 724, 726, 728, 732, 736, 738, 740, 757, 759, 762, 763, 768, 772, 774, 784, 786, 787, 788, 790, 794, 811, 814, 819, 823, 824, 826, 827, 833, 835, 843,