In [30]:
import json 
import pandas as pd 
from pycocotools.coco import COCO


def preds_to_df(preds_path,
                train_ann_path='../coco/annotations/instances_train2014.json',
                val_ann_path='../coco/annotations/instances_val2014.json'):
    """Load a predictions JSON file into a flat DataFrame with image URLs.

    Each prediction record is expected to carry an ``img_id`` and a
    ``label`` mapping. The ``label`` mapping is collapsed to a
    comma-separated string of its keys, and an ``img_url`` field is added
    from the matching COCO image entry (``coco_url``).

    Parameters
    ----------
    preds_path : str
        Path to the JSON file holding a list of prediction records.
    train_ann_path : str, optional
        COCO instances annotation file used for ids that do not contain
        ``'val'``.
    val_ann_path : str, optional
        COCO instances annotation file used for ids that contain ``'val'``.

    Returns
    -------
    pandas.DataFrame
        One row per prediction, produced by ``pd.json_normalize``.

    Raises
    ------
    KeyError
        If a record lacks ``img_id``/``label`` or the image id is not
        present in the selected annotation file.
    """
    # Building the COCO indexes is the slow part (several seconds each).
    coco_train = COCO(train_ann_path)
    coco_val = COCO(val_ann_path)

    with open(preds_path) as fp:
        model_preds = json.load(fp)

    records = []
    for pred in model_preds:
        img_id = str(pred['img_id'])
        # The split is identified by the substring 'val' in the image id.
        coco_split = coco_val if 'val' in img_id else coco_train
        # The numeric COCO image id is taken from the last six characters.
        numeric_id = int(img_id[-6:])
        img_url = coco_split.loadImgs([numeric_id])[0]['coco_url']

        # Build a new record instead of mutating the loaded one; 'label'
        # keeps its original position and 'img_url' is appended last.
        records.append({
            **pred,
            'label': ', '.join(pred['label'].keys()),
            'img_url': img_url,
        })

    return pd.json_normalize(records)

def _mean_score_by(df, column):
    """Return ``{group value: sum of 'score' / number of rows}`` for ``column``.

    Groups are visited in order of first appearance in ``df``. Rows whose
    ``column`` value is missing (NaN/None) are left out, because pandas
    drops missing group keys by default.
    """
    return {
        key: float(scores.sum() / len(scores))
        for key, scores in df.groupby(column, sort=False)['score']
    }


def validation_per_question(df):
    """Average ``score`` for every distinct ``question_type`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``question_type`` and ``score`` columns.

    Returns
    -------
    dict
        Maps each question type to its summed score divided by its row
        count. Empty when ``df`` has no rows.
    """
    return _mean_score_by(df, 'question_type')


def validation_per_answer(df):
    """Average ``score`` for every distinct ``answer_type`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``answer_type`` and ``score`` columns.

    Returns
    -------
    dict
        Maps each answer type to its summed score divided by its row
        count. Empty when ``df`` has no rows.
    """
    return _mean_score_by(df, 'answer_type')


def compare_dfs(full_df, sampled_df):
    """Split two prediction frames into wrong and right subsets by ``score``.

    A prediction is treated as wrong when its ``score`` equals 0.0 and as
    right when its ``score`` is greater than 0.0.

    NOTE: work in progress. The four subsets are computed but not yet used,
    so the function currently returns ``None``.

    TODO:
      * look at the overlap between what the full model got wrong and what
        the sampled model got right, and vice versa;
      * for the overlap: break down by question type and target type, and
        plot a few examples for each;
      * independent of the overlap: look at what each model gets wrong by
        question type and target type, and plot a few examples for each.
    """
    full_scores = full_df['score']
    sampled_scores = sampled_df['score']

    # Predictions that received no credit at all.
    full_df_wrong_preds = full_df.loc[full_scores.eq(0.0)]
    sampled_df_wrong_preds = sampled_df.loc[sampled_scores.eq(0.0)]

    # Predictions that received at least partial credit.
    full_df_right_preds = full_df.loc[full_scores.gt(0.0)]
    sampled_df_right_preds = sampled_df.loc[sampled_scores.gt(0.0)]

    






In [21]:
# Locations of the minival prediction files, one per run.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere without editing them.
normal_near_mode_var_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_normal_near_mode_var0.1_0.4_results/minival_predict.json'
# TODO: path for the global-random run has not been filled in yet.
#global_random_path = ''
beta_pvals_var_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_beta_pvals_var_21_results/minival_predict.json'
full_model_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_full_run_results/minival_predict.json'
# Only the normal_near_mode_var predictions are loaded here; the other two
# paths are defined but not used in this cell.
df = preds_to_df(normal_near_mode_var_path)

loading annotations into memory...
Done (t=9.45s)
creating index...
index created!
loading annotations into memory...
Done (t=5.25s)
creating index...
index created!


In [31]:
# Both helpers return dicts, and a notebook cell only echoes its last
# expression, so bind both results and display them explicitly.
question_scores = validation_per_question(df)
answer_scores = validation_per_answer(df)
display(question_scores, answer_scores)

Question:  what color is the
Score:  0.7312953995157384
Question:  what type of
Score:  0.5243353783231084
Question:  why
Score:  0.15159574468085107
Question:  what kind of
Score:  0.5130116959064327
Question:  what is the
Score:  0.42981684981684987
Question:  are these
Score:  0.7264864864864865
Question:  none of the above
Score:  0.5401886792452829
Question:  what is on the
Score:  0.40355731225296443
Question:  is this
Score:  0.7249492900608518
Question:  how many
Score:  0.4489361702127659
Question:  does this
Score:  0.7291338582677165
Question:  does the
Score:  0.7294594594594594
Question:  is the person
Score:  0.7413461538461538
Question:  are
Score:  0.6915492957746479
Question:  how many people are
Score:  0.5265560165975104
Question:  is the man
Score:  0.7443661971830986
Question:  do
Score:  0.7427710843373495
Question:  is it
Score:  0.8524137931034482
Question:  are there
Score:  0.762883435582822
Question:  what
Score:  0.4233468972533062
Question:  is the
Score:  

{'other': 0.5070316186880603,
 'yes/no': 0.7418102130261952,
 'number': 0.3946382242721245}