In [30]:
import json 
import pandas as pd 
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from skimage import io 
import pickle
import seaborn as sns 

In [None]:
def preds_to_df(preds_path):

    coco_train = COCO('../coco/annotations/instances_train2014.json')
    coco_val = COCO('../coco/annotations/instances_val2014.json')

    with open(preds_path) as fp:
        model_preds = json.load(fp)

    preproc = []

    for i in model_preds:
        img_id = str(i['img_id'])
        if 'val' in img_id:
            loaded_img = coco_val.loadImgs([int(img_id[-6:])])
        else:
            loaded_img = coco_train.loadImgs([int(img_id[-6:])])
        img_url = loaded_img[0]['coco_url']

        label = i['label']
        label_list = list(label.keys())
        i['label'] = ', '.join(label_list)
        i['img_url'] = img_url
        preproc.append(i)

    df = pd.json_normalize(preproc)

    return df

def validation_per_question(df):
    validation_questions = {}
    unique_questions = df['question_type'].unique()
    grouped = df.groupby(df.question_type)
    for ques in unique_questions:
        df1 = grouped.get_group(ques)
        val_score = df1['score'].sum() / len(df1)
        validation_questions[ques] = val_score
    return validation_questions
    
def validation_per_answer(df):
    validation_answers = {}
    unique_answers= df['answer_type'].unique()
    grouped = df.groupby(df.answer_type)
    for ans in unique_answers:
        df1 = grouped.get_group(ans)
        val_score = df1['score'].sum() / len(df1)
        validation_answers[ans] = val_score
    return validation_answers

def plot_preds_question_type(df):
    unique_questions = df['question_type'].unique()

    grouped = df.groupby(df.question_type)
    for ques in unique_questions:
        print("QUESTION: ", ques)
        df_ques = grouped.get_group(ques)
        for j in range(3):
            example = df_ques.iloc[[j]]
            ans = example['answer']
            question = example['question']
            label = example['label']
            img_url = example['img_url']
            a = io.imread(img_url)
            plt.figure()
            plt.imshow(a)
            plt.title("Labels: "+ label + "\n Preds: " + ans + "\n Question: " + question)

def compare_dfs_stats(df1, df2):
    # take the intersection of two dfs and print question types, answer types, and target stats
    df1_question_ids = set(df1['question_id'].tolist())
    df2_question_ids = set(df2['question_id'].tolist())

    ids_intersection = df1_question_ids.intersection(df2_question_ids)

    df_intersection = df1[df1['question_id'].isin(ids_intersection)]


    df_intersection_questions = df_intersection['question_type'].unique()
    df_intersection_questions_count = df_intersection['question_type'].value_counts()
    df_intersection_questions_count.plot(kind='pie')

    # segment df_intersection by question type and plot preds 
    plot_preds_question_type(df_intersection, df_intersection_questions)




    df_intersection_answers = df_intersection['answer_type'].unique()
    df_intersection_answers_count = df_intersection['answer_type'].value_counts()
    df_intersection_answers_count.plot(kind='pie')

    all_targets = []
    df_intersection_targets = df_intersection['label'].tolist()
    for i in df_intersection_targets:
        target_list = [x.strip() for x in i.split(',')]
        all_targets.extend(target_list)

    #convert list to string and generate
    unique_string=(" ").join(all_targets)
    wordcloud = WordCloud(width = 1000, height = 500).generate(unique_string)
    plt.figure(figsize=(15,8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    plt.close()



def df_stats(df):
    
    df_questions = df['question_type'].unique()
    df_question_counts = df['question_type'].value_counts()
    df_question_counts.plot(kind='pie')


    df_answers = df['answer_type'].unique()
    df_answers_count = df['answer_type'].value_counts()
    df_answers_count.plot(kind='pie')


    df_targets = df['label'].tolist()
    all_targets = []
    for i in df_targets:
        target_list = [x.strip() for x in i.split(',')]
        all_targets.extend(target_list)

    #convert list to string and generate
    unique_string=(" ").join(all_targets)
    wordcloud = WordCloud(width = 1000, height = 500).generate(unique_string)
    plt.figure(figsize=(15,8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    plt.close()

    plot_preds_question_type(df)

def calc_question_types(df):
    questions = df['Question'].tolist()
    questions = [i[0].lower() for i in questions]
    df['Question'] = questions

    with open("../../../data/vqa/mscoco_question_types.txt") as f:
        question_types_list = f.read().splitlines() 
    question_types_list = list(set(question_types_list))
    question_types_list = sorted(question_types_list, key=len) 
    question_types_list.reverse()

    df_question_types = []

    for idx, question in enumerate(questions):
        counter = 0
        matches = []
        for question_type in question_types_list:  
            if question.startswith(question_type):
                matches.append(question_type)
        if len(matches) != 0:
            quest_type_max = max(matches, key=len)
            df_question_types.append(quest_type_max)
            counter+=1
        if counter == 0:
            df_question_types.append('none of the above')

    assert len(df_question_types) == len(questions)

    df['question_type'] = df_question_types
    return df

def variability_histogram(base_path, sampling_ids=None):
    df = pd.read_pickle(base_path+"datamap_metrics.pkl")
    df = calc_question_types(df)
    if sampling_ids != None:
        with open(sampling_ids, 'rb') as f:
            sampled_ids = pickle.load(f)
        df = df.loc[df['question_id'].isin(sampled_ids)]
    unique_questions = df['question_type'].unique()
    grouped = df.groupby(df.question_type)
    for ques in unique_questions:
        df1 = grouped.get_group(ques)
        plt.figure()
        sns.histplot(df1['variability'].tolist(), kde=True).set(title=ques)
        plt.show()


def confidence_histogram(base_path, sampling_ids=None):
    df = pd.read_pickle(base_path+"datamap_metrics.pkl")
    df = calc_question_types(df)
    if sampling_ids != None:
        with open(sampling_ids, 'rb') as f:
            sampled_ids = pickle.load(f)
        df = df.loc[df['question_id'].isin(sampled_ids)]
    unique_questions = df['question_type'].unique()
    grouped = df.groupby(df.question_type)
    for ques in unique_questions:
        df1 = grouped.get_group(ques)
        plt.figure()
        sns.histplot(df1['confidence'].tolist(), kde=True).set(title=ques)
        plt.show()


def compare_dfs(full_df, sampled_df):
    # segment all the preds that full model got wrong -- 0.0
    # segment all the preds sampled model got right -- > 0.0 
    # case 1 - sampled model gets it wrong, full model gets it right
    # case 2 - sampled model gets it right, full model gets it wrong

    full_df_wrong_preds = full_df[full_df['score'] == 0.0]
    sampled_df_wrong_preds = sampled_df[sampled_df['score'] == 0.0]

    full_df_right_preds = full_df[full_df['score'] > 0.0]
    sampled_df_right_preds = sampled_df[sampled_df['score'] > 0.0]

    # case 1:
    compare_dfs_stats(full_df_wrong_preds, sampled_df_right_preds)
    # case 2: 
    compare_dfs_stats(full_df_right_preds, sampled_df_wrong_preds)

    # independent of overlap, look at what each model is getting wrong -- question types, target types, and for each quest type and target type, plot a few
    df_stats(full_df_wrong_preds)
    df_stats(sampled_df_wrong_preds)
    df_stats(full_df)
    df_stats(sampled_df)

In [21]:
normal_near_mode_var_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_normal_near_mode_var0.1_0.4_results/minival_predict.json'
#global_random_path = ''
beta_pvals_var_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_beta_pvals_var_21_results/minival_predict.json'
full_model_path = '/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_full_run_results/minival_predict.json'
base_path = '../../../snap/vqa/lxr111_multilabel_full_run_3/'

df_normal_near_mode_var = preds_to_df(normal_near_mode_var_path)
df_beta_pvals_var = preds_to_df(beta_pvals_var_path)


loading annotations into memory...
Done (t=9.45s)
creating index...
index created!
loading annotations into memory...
Done (t=5.25s)
creating index...
index created!


In [32]:
df

Unnamed: 0,answer,answer_type,img_id,label,question,question_id,question_type,score,img_url
0,pink,other,COCO_val2014_000000393267,"black, blonde",What color is the woman's shirt on the left?,393267000,what color is the,0.0,http://images.cocodataset.org/val2014/COCO_val...
1,wine,other,COCO_val2014_000000393267,wine,What type of beverage is being displayed?,393267001,what type of,1.0,http://images.cocodataset.org/val2014/COCO_val...
2,costume,other,COCO_val2014_000000393267,"fashion, style, they aren't",Why are some people wearing hats?,393267002,why,0.0,http://images.cocodataset.org/val2014/COCO_val...
3,meeting,other,COCO_val2014_000000393267,"wedding, wine, wine tasting",What kind of event are the people involved in?,393267003,what kind of,0.0,http://images.cocodataset.org/val2014/COCO_val...
4,brick,other,COCO_val2014_000000262228,"brick, bricks",What is the ground made of?,262228000,what is the,1.0,http://images.cocodataset.org/val2014/COCO_val...
...,...,...,...,...,...,...,...,...,...
25989,10,number,COCO_val2014_000000131063,"100, 20, 40, 50, lot, many",How many tiles are there?,131063001,how many,0.0,http://images.cocodataset.org/val2014/COCO_val...
25990,no,yes/no,COCO_val2014_000000131063,yes,Was this photo taken at someone's house?,131063002,was,0.0,http://images.cocodataset.org/val2014/COCO_val...
25991,person,other,COCO_val2014_000000131063,"family, people, woman, women",Who uses this bathroom?,131063003,none of the above,0.0,http://images.cocodataset.org/val2014/COCO_val...
25992,marble,other,COCO_val2014_000000131063,"bathroom, don't know, not sure",What is the name of this his and her bathroom?,131063004,what is the name,0.0,http://images.cocodataset.org/val2014/COCO_val...


In [31]:
validation_per_question(df)
validation_per_answer(df)

Question:  what color is the
Score:  0.7312953995157384
Question:  what type of
Score:  0.5243353783231084
Question:  why
Score:  0.15159574468085107
Question:  what kind of
Score:  0.5130116959064327
Question:  what is the
Score:  0.42981684981684987
Question:  are these
Score:  0.7264864864864865
Question:  none of the above
Score:  0.5401886792452829
Question:  what is on the
Score:  0.40355731225296443
Question:  is this
Score:  0.7249492900608518
Question:  how many
Score:  0.4489361702127659
Question:  does this
Score:  0.7291338582677165
Question:  does the
Score:  0.7294594594594594
Question:  is the person
Score:  0.7413461538461538
Question:  are
Score:  0.6915492957746479
Question:  how many people are
Score:  0.5265560165975104
Question:  is the man
Score:  0.7443661971830986
Question:  do
Score:  0.7427710843373495
Question:  is it
Score:  0.8524137931034482
Question:  are there
Score:  0.762883435582822
Question:  what
Score:  0.4233468972533062
Question:  is the
Score:  

{'other': 0.5070316186880603,
 'yes/no': 0.7418102130261952,
 'number': 0.3946382242721245}