In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import math 
import collections
from pycocotools.coco import COCO
import requests
import plotly.express as px
import plotly.graph_objects as go
from os import listdir
from os.path import isfile, join
import base64
import itertools
# Notebook-wide display configuration.
# Show every DataFrame column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Never truncate cell contents. `None` is the supported "no limit" value: the
# old `-1` sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Seaborn theme applied to every matplotlib figure created below.
sns.set(style='whitegrid', font_scale=1.6, font='Georgia', context='paper')

from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import operator
import plotly.figure_factory as ff
import scipy
import pickle 
from sklearn.neighbors import KernelDensity
from tqdm import tqdm
from collections import OrderedDict

In [None]:
# Matches the trailing "_budget_<N>.pkl" of a sample-ids file name for any budget size.
_BUDGET_SUFFIX_RE = re.compile(r'_budget_\d+\.pkl$')


def _strip_budget_suffix(filename):
    """Return `filename` without its trailing ``_budget_<N>.pkl`` part.

    Falls back to the historical fixed-width slice (drop the last 14
    characters) when the name does not follow that pattern, so labels for
    unconventional file names stay what they used to be.
    """
    stripped, n_subs = _BUDGET_SUFFIX_RE.subn('', filename)
    return stripped if n_subs else filename[:-14]


def _sampling_name(sampled_question_ids_path):
    """Build the x-axis label for one entry of `sampling_ids_list`.

    The label is derived from the '/'-separated components of the entry
    exactly as given in the list (i.e. before any base directory is prepended).
    """
    split_path = sampled_question_ids_path.split('/')
    if 'beta_kernel' in sampled_question_ids_path:
        # e.g. beta/beta_kernel/<kernel>/seed_x/alpha_a_beta_b_budget_n.pkl -> "<kernel>_alpha_a_beta_b"
        return split_path[2] + '_' + _strip_budget_suffix(split_path[-1])
    if 'beta_pvals' in sampled_question_ids_path or 'beta_var_counts' in sampled_question_ids_path:
        # e.g. beta/beta_pvals/seed_x/alpha_a_beta_b_budget_n.pkl -> "beta_pvals_alpha_a_beta_b"
        return split_path[1] + '_' + _strip_budget_suffix(split_path[-1])
    if 'global' in sampled_question_ids_path:
        return split_path[0]
    return 'classwise_' + split_path[0]


def _resolve_sample_path(sampling_base, sampled_question_ids_path):
    """Return the on-disk location of a sample-ids pickle.

    Entries are resolved against `sampling_base`. For backward compatibility
    with callers that already pass directly openable paths, the entry is used
    as-is when `sampling_base` is empty, or when the joined path does not exist
    but the raw entry does.
    """
    if not sampling_base:
        return sampled_question_ids_path
    joined = os.path.join(sampling_base, sampled_question_ids_path)
    if not os.path.exists(joined) and os.path.exists(sampled_question_ids_path):
        return sampled_question_ids_path
    return joined


def _iter_sampled_frames(base_path, sampling_base, sampling_ids_list):
    """Yield ``(label, df_sampled)`` for every entry of `sampling_ids_list`.

    `df_sampled` holds the rows of ``<base_path>datamap_metrics.pkl`` whose
    ``question_id`` is contained in the pickled id collection of that entry.
    """
    df = pd.read_pickle(base_path + "datamap_metrics.pkl")
    for sampled_question_ids_path in sampling_ids_list:
        sample_file = _resolve_sample_path(sampling_base, sampled_question_ids_path)
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only point this at sample files produced by this project.
        with open(sample_file, 'rb') as f:
            sampled_ids = pickle.load(f)
        yield _sampling_name(sampled_question_ids_path), df.loc[df['question_id'].isin(sampled_ids)]


def _binned_counts(values, edges):
    """Count how many entries of `values` fall into each consecutive pair of `edges`.

    Every bin is right-inclusive ``(lo, hi]``; the first bin additionally
    includes its lower edge so values equal to ``edges[0]`` are not dropped.
    """
    counts = []
    for index, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        above_lo = (values >= lo) if index == 0 else (values > lo)
        counts.append(int((above_lo & (values <= hi)).sum()))
    return counts


def _stacked_bin_counts(base_path, sampling_base, sampling_ids_list, column, labels, edges):
    """Shared implementation of the per-column binned count functions."""
    region_counts = {label: [] for label in labels}
    x_axis = []
    for sampling_name, df_sampled in _iter_sampled_frames(base_path, sampling_base, sampling_ids_list):
        x_axis.append(sampling_name)
        for label, count in zip(labels, _binned_counts(df_sampled[column], edges)):
            region_counts[label].append(count)
    return region_counts, x_axis


def plot_stacked_region(base_path, sampling_base, sampling_ids_list):
    """Count sampled questions per datamap region for each sampling method.

    Despite the name, nothing is drawn here; the function returns the data a
    stacked bar chart needs.

    Args:
        base_path: Prefix (including trailing separator) of the directory that
            contains ``datamap_metrics.pkl``; the frame must provide the
            ``question_id``, ``confidence`` and ``variability`` columns.
        sampling_base: Directory the entries of `sampling_ids_list` are
            resolved against.
        sampling_ids_list: '/'-separated paths of pickled question-id
            collections, one per sampling method.

    Returns:
        ``(region_counts, x_axis)``: `region_counts` maps ``"easy"``,
        ``"ambiguous"`` and ``"hard"`` to one count per sampling method, and
        `x_axis` lists the method labels in the same order. The regions are
        threshold heuristics and are neither exhaustive nor mutually exclusive
        by construction.
    """
    region_counts = {
        "easy": [],
        "ambiguous": [],
        "hard": [],
    }
    x_axis = []

    for sampling_name, df_sampled in _iter_sampled_frames(base_path, sampling_base, sampling_ids_list):
        x_axis.append(sampling_name)

        confidence = df_sampled['confidence']
        variability = df_sampled['variability']

        region_counts['easy'].append(int(((confidence > 0.6) & (variability < 0.15)).sum()))
        region_counts['hard'].append(int(((confidence < 0.4) & (variability < 0.2)).sum()))
        region_counts['ambiguous'].append(int((variability > 0.3).sum()))

    return region_counts, x_axis


def plot_stacked_variability(base_path, sampling_base, sampling_ids_list):
    """Count sampled questions per variability bin for each sampling method.

    Takes the same arguments as `plot_stacked_region`. Bins are 0.1 wide and
    cover [0, 0.5]; values outside that range are not counted.

    Returns:
        ``(region_counts, x_axis)`` where `region_counts` maps the bin labels
        ``"0-0.1"`` ... ``"0.4-0.5"`` to one count per sampling method.
    """
    labels = ("0-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5")
    edges = (0.0, 0.1, 0.2, 0.3, 0.4, 0.5)
    return _stacked_bin_counts(base_path, sampling_base, sampling_ids_list,
                               'variability', labels, edges)


def plot_stacked_confidence(base_path, sampling_base, sampling_ids_list):
    """Count sampled questions per confidence bin for each sampling method.

    Takes the same arguments as `plot_stacked_region`. Bins are 0.2 wide and
    cover [0, 1]; values outside that range are not counted.

    Returns:
        ``(region_counts, x_axis)`` where `region_counts` maps the bin labels
        ``"0-0.2"`` ... ``"0.8-1.0"`` to one count per sampling method.
    """
    labels = ("0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0")
    edges = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
    return _stacked_bin_counts(base_path, sampling_base, sampling_ids_list,
                               'confidence', labels, edges)




# Directory prefix under which the analysis functions look for "datamap_metrics.pkl".
base_path = '../../../snap/vqa/lxr111_multilabel_full_run_3/'
# Root of the stored samples; presumably the entries of sampling_ids_list are
# relative to this directory -- TODO confirm against the sampling scripts.
sampling_base = '../../../src/dataset_selection/sampling/samples/LXR111/multilabel_full/'

# One pickle of sampled question ids per selection method (seed 965, budget 30),
# generated from the three naming patterns instead of being spelled out:
#   1. beta sampling with a kernel: every kernel x every (alpha, beta) pair
#   2. beta sampling by p-values / variability counts: every (alpha, beta) pair
#   3. max/min confidence, max/min variability and random baselines,
#      first with the "global_" prefix and then without it
sampling_ids_list = (
    [
        'beta/beta_kernel/{}/seed_965/alpha_{}_beta_{}_budget_30.pkl'.format(kernel, alpha, beta)
        for kernel in ('tophat', 'cosine', 'epanechnikov', 'exponential', 'gaussian', 'linear')
        for alpha, beta in ((2, 1), (2, 2), (1, 1), (1, 2))
    ]
    + [
        'beta/{}/seed_965/alpha_{}_beta_{}_budget_30.pkl'.format(scheme, alpha, beta)
        for scheme in ('beta_pvals', 'beta_var_counts')
        for alpha, beta in ((2, 1), (2, 2), (1, 1), (1, 2))
    ]
    + [
        '{}{}/seed_965/budget_30.pkl'.format(scope, criterion)
        for scope in ('global_', '')
        for criterion in ('max_confidence', 'max_variability',
                          'min_confidence', 'min_variability', 'random')
    ]
)




: 

In [None]:
# Per-method counts of sampled questions in each datamap region, plus the bar labels.
region_counts_datamap, x_axis_datamap = plot_stacked_region(base_path, sampling_base, sampling_ids_list)

# Bar width as a fraction of the slot each category gets on the x axis.
bar_width = 0.5

# Widen the figure with the number of methods so the bars stay readable.
fig, ax = plt.subplots(figsize=(max(8, 0.4 * len(x_axis_datamap)), 6))

# Running height of each stack: one entry per bar (i.e. per sampling method),
# not one per region.
bottom = np.zeros(len(x_axis_datamap))

for region, counts in region_counts_datamap.items():
    ax.bar(x_axis_datamap, counts, bar_width, label=region, bottom=bottom)
    # The next region is drawn on top of everything stacked so far.
    bottom += np.asarray(counts)

ax.set_title("Datamap Region Frequency Across Selection Methods")
ax.set_xlabel("Selection method")
ax.set_ylabel("Number of sampled questions")
# Method labels are long; vertical text keeps them from overlapping.
ax.tick_params(axis='x', labelrotation=90)
ax.legend(loc="upper right")

plt.show()

In [None]:
# TODO: For random sampling and beta sampling, get the top question types (as done in test.ipynb) and plot their binned confidence scores and binned variability scores.


# TODO: Also calculate skew metrics comparing the class distributions of random sampling and beta sampling.