In [32]:
import scipy
from scipy import stats
import pandas as pd
import pickle 
import multiprocessing
import numpy as np
from sklearn.neighbors import KernelDensity


# Load pickle files 

In [19]:
# Load the two pickled score collections consumed by the sampling cells below.
# NOTE(review): presumably both hold one score per training example, aligned by
# index — confirm against the code that produced these files.
# pickle.load can execute arbitrary code while unpickling; only open files that
# come from a trusted source.
with open('variability_vqa_multilabel.pkl', 'rb') as f:
    variability_scores = pickle.load(f)

with open('confidence_vqa_multilabel.pkl', 'rb') as f:
    confidence_scores = pickle.load(f)

In [20]:
# Sanity check: both score collections must have the same number of entries.
# The message reports both lengths so a mismatch is diagnosable at a glance.
assert len(variability_scores) == len(confidence_scores), (
    f"score length mismatch: {len(variability_scores)} variability scores vs "
    f"{len(confidence_scores)} confidence scores"
)

# Beta Sampling

In [23]:
# Multi-process wrapper around score_samples, which is slow when run serially.

def parallel_score_samples(kde, samples, thread_count=max(1, int(0.875 * multiprocessing.cpu_count()))):
    """Evaluate ``kde.score_samples`` on ``samples`` using worker processes.

    The rows of ``samples`` are split into contiguous chunks, each chunk is
    scored in a separate process, and the per-chunk results are concatenated
    back in the original row order.

    Parameters
    ----------
    kde : object
        Any picklable object exposing ``score_samples(array)``, e.g. a fitted
        ``sklearn.neighbors.KernelDensity``.
    samples : array-like
        Rows to score; split along the first axis.
    thread_count : int, optional
        Requested number of worker *processes* (the name is historical).
        Defaults to 87.5% of the CPU count, but never less than one.

    Returns
    -------
    numpy.ndarray
        Concatenation of the ``score_samples`` outputs for every chunk.
    """
    # Clamp the pool size: Pool(0) raises ValueError (which the old default
    # produced on single-core machines), and asking for more workers than rows
    # would make array_split emit empty chunks that score_samples cannot handle.
    worker_count = max(1, min(thread_count, len(samples)))

    if worker_count == 1:
        # A one-process pool only adds pickling/fork overhead; score in-process.
        return np.asarray(kde.score_samples(samples))

    with multiprocessing.Pool(worker_count) as p:
        return np.concatenate(p.map(kde.score_samples, np.array_split(samples, worker_count)))

In [28]:
# --- Sampling hyper-parameters -------------------------------------------
alpha, beta = 1, 1        # Beta shape parameters; (1, 1) gives a flat pdf on [0, 1]
norm = 'gaussian'         # 'pvals' or a KernelDensity kernel name
training_budget = 30      # percentage of the examples to keep
bandwidth = 0.01          # bandwidth passed to KernelDensity

# Number of examples to draw: training_budget percent of all scored examples.
budget_fraction = training_budget * 0.01
num_total_samples = round(len(variability_scores) * budget_fraction)

# Unnormalised weight of each example: the Beta pdf at its variability score.
beta_distribution = stats.beta(alpha, beta)
p_vals = beta_distribution.pdf(variability_scores)



In [34]:
# Turn the Beta-pdf weights into a probability vector over examples and draw
# the training subset from it.
#
# The weights are recomputed from the Beta pdf here (instead of being divided
# in place) so that re-running this cell does not compound the normalisation.
p_vals = beta_distribution.pdf(variability_scores)

if norm == 'pvals':
    # Sample proportionally to the Beta pdf itself.
    p_vals = p_vals / p_vals.sum()
    # NOTE(review): `plt` is not imported in the visible import cell; this
    # branch needs `import matplotlib.pyplot as plt` to run.
    plt.plot(variability_scores, p_vals, label='pdf')
elif norm in ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']:
    # Divide the Beta pdf by a kernel density estimate of the variability
    # scores, then renormalise so the weights sum to one.
    vars_kde = np.array(variability_scores).reshape(-1, 1)
    kde = KernelDensity(bandwidth=bandwidth, kernel=norm, atol=0.0005, rtol=0.01)
    kde.fit(vars_kde)
    # score_samples returns the log of the probability density
    logprob = parallel_score_samples(kde, vars_kde)
    p_vals = p_vals / np.exp(logprob)
    p_vals = p_vals / p_vals.sum()
else:
    # Previously this only printed and then fell through to sampling with
    # unnormalised probabilities, which fails with an unrelated error.
    raise ValueError(f'Norm not implemented: {norm!r}')

# Fail early with a clear message if the weights degenerated (e.g. an infinite
# Beta pdf value at a boundary score, or a zero total weight giving NaNs).
if not np.all(np.isfinite(p_vals)):
    raise ValueError('Sampling probabilities contain NaN or infinite values')

# Seeded generator so the selected subset is reproducible across kernel
# restarts; change `selection_seed` to draw a different subset.
selection_seed = 0
rng = np.random.default_rng(selection_seed)
selected_samples = rng.choice(len(variability_scores), size=num_total_samples, replace=False, p=p_vals)