In [1]:
import numpy as np
import pandas
import pandas as pd
import matplotlib.pyplot as plt
import torch
import random
from tqdm.notebook import trange, tqdm
from time import sleep


from Helper.ImportDatasetsFOMLAS import df_epsilon, df_epsilon_crit
from Helper.ImportDatasetsFairness import df_epsilon as df_epsilon_2
from Helper.ImportDatasetsFairness import df_epsilon_crit as df_epsilon_crit_2

# drop mnist-net, mnist-net_256x4 and mnist_relu_4_1024 from first df's
df_epsilon = df_epsilon[df_epsilon['network'] != 'mnist-net']
df_epsilon = df_epsilon[df_epsilon['network'] != 'mnist-net_256x4']
df_epsilon = df_epsilon[df_epsilon['network'] != 'mnist_relu_4_1024']

df_epsilon_crit = df_epsilon_crit[df_epsilon_crit['network'] != 'mnist-net']
df_epsilon_crit = df_epsilon_crit[df_epsilon_crit['network'] != 'mnist-net_256x4']
df_epsilon_crit = df_epsilon_crit[df_epsilon_crit['network'] != 'mnist_relu_4_1024']

# concat
df_epsilon = pd.concat([df_epsilon_2, df_epsilon])
df_epsilon_crit = pd.concat([df_epsilon_crit_2, df_epsilon_crit])
plt.ioff()

<contextlib.ExitStack at 0x12dff52c390>

In [4]:
# Importing the data
distributions_df_1 = pd.read_pickle('Datasets/distributions_fairness.pkl')
distributions_df_2 = pd.read_pickle('Datasets/distributions_fomlas.pkl')

distributions_df = pd.concat([distributions_df_1, distributions_df_2])

In [5]:
from scipy.special import comb

def binomial(n, p, x):
    '''
    :param n: number of trials
    :param p: probability of success, value of a (quantile)
    :param x: number of successes

    :return: probability of x successes
    '''

    return comb(n, x) * (p ** x) * ((1 - p) ** (n - x))

def binomial_bounds(n, p, alpha):
    '''
    :param n: number of trials
    :param p: probability of success, value of a (quantile)
    :param alpha: confidence interval

    :return: lower and upper bound of confidence interval
    '''
    probs = np.arange(0, n + 1)
    probs = binomial(n, p, probs)

    # take sum of probabilities until we reach alpha/2
    cumulated_probs = np.cumsum(probs)
    lower_index = np.where(cumulated_probs <= alpha / 2)[0][-1]+1
    upper_index = np.where(cumulated_probs >= 1 - alpha / 2)[0][0]+1

    return lower_index, upper_index

def get_quantile(network, sigma):
    '''
    :param network: name of network
    :param sigma: quantile

    :return: confidence interval for sigma quantile
    '''

    # Take all critical epsilons of the test set and put into numpy array
    df_for_network = df_epsilon_crit[df_epsilon_crit['network'] == network]
    df_for_network = df_for_network[df_for_network['ds'] == 'test']
    crit_epsilons = df_for_network['Epsilon'].to_numpy()
    # remove nans
    df_for_network = df_for_network.dropna()
    
    n = len(crit_epsilons)

    # We sort the critical epsilons
    order_statistics = np.sort(crit_epsilons)
    # We use the order statistics to estimate the sigma quantile
    index = int(n * sigma) + 1  # As given by David et al. 1986
    lower_index, upper_index = binomial_bounds(n, sigma, 0.05)
    return order_statistics[index], order_statistics[lower_index], order_statistics[upper_index]

In [7]:
# make 95% confidence intervals from the distributions
conf_intervals = np.tile(0., (len(networks), 100, 2))

for net,network in enumerate(networks):
    for run in range(100):
        final_bins = distributions_df[distributions_df['network'] == network]['bins'].iloc[run]
        final_distribution = distributions_df[distributions_df['network'] == network]['distribution'].iloc[run]
        cumsum = np.cumsum(final_distribution)
        lower_index = np.where(cumsum <= 0.025)[0][-1]
        upper_index = np.where(cumsum >= 0.975)[0][0]
        
        bin_size = final_bins[1] - final_bins[0]
        
        conf_intervals[net, run, 0] = final_bins[lower_index]
        conf_intervals[net, run, 1] = final_bins[upper_index] + bin_size

In [28]:
# Get the quantiles for the networks
networks = distributions_df['network'].unique()

in_confidence_interval = np.tile(0, (len(networks), 100))
overlap_prob = np.tile(0., (len(networks), 100))
overlap_size = np.tile(0., (len(networks), 100))

for net,network in enumerate(networks):
    for run in range(100):
        final_bins = distributions_df[distributions_df['network'] == network]['bins'].iloc[run]
        final_distribution = distributions_df[distributions_df['network'] == network]['distribution'].iloc[run]
        quantile, lower, upper = get_quantile(network, 0.05)
        upper = upper + 0.002
        
        lower_bound_area = conf_intervals[net, run, 0]
        upper_bound_area = conf_intervals[net, run, 1]
        
        if lower_bound_area < upper <= upper_bound_area:
            metric = 1
        elif lower_bound_area <= lower < upper_bound_area:
            metric = 1
        elif lower <= lower_bound_area and upper >= upper_bound_area:
            metric = 1
        else:
            metric = 0
        
        in_confidence_interval[net, run] = metric
        
        # Metric 2, probability given to the area
        if metric == 1:
            lower_bound_index = torch.where(final_bins >= lower)[0]
            lower_bound_index = lower_bound_index[0]
    
            upper_bound_index = torch.where(final_bins + bin_size <= upper)[
                0]  # we don't include bins who's right side is larger than the quantile
            upper_bound_index = upper_bound_index[-1]
    
            metric_2 = torch.sum(final_distribution[lower_bound_index:upper_bound_index + 1]).item()
    
        else:
            metric_2 = 0
        
        overlap_prob[net, run] = metric_2
        # Metric 3, Area of overlap
        if metric == 1:
            metric_3 = final_bins[upper_bound_index] - final_bins[lower_bound_index] + bin_size

        else:
            metric_3 = 0
        
        size = final_bins[-1] - final_bins[0] + bin_size
        overlap_size[net, run] = metric_3/size
            

In [30]:
overlap_size

array([[0.52189523, 0.42554799, 0.48668227, ..., 0.42915717, 0.50175625,
        0.42919394],
       [0.        , 0.41917801, 0.28487223, ..., 0.28350392, 0.        ,
        0.25827324],
       [0.34561378, 0.48164874, 0.22800662, ..., 0.2279824 , 0.32374328,
        0.33041891],
       ...,
       [0.        , 0.90045929, 0.78100729, ..., 0.83494377, 0.66973722,
        0.5894286 ],
       [0.79032296, 0.56099623, 0.54119807, ..., 0.86089855, 0.62486941,
        0.42090073],
       [0.64175224, 0.89527428, 0.82082838, ..., 0.81492001, 0.7960645 ,
        0.78606957]])

In [36]:
total = in_confidence_interval.sum(axis=1)
average_prob = overlap_prob.mean(axis=1)
average_size = overlap_size.mean(axis=1)

# make latex table

for net,network in enumerate(networks):
    name = network.replace('_',' ')
    print(f'{name} & {total[net]} & {average_prob[net]:.2f} & {average_size[net]:.2f} \\\\')

AxisError: axis 1 is out of bounds for array of dimension 1