### s1_cookies_vs_profile


In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Read the s1 dataset from the current working directory
path = os.getcwd() + "/datasets-stats/s1_cookies_vs_profile.csv"
df = pd.read_csv(path)

# Distinct aliases and every unordered pair of them
aliases = df['alias'].unique()
pairs = list(itertools.combinations(aliases, 2))

# Compare the 'ct' values of the two aliases in each pair
for pair in pairs:
    first_alias, second_alias = pair
    group1 = df.loc[df['alias'] == first_alias, 'ct'].to_numpy()
    group2 = df.loc[df['alias'] == second_alias, 'ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test(
        (group1, group2),
        test_statistic,
        vectorized=False,
        alternative='two-sided',
        permutation_type='independent',
    )

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


### s2_1s-3rd_vs_profile

##### 1st-party

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

path = os.getcwd() + "/datasets-stats/s2_1s-3rd_vs_profile.csv"

# Load the dataset and discard rows without an 'is_third_party' value
df = pd.read_csv(path).dropna(subset=['is_third_party'])

# Group key "<alias>_<is_third_party>" for every row
alias_part = df['alias'].astype(str)
party_part = df['is_third_party'].astype(str)
df['alias_third_party'] = alias_part + '_' + party_part

# Distinct group keys and every unordered pair of them
aliases_third_party = df['alias_third_party'].unique()
pairs = list(itertools.combinations(aliases_third_party, 2))


def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Test statistic for the permutation test (difference of means).
# Defined in this cell so the loop below does not raise a NameError when the
# cell is run without a previously executed cell that defines it.
def test_statistic(a, b):
    """Return ``mean(a) - mean(b)`` for the two samples ``a`` and ``b``."""
    return np.mean(a) - np.mean(b)


# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias_third_party'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias_third_party'] == pair[1]]['ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test((group1, group2),
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')

    # Calculate bootstrap confidence intervals (alpha = 0.05) for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


### s3_cat_cookies_vs_profile

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test

# 'os' is used for the path below but is not among this cell's imports;
# import it here so the cell also runs on its own.
import os

# Load the dataset
path = os.getcwd() + "/datasets-stats/s3_cat_cookies_vs_profile.csv"

df = pd.read_csv(path)

# Get unique categories
categories = df['category'].unique()


def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Prepare function for permutation test
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Pairwise comparisons are made separately inside every category
for category in categories:
    in_category = df[df['category'] == category]

    # Aliases occurring in this category and all unordered pairs of them
    aliases_in_category = in_category['alias'].unique()
    pairs = list(itertools.combinations(aliases_in_category, 2))

    for pair in pairs:
        # 'ct' values of both aliases, restricted to the current category
        group1, group2 = (
            in_category.loc[in_category['alias'] == alias, 'ct'].to_numpy()
            for alias in pair
        )

        # Two-sided permutation test on the difference of means
        result = permutation_test(
            (group1, group2),
            test_statistic,
            vectorized=False,
            alternative='two-sided',
            permutation_type='independent',
        )

        # Bootstrap confidence interval (alpha = 0.05) for each group's mean
        ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
        ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

        print(f'Category: {category}, Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')
 


In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
import os
from scipy.stats import permutation_test

# Read the s3 dataset from the current working directory
working_dir = os.getcwd()
path = working_dir + "/datasets-stats/s3_cat_cookies_vs_profile.csv"
df = pd.read_csv(path)

# Distinct values of the 'category' column, in order of first appearance
categories = pd.unique(df['category'])

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

def test_statistic(a, b):
    """Statistic for permutation test."""
    return np.mean(a) - np.mean(b)

def cohen_d(x, y):
    """Calculate Cohen's d (pooled-standard-deviation effect size).

    Args:
        x: 1-D array-like sample of the first group.
        y: 1-D array-like sample of the second group.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` where the pooled variance is the
        sum of both groups' squared deviations divided by ``nx + ny - 2``.
        ``nan`` is returned when a group is empty or ``nx + ny - 2 <= 0``.
        With zero pooled variance the result is ``nan`` (equal means) or a
        signed infinity (different means).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    nx, ny = len(x), len(y)
    dof = nx + ny - 2
    if nx == 0 or ny == 0 or dof <= 0:
        return np.nan

    mean_x, mean_y = x.mean(), y.mean()
    # Pool the sums of squared deviations directly. A single-element group then
    # contributes 0, whereas (n - 1) * std(ddof=1) ** 2 evaluates to 0 * nan.
    pooled_ss = np.sum((x - mean_x) ** 2) + np.sum((y - mean_y) ** 2)
    pooled_sd = np.sqrt(pooled_ss / dof)

    # Zero pooled variance yields nan/inf by IEEE rules; silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return (mean_x - mean_y) / pooled_sd

# Pairwise comparisons are made separately inside every category
for category in categories:
    in_category = df[df['category'] == category]

    # Aliases occurring in this category and all unordered pairs of them
    aliases_in_category = in_category['alias'].unique()
    pairs = list(itertools.combinations(aliases_in_category, 2))

    for pair in pairs:
        # 'ct' values of both aliases, restricted to the current category
        left_alias, right_alias = pair
        group1 = in_category.loc[in_category['alias'] == left_alias, 'ct'].to_numpy()
        group2 = in_category.loc[in_category['alias'] == right_alias, 'ct'].to_numpy()

        # Two-sided permutation test on the difference of means
        result = permutation_test(
            (group1, group2),
            test_statistic,
            vectorized=False,
            alternative='two-sided',
            permutation_type='independent',
        )

        # Bootstrap confidence interval (alpha = 0.05) for each group's mean
        ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
        ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

        # Effect size of the difference between the two groups
        d = cohen_d(group1, group2)

        print(f'Category: {category}, Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}, Cohen\'s d: {d}')


### s4_localstorage_vs_profile

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

# Read the s4 dataset from the current working directory
working_dir = os.getcwd()
path = working_dir + "/datasets-stats/s4_localstorage_vs_profile.csv"
df = pd.read_csv(path)

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


# Distinct aliases and every unordered pair of them
aliases = df['alias'].unique()
pairs = list(itertools.combinations(aliases, 2))

# Compare the 'ct' values of the two aliases in each pair
for pair in pairs:
    group1 = df['ct'][df['alias'].eq(pair[0])].to_numpy()
    group2 = df['ct'][df['alias'].eq(pair[1])].to_numpy()

    # Two-sided permutation test on the difference of means
    test_options = dict(vectorized=False,
                        alternative='two-sided',
                        permutation_type='independent')
    result = stats.permutation_test((group1, group2), test_statistic, **test_options)

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


### s5_tracking_vs_profile

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

# Read the s5 dataset from the current working directory
working_dir = os.getcwd()
path = working_dir + "/datasets-stats/s5_tracking_vs_profile.csv"
df = pd.read_csv(path)

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


# Distinct aliases and every unordered pair of them
aliases = df['alias'].unique()
pairs = list(itertools.combinations(aliases, 2))

# Compare the 'ct' values of the two aliases in each pair
for pair in pairs:
    mask_first = df['alias'] == pair[0]
    mask_second = df['alias'] == pair[1]
    group1 = df.loc[mask_first, 'ct'].to_numpy()
    group2 = df.loc[mask_second, 'ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test(
        (group1, group2),
        test_statistic,
        vectorized=False,
        alternative='two-sided',
        permutation_type='independent',
    )

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


### s6_first_third_rejectall

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import itertools
from scipy.stats import permutation_test

# 'os' is used for the path below but is not among this cell's imports;
# import it here so the cell also runs on its own.
import os

# Load the s2 dataset and discard rows without an 'is_third_party' value
path = os.getcwd() + "/datasets-stats/s2_1s-3rd_vs_profile.csv"

df = pd.read_csv(path)
df = df.dropna(subset=['is_third_party'])

# Keep only aliases 2 and 4. copy() makes the filtered frame independent so the
# column assignment below cannot trigger pandas' SettingWithCopyWarning.
df = df[df['alias'].isin([2, 4])].copy()

# Create a new column combining 'alias' and 'is_third_party'
df['alias_third_party'] = df['alias'].astype(str) + '_' + df['is_third_party'].astype(str)

# Get unique alias_third_party
aliases_third_party = df['alias_third_party'].unique()

# Generate all pairs of alias_third_party
pairs = list(itertools.combinations(aliases_third_party, 2))

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Compare the 'ct' values of the two group keys in each pair
for pair in pairs:
    key_first, key_second = pair
    group1 = df.loc[df['alias_third_party'] == key_first, 'ct'].to_numpy()
    group2 = df.loc[df['alias_third_party'] == key_second, 'ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test(
        (group1, group2),
        test_statistic,
        vectorized=False,
        alternative='two-sided',
        permutation_type='independent',
    )

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


# Reminder printed after the loop about which pairs to read
print("Only check 2_1.0, 4_1.0 and 2_0.0, 4_0.0")

### s7_localstorage_vs_profile

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Read the dataset and keep only the aliases "#3", "#7" and "#8".
# NOTE(review): the section heading is s7_localstorage_vs_profile but the file
# loaded here is s1_cookies_vs_profile.csv - confirm this is the intended input.
path = os.getcwd() + "/datasets-stats/s1_cookies_vs_profile.csv"
df = pd.read_csv(path)
df = df[df['alias'].isin(["#3", "#7","#8"])]

# Distinct aliases that remain and every unordered pair of them
aliases = df['alias'].unique()
pairs = list(itertools.combinations(aliases, 2))

# Compare the 'ct' values of the two aliases in each pair
for pair in pairs:
    first_alias, second_alias = pair
    group1 = df.loc[df['alias'] == first_alias, 'ct'].to_numpy()
    group2 = df.loc[df['alias'] == second_alias, 'ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test(
        (group1, group2),
        test_statistic,
        vectorized=False,
        alternative='two-sided',
        permutation_type='independent',
    )

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


## s8_cat_ls_vs_profile

In [None]:
import pandas as pd
from scipy.stats import f_oneway
import os

path = os.getcwd() + "/datasets-stats/s8_cat_ls_vs_profile.csv"

df_main = pd.read_csv(path)


def _anova_across_aliases(frame, category):
    """Run a one-way ANOVA on 'ct' between the aliases of one category.

    Rows of ``frame`` whose 'category' equals ``category`` are split by
    'alias' (in order of first appearance) and the resulting 'ct' samples are
    passed to ``scipy.stats.f_oneway``; its result object is returned.
    """
    subset = frame[frame['category'] == category]
    samples = [subset.loc[subset['alias'] == alias, 'ct']
               for alias in pd.unique(subset['alias'])]
    return f_oneway(*samples)


# One ANOVA per category; the individual result names are kept so anything
# that refers to them after this cell keeps working.
cat_unknown = _anova_across_aliases(df_main, 'Unknown')
cat_targeting = _anova_across_aliases(df_main, 'Targeting/Advertising')
cat_perf = _anova_across_aliases(df_main, 'Performance')
cat_strict = _anova_across_aliases(df_main, 'Strictly Necessary')
cat_func = _anova_across_aliases(df_main, 'Functionality')

# Results keyed by category name
results = {
    'Unknown': cat_unknown,
    'Targeting/Advertising': cat_targeting,
    'Performance': cat_perf,
    'Strictly Necessary': cat_strict,
    'Functionality': cat_func,
}

results

### s9_tracking_vs_profile

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha, n_resamples=10000):
    """Compute a percentile-bootstrap confidence interval for a statistic.

    Args:
        data: 1-D array-like sample; it is resampled with replacement at its
            original size.
        statistic: Callable mapping a 1-D array to a scalar (e.g. ``np.mean``).
        alpha: Significance level in [0, 1]; the returned bounds are the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles of the replicates.
        n_resamples: Number of bootstrap replicates (default 10000, the value
            that used to be hard-coded).

    Returns:
        ``numpy.ndarray`` ``[lower, upper]``; ``[nan, nan]`` for empty data.

    Raises:
        ValueError: If ``alpha`` is outside [0, 1] or ``n_resamples`` < 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie in [0, 1]")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of
        # evaluating the statistic on empty arrays n_resamples times.
        return np.full(2, np.nan)

    # Resampling uses the global NumPy RNG, so np.random.seed() controls it.
    bs_replicates = np.fromiter(
        (statistic(np.random.choice(data, n_obs)) for _ in range(n_resamples)),
        dtype=float,
        count=n_resamples,
    )
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Read the s9 dataset from the current working directory
path = os.getcwd() + "/datasets-stats/s9_tracking_vs_profile.csv"
df = pd.read_csv(path)

# Distinct aliases and every unordered pair of them
aliases = df['alias'].unique()
pairs = list(itertools.combinations(aliases, 2))

# Compare the 'ct' values of the two aliases in each pair
for pair in pairs:
    rows_first = df[df['alias'] == pair[0]]
    rows_second = df[df['alias'] == pair[1]]
    group1 = rows_first['ct'].to_numpy()
    group2 = rows_second['ct'].to_numpy()

    # Two-sided permutation test on the difference of means
    result = stats.permutation_test(
        (group1, group2),
        test_statistic,
        vectorized=False,
        alternative='two-sided',
        permutation_type='independent',
    )

    # Bootstrap confidence interval (alpha = 0.05) for each group's mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)

    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


## s10 multiple runs

In [None]:
import pandas as pd
from skbio.stats.distance import permanova
from skbio import DistanceMatrix
from sklearn.metrics.pairwise import euclidean_distances
import os 

# Read the s10 dataset from the current working directory
path = os.getcwd() + "/datasets-stats/s10_multiple_runs_scope.zip"
df = pd.read_csv(path)

# One row per (measurement_id, browser_id), one column per category;
# each cell holds the summed 'ct', with 0 where a combination is absent
pivot_table = df.pivot_table(
    values='ct',
    index=['measurement_id', 'browser_id'],
    columns=['category'],
    aggfunc='sum',
    fill_value=0,
)

# Pairwise Euclidean distances between the pivot rows, wrapped for scikit-bio
pairwise_distances = euclidean_distances(pivot_table.values)
distance_matrix = DistanceMatrix(pairwise_distances)

# Group label of every pivot row: its measurement_id
grouping = pivot_table.index.get_level_values('measurement_id')

# PERMANOVA over the measurement_id groups with 9999 permutations
result = permanova(distance_matrix, grouping, permutations=9999)

print(result)


### s11: o matic all configurations

In [None]:
import pandas as pd
import os
import scipy.stats as stats
import numpy as np
import scikit_posthocs as sp

# Load the data
df = pd.read_csv(os.getcwd() + '/datasets-plots/p11_stats_omatic_all_settings.zip')

# Initialize a dictionary to store results
results = {}

# Perform Kruskal-Wallis test for each category, and if significant, perform Dunn's test
for category in df['category'].unique():
    category_df = df[df['category'] == category]

    # Only browsers that actually have rows in this category take part. An
    # empty sample makes kruskal return NaN, which would silently skip the
    # category; posthoc_dunn below likewise only sees the groups present.
    data = {browser: values.to_numpy()
            for browser, values in category_df.groupby('browser_id')['ct']}
    if len(data) < 2:
        continue  # nothing to compare within this category

    h_statistic, p_value = stats.kruskal(*data.values())
    if p_value < 0.05:
        posthoc = sp.posthoc_dunn(category_df, val_col='ct', group_col='browser_id', p_adjust = 'holm')
        results[category] = posthoc


def _mark_p_value(p):
    """Format a p-value with two decimals plus ✓ (p < 0.05) or ✖ (otherwise)."""
    return f"{p:.2f} {'✓' if p < 0.05 else '✖'}"


# Print results and mark the significant differences with "✓" and non-significant with "✖".
# The mark is decided on the unrounded p-value, so 0.049 is shown as "0.05 ✓".
for category, result in results.items():
    print("Category:", category)
    result = result.apply(lambda column: column.map(_mark_p_value))
    print(result)
    print('\n')


In [None]:
## Kruskal-Wallis test plus effect size (η²) for each category/browser combination across measurement runs

import pandas as pd
from scipy.stats import kruskal
import os

path = os.getcwd() + "/datasets-stats/s10_multiple_runs_scope.zip"
df = pd.read_csv(path)

# Category names: every space becomes an underscore
df['category'] = df['category'].str.replace(' ', '_')

# Combined key "<category>_<browser_id>"
df['browser_category'] = df['category'] + "_" + df['browser_id']

# Distinct combined keys and distinct measurement runs
browser_categories = df['browser_category'].unique()
measurement_ids = df['measurement_id'].unique()

results = []

# For every combined key, compare its 'ct' values across the measurement runs
for browser_category in browser_categories:
    key_rows = df[df['browser_category'] == browser_category]
    data = [key_rows.loc[key_rows['measurement_id'] == measurement_id, 'ct']
            for measurement_id in measurement_ids]
    H, pval = kruskal(*data)

    # Effect size from H and the total number of observations.
    # NOTE(review): H / (n - 1) is called epsilon-squared in some references;
    # confirm it is the effect-size definition intended here.
    n = sum(len(sample) for sample in data)
    eta_squared = H / (n - 1)

    results.append((browser_category, H, round(pval,2), eta_squared))

# Collect the per-key results in a DataFrame
result_columns = ['browser_category', 'H-statistic', 'p-value', 'eta-squared']
results_df = pd.DataFrame(results, columns=result_columns)

print(results_df)


In [None]:
# Summary statistics (mean, std, min, max) of the eta-squared column in results_df
import numpy as np

# Print one "<label>: <value>" line per summary statistic of the effect sizes
eta_squared_values = results_df["eta-squared"]
for label, reducer in (("mean", np.mean), ("std", np.std), ("min", np.min), ("max", np.max)):
    print(label + ": " + str(reducer(eta_squared_values)))