### s1_cookies_vs_profile


In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Load the data into a DataFrame
path = os.getcwd() + "/datasets-stats/s1_cookies_vs_profile.csv"
df = pd.read_csv(path)

# Get unique alias
aliases = df['alias'].unique()

# Generate all pairs of aliases
pairs = list(itertools.combinations(aliases, 2))

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: ('#1', '#2'), P-value: 0.0004, CI Group1: [31.88521183 33.05816523], CI Group2: [30.14130065 31.25527888]
Pair: ('#1', '#3'), P-value: 0.0002, CI Group1: [31.88543871 33.07015799], CI Group2: [35.25710117 36.54462627]
Pair: ('#1', '#4'), P-value: 0.0002, CI Group1: [31.88088941 33.06460949], CI Group2: [29.95791511 31.05916741]
Pair: ('#1', '#5'), P-value: 0.0002, CI Group1: [31.88898176 33.06381718], CI Group2: [15.15284171 15.74863438]
Pair: ('#1', '#6'), P-value: 0.0002, CI Group1: [31.8750482  33.05991442], CI Group2: [35.21164357 36.57936763]
Pair: ('#1', '#7'), P-value: 0.0002, CI Group1: [31.87170969 33.06886843], CI Group2: [37.92008085 39.44775138]
Pair: ('#1', '#8'), P-value: 0.0002, CI Group1: [31.87322848 33.06687826], CI Group2: [30.16748295 31.26853093]
Pair: ('#1', '#9'), P-value: 0.0002, CI Group1: [31.86443669 33.07373748], CI Group2: [64.37913237 67.21028414]
Pair: ('#2', '#3'), P-value: 0.0002, CI Group1: [30.13666407 31.25408756], CI Group2: [35.25841426 36.57

### s2_1s-3rd_vs_profile

##### 1st-party

In [19]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

path = os.getcwd() + "/datasets-stats/s2_1s-3rd_vs_profile.csv"

df = pd.read_csv(path)

df = df.dropna(subset=['is_third_party'])

# Create a new column combining 'alias' and 'is_third_party'
df['alias_third_party'] = df['alias'].astype(str) + '_' + df['is_third_party'].astype(str)

# Get unique alias_third_party
aliases_third_party = df['alias_third_party'].unique()

# Generate all pairs of alias_third_party
pairs = list(itertools.combinations(aliases_third_party, 2))


def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias_third_party'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias_third_party'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    #func=test_statistic, 
                                    #method='exact',
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: ('1_1.0', '1_0.0'), P-value: 0.0002, CI Group1: [22.93514277 24.07044874], CI Group2: [14.14035375 14.57784401]
Pair: ('1_1.0', '2_1.0'), P-value: 0.0002, CI Group1: [22.92738136 24.08533719], CI Group2: [21.09909374 22.18805303]
Pair: ('1_1.0', '2_0.0'), P-value: 0.0002, CI Group1: [22.93190849 24.08486745], CI Group2: [13.92219456 14.34218595]
Pair: ('1_1.0', '3_1.0'), P-value: 0.0002, CI Group1: [22.92012943 24.08102293], CI Group2: [25.9213198  27.20516679]
Pair: ('1_1.0', '3_0.0'), P-value: 0.0002, CI Group1: [22.93323967 24.07610837], CI Group2: [14.86319917 15.31242532]
Pair: ('1_1.0', '4_0.0'), P-value: 0.0002, CI Group1: [22.92564146 24.06261779], CI Group2: [13.80192118 14.22799428]
Pair: ('1_1.0', '4_1.0'), P-value: 0.0002, CI Group1: [22.92747502 24.09520606], CI Group2: [21.06423333 22.11794637]
Pair: ('1_1.0', '5_1.0'), P-value: 0.0002, CI Group1: [22.93135502 24.0715273 ], CI Group2: [ 9.57231003 10.08839415]
Pair: ('1_1.0', '5_0.0'), P-value: 0.0002, CI Group1: [2

### s3_cat_cookies_vs_profile

In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test

# Load the dataset
path = os.getcwd() + "/datasets-stats/s3_cat_cookies_vs_profile.csv"

df = pd.read_csv(path) 

# Get unique categories
categories = df['category'].unique()


def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Prepare function for permutation test
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Iterate over each category
for category in categories:
    # Get unique aliases within this category
    aliases_in_category = df[df['category'] == category]['alias'].unique()

    # Generate all pairs of aliases
    pairs = list(itertools.combinations(aliases_in_category, 2))

    # Perform permutation test for each pair
    for pair in pairs:
        group1 = df[(df['alias'] == pair[0]) & (df['category'] == category)]['ct'].to_numpy()
        group2 = df[(df['alias'] == pair[1]) & (df['category'] == category)]['ct'].to_numpy()

        result = permutation_test((group1, group2), 
                                  test_statistic,
                                  vectorized=False,
                                  alternative='two-sided',
                                  permutation_type='independent')

        # Calculate bootstrap confidence intervals for the mean
        ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
        ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
        

        print(f'Category: {category}, Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')
 


Category: Unknown, Pair: ('#1', '#2'), P-value: 0.0002, CI Group1: [4.89436715 5.011774  ], CI Group2: [4.69351749 4.8191838 ]
Category: Unknown, Pair: ('#1', '#3'), P-value: 0.0002, CI Group1: [4.8927108  5.01152485], CI Group2: [5.13864221 5.26160795]
Category: Unknown, Pair: ('#1', '#4'), P-value: 0.0002, CI Group1: [4.89352417 5.01180362], CI Group2: [4.60679215 4.72927428]
Category: Unknown, Pair: ('#1', '#5'), P-value: 0.0002, CI Group1: [4.89214696 5.01121091], CI Group2: [3.06573669 3.14392195]
Category: Unknown, Pair: ('#1', '#6'), P-value: 0.0002, CI Group1: [4.89592428 5.01251407], CI Group2: [5.14928389 5.27999473]
Category: Unknown, Pair: ('#1', '#7'), P-value: 0.0002, CI Group1: [4.89426349 5.01098878], CI Group2: [5.61488346 5.75807269]
Category: Unknown, Pair: ('#1', '#8'), P-value: 0.0002, CI Group1: [4.89371668 5.01253221], CI Group2: [4.54834263 4.66895414]
Category: Unknown, Pair: ('#1', '#9'), P-value: 0.0002, CI Group1: [4.89398472 5.01110132], CI Group2: [8.90209

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
import os
from scipy.stats import permutation_test

# Load the dataset
path = os.getcwd() + "/datasets-stats/s3_cat_cookies_vs_profile.csv"
df = pd.read_csv(path) 

# Get unique categories
categories = df['category'].unique()

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

def test_statistic(a, b):
    """Statistic for permutation test."""
    return np.mean(a) - np.mean(b)

def cohen_d(x, y):
    """Calculate Cohen's d for effect size."""
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(((nx-1)*np.std(x, ddof=1) ** 2 + (ny-1)*np.std(y, ddof=1) ** 2) / dof)

# Iterate over each category
for category in categories:
    # Get unique aliases within this category
    aliases_in_category = df[df['category'] == category]['alias'].unique()

    # Generate all pairs of aliases
    pairs = list(itertools.combinations(aliases_in_category, 2))

    # Perform permutation test for each pair
    for pair in pairs:
        group1 = df[(df['alias'] == pair[0]) & (df['category'] == category)]['ct'].to_numpy()
        group2 = df[(df['alias'] == pair[1]) & (df['category'] == category)]['ct'].to_numpy()

        result = permutation_test((group1, group2), 
                                  test_statistic,
                                  vectorized=False,
                                  alternative='two-sided',
                                  permutation_type='independent')

        # Calculate bootstrap confidence intervals for the mean
        ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
        ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
        
        # Calculate Cohen's d for effect size
        d = cohen_d(group1, group2)

        print(f'Category: {category}, Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}, Cohen\'s d: {d}')


Category: Unknown, Pair: ('#1', '#2'), P-value: 0.0002, CI Group1: [4.89445378 5.01297277], CI Group2: [4.69278195 4.82008563], Cohen's d: 0.024235528191900585
Category: Unknown, Pair: ('#1', '#3'), P-value: 0.0002, CI Group1: [4.89309583 5.01310604], CI Group2: [5.13887827 5.26017813], Cohen's d: -0.030371356687908174
Category: Unknown, Pair: ('#1', '#4'), P-value: 0.0002, CI Group1: [4.89299143 5.01131531], CI Group2: [4.60674756 4.72900409], Cohen's d: 0.03588193715045524
Category: Unknown, Pair: ('#1', '#5'), P-value: 0.0002, CI Group1: [4.89404358 5.01306199], CI Group2: [3.06634841 3.14516324], Cohen's d: 0.2794122484898809
Category: Unknown, Pair: ('#1', '#6'), P-value: 0.0002, CI Group1: [4.89238501 5.01058931], CI Group2: [5.15071244 5.28018101], Cohen's d: -0.030688687142356275
Category: Unknown, Pair: ('#1', '#7'), P-value: 0.0002, CI Group1: [4.89506501 5.01278285], CI Group2: [5.61412298 5.75815673], Cohen's d: -0.08194319696497586
Category: Unknown, Pair: ('#1', '#8'), P-

### s4_localstorage_vs_profile

In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

# Load the data into a DataFrame
path = os.getcwd() + "/datasets-stats/s4_localstorage_vs_profile.csv"
df = pd.read_csv(path)

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


# Get unique alias
aliases = df['alias'].unique()

# Generate all pairs of aliases
pairs = list(itertools.combinations(aliases, 2))

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: (1, 2), P-value: 0.2698, CI Group1: [13.03146222 14.23092769], CI Group2: [12.56046052 13.72401135]
Pair: (1, 3), P-value: 0.0634, CI Group1: [13.02731523 14.22493937], CI Group2: [13.83077537 15.01735272]
Pair: (1, 4), P-value: 0.1624, CI Group1: [13.01889201 14.20642073], CI Group2: [12.44833194 13.59890078]
Pair: (1, 5), P-value: 0.0714, CI Group1: [13.02301187 14.22323047], CI Group2: [12.3019518  13.44683995]
Pair: (1, 6), P-value: 0.0002, CI Group1: [13.0229656  14.19540624], CI Group2: [10.52595943 11.23825   ]
Pair: (1, 7), P-value: 0.0456, CI Group1: [13.01728523 14.23553261], CI Group2: [13.89226086 14.844887  ]
Pair: (1, 8), P-value: 0.0042, CI Group1: [13.02148966 14.22843854], CI Group2: [12.11820261 13.04978267]
Pair: (1, 9), P-value: 0.0002, CI Group1: [13.01582684 14.22006638], CI Group2: [16.41768113 17.65784121]
Pair: (2, 3), P-value: 0.0014, CI Group1: [12.57276279 13.71826764], CI Group2: [13.83971702 15.01716021]
Pair: (2, 4), P-value: 0.7838, CI Group1: [12.

### s5_tracking_vs_profile

In [5]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

# Load the data into a DataFrame
path = os.getcwd() + "/datasets-stats/s5_tracking_vs_profile.csv"
df = pd.read_csv(path)

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


# Get unique alias
aliases = df['alias'].unique()

# Generate all pairs of aliases
pairs = list(itertools.combinations(aliases, 2))

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: (1, 2), P-value: 0.0002, CI Group1: [98.07855821 99.20995079], CI Group2: [93.57843429 94.59432868]
Pair: (1, 3), P-value: 0.0002, CI Group1: [98.0882461  99.22067603], CI Group2: [103.08507869 104.21367739]
Pair: (1, 4), P-value: 0.0002, CI Group1: [98.093106   99.22433404], CI Group2: [94.88100429 95.91054308]
Pair: (1, 5), P-value: 0.0002, CI Group1: [98.08327641 99.21366965], CI Group2: [96.51792276 97.56923934]
Pair: (1, 6), P-value: 0.0002, CI Group1: [98.08941439 99.21222146], CI Group2: [100.83773582 101.9208071 ]
Pair: (1, 7), P-value: 0.0002, CI Group1: [98.09448919 99.21423985], CI Group2: [105.54101899 106.75193262]
Pair: (1, 8), P-value: 0.0002, CI Group1: [98.08505461 99.21570778], CI Group2: [92.63612413 93.70397991]
Pair: (1, 9), P-value: 0.0002, CI Group1: [98.0880957 99.2181461], CI Group2: [115.17785425 116.61186481]
Pair: (2, 3), P-value: 0.0002, CI Group1: [93.58995184 94.58363238], CI Group2: [103.0842562  104.20386602]
Pair: (2, 4), P-value: 0.0006, CI Grou

### s6_first_third_rejectall

In [9]:
import numpy as np
import pandas as pd
from scipy import stats
import itertools
from scipy.stats import permutation_test

# Assuming df is your DataFrame
path = os.getcwd() + "/datasets-stats/s2_1s-3rd_vs_profile.csv"

df=pd.read_csv(path)
df = df.dropna(subset=['is_third_party'])

df = df[df['alias'].isin([2, 4])]

# Create a new column combining 'alias' and 'is_third_party'
df['alias_third_party'] = df['alias'].astype(str) + '_' + df['is_third_party'].astype(str)

# Get unique alias_third_party
aliases_third_party = df['alias_third_party'].unique()

# Generate all pairs of alias_third_party
pairs = list(itertools.combinations(aliases_third_party, 2))

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)


def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias_third_party'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias_third_party'] == pair[1]]['ct'].to_numpy()

    result = stats.permutation_test((group1, group2), test_statistic, 
                                    vectorized=False, alternative='two-sided',
                                    permutation_type='independent')
    

    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')
 

print("Only check 2_1.0, 4_1.0 and 2_0.0, 4_0.0")

Pair: ('2_1.0', '2_0.0'), P-value: 0.0002, CI Group1: [21.09339213 22.1903351 ], CI Group2: [13.92110341 14.33900115]
Pair: ('2_1.0', '4_0.0'), P-value: 0.0002, CI Group1: [21.11400069 22.18120255], CI Group2: [13.80806452 14.22653616]
Pair: ('2_1.0', '4_1.0'), P-value: 0.8946, CI Group1: [21.10609979 22.16704773], CI Group2: [21.06451839 22.13605747]
Pair: ('2_0.0', '4_0.0'), P-value: 0.4386, CI Group1: [13.92198858 14.34364603], CI Group2: [13.79921436 14.22476847]
Pair: ('2_0.0', '4_1.0'), P-value: 0.0002, CI Group1: [13.9152005  14.34573317], CI Group2: [21.06434649 22.10894453]
Pair: ('4_0.0', '4_1.0'), P-value: 0.0002, CI Group1: [13.79546956 14.22955385], CI Group2: [21.06732609 22.12710864]
Only check 2_1.0, 4_1.0 and 2_0.0, 4_0.0


### s7_localstorage_vs_profile

In [8]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Load the data into a DataFrame
path = os.getcwd() + "/datasets-stats/s1_cookies_vs_profile.csv"
df = pd.read_csv(path)
df = df[df['alias'].isin(["#3", "#7","#8"])]


# Get unique alias
aliases = df['alias'].unique()

# Generate all pairs of aliases
pairs = list(itertools.combinations(aliases, 2))

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: ('#3', '#7'), P-value: 0.0002, CI Group1: [35.26030627 36.56885125], CI Group2: [37.92220219 39.44034554]
Pair: ('#3', '#8'), P-value: 0.0002, CI Group1: [35.25093375 36.56601149], CI Group2: [30.17406807 31.26127439]
Pair: ('#7', '#8'), P-value: 0.0002, CI Group1: [37.9114682  39.41743742], CI Group2: [30.16776596 31.26246092]


## s8_cat_ls_vs_profile

In [12]:
import pandas as pd
from scipy.stats import f_oneway
import os

path = os.getcwd() + "/datasets-stats/s8_cat_ls_vs_profile.csv"

df_main = pd.read_csv(path)

#Unknown
df = df_main[df_main['category']=='Unknown']  
uniq = pd.unique(df['alias'])
params=[]
for item in uniq:
    params.append(df[df['alias']==item]['ct'])
cat_unknown=f_oneway(*params)

#Targeting/Advertising
df = df_main[df_main['category']=='Targeting/Advertising']
uniq = pd.unique(df['alias'])
params=[]
for item in uniq:
    params.append(df[df['alias']==item]['ct'])
cat_targeting=f_oneway(*params)

#Performance
df = df_main[df_main['category']=='Performance']
uniq = pd.unique(df['alias'])
params=[]
for item in uniq:
    params.append(df[df['alias']==item]['ct'])
cat_perf=f_oneway(*params)

#Strictly Necessary
df = df_main[df_main['category']=='Strictly Necessary']
uniq = pd.unique(df['alias'])
params=[]
for item in uniq:
    params.append(df[df['alias']==item]['ct'])
cat_strict=f_oneway(*params)
    

#Functionality
df = df_main[df_main['category']=='Functionality']
uniq = pd.unique(df['alias'])
params=[]
for item in uniq:
    params.append(df[df['alias']==item]['ct'])
cat_func=f_oneway(*params)
 


results={}
results['Unknown']=cat_unknown
results['Targeting/Advertising']=cat_targeting
results['Performance']=cat_perf
results['Strictly Necessary']=cat_strict
results['Functionality']=cat_func

results

{'Unknown': F_onewayResult(statistic=31.42085716122143, pvalue=1.046346986816549e-49),
 'Targeting/Advertising': F_onewayResult(statistic=1.0384622078210886, pvalue=0.40404808318790303),
 'Performance': F_onewayResult(statistic=0.27352215880199654, pvalue=0.9746808839711721),
 'Strictly Necessary': F_onewayResult(statistic=0.4772166318223201, pvalue=0.8729543784168162),
 'Functionality': F_onewayResult(statistic=0.21258864895974963, pvalue=0.9888430159660452)}

### s9_tracking_vs_profile

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import itertools
from scipy.stats import permutation_test
import os

def bootstrap_ci(data, statistic, alpha):
    """Compute bootstrap confidence interval for given data and statistic."""
    bs_replicates = np.empty(10000)
    
    for i in range(10000):
        bs_sample = np.random.choice(data, len(data))
        bs_replicates[i] = statistic(bs_sample)
        
    return np.percentile(bs_replicates, [100 * alpha / 2., 100 * (1 - alpha / 2.)])

# Define the function for test statistic (difference of means)
def test_statistic(a, b):
    return np.mean(a) - np.mean(b)

# Load the data into a DataFrame
path = os.getcwd() + "/datasets-stats/s9_tracking_vs_profile.csv"
df = pd.read_csv(path)

# Get unique alias
aliases = df['alias'].unique()

# Generate all pairs of aliases
pairs = list(itertools.combinations(aliases, 2))

# Perform permutation test for each pair
for pair in pairs:
    group1 = df[df['alias'] == pair[0]]['ct'].to_numpy()
    group2 = df[df['alias'] == pair[1]]['ct'].to_numpy()
    
    result = stats.permutation_test((group1, group2), 
                                    test_statistic,
                                    vectorized=False,
                                    alternative='two-sided',
                                    permutation_type='independent')
    
    # Calculate bootstrap confidence intervals for the mean
    ci_group1 = bootstrap_ci(group1, np.mean, 0.05)
    ci_group2 = bootstrap_ci(group2, np.mean, 0.05)
    
    print(f'Pair: {pair}, P-value: {result.pvalue}, CI Group1: {ci_group1}, CI Group2: {ci_group2}')


Pair: ('#1', '#2'), P-value: 0.0002, CI Group1: [15.77923734 16.31666303], CI Group2: [15.25115028 15.49554881]
Pair: ('#1', '#3'), P-value: 0.0002, CI Group1: [15.78392989 16.29813855], CI Group2: [16.61155359 17.03643985]
Pair: ('#1', '#4'), P-value: 0.0002, CI Group1: [15.78527324 16.31055756], CI Group2: [15.29085255 15.69761532]
Pair: ('#1', '#5'), P-value: 0.0002, CI Group1: [15.78355537 16.30816792], CI Group2: [16.68629503 16.95946818]
Pair: ('#1', '#6'), P-value: 0.9542, CI Group1: [15.78160798 16.315259  ], CI Group2: [15.85766051 16.10395206]
Pair: ('#1', '#7'), P-value: 0.0002, CI Group1: [15.78214104 16.3032774 ], CI Group2: [17.11617174 17.39227744]
Pair: ('#1', '#8'), P-value: 0.0078, CI Group1: [15.78182618 16.30686763], CI Group2: [15.28052549 15.90071925]
Pair: ('#1', '#9'), P-value: 0.0002, CI Group1: [15.78678525 16.30757642], CI Group2: [20.31540689 20.65704671]
Pair: ('#2', '#3'), P-value: 0.0002, CI Group1: [15.25222336 15.4945765 ], CI Group2: [16.60898633 17.03

## s10 multiple runs

In [2]:
import pandas as pd
from skbio.stats.distance import permanova
from skbio import DistanceMatrix
from sklearn.metrics.pairwise import euclidean_distances
import os 

# Load the data
path = os.getcwd() + "/datasets-stats/s10_multiple_runs_scope.zip"
df = pd.read_csv(path)


# Create a pivot table with sum of ct for each combination of measurement_id, browser_id, and category
pivot_table = pd.pivot_table(df, values='ct', index=['measurement_id', 'browser_id'], columns=['category'], aggfunc='sum', fill_value=0)

# Calculate Euclidean distance matrix
distance_matrix = euclidean_distances(pivot_table.values)
distance_matrix = DistanceMatrix(distance_matrix)

# Prepare the grouping variable (measurement_id)
grouping = pivot_table.index.get_level_values('measurement_id')

# Perform PERMANOVA
result = permanova(distance_matrix, grouping, permutations=9999)

print(result)


method name               PERMANOVA
test statistic name        pseudo-F
sample size                      27
number of groups                  3
test statistic             0.210757
p-value                      0.8713
number of permutations         9999
Name: PERMANOVA results, dtype: object


### s11: o matic all configurations

In [22]:
import pandas as pd
import os
import scipy.stats as stats
import numpy as np
import scikit_posthocs as sp

# Load the data
df = pd.read_csv(os.getcwd() + '/datasets-plots/p11_stats_omatic_all_settings.zip')

# Initialize a dictionary to store results
results = {}

# Perform Kruskal-Wallis test for each category, and if significant, perform Dunn's test
for category in df['category'].unique():
    data = {browser: df.loc[(df['category'] == category) & (df['browser_id'] == browser), 'ct'].values for browser in df['browser_id'].unique()}
    h_statistic, p_value = stats.kruskal(*data.values())
    if p_value < 0.05:
        posthoc = sp.posthoc_dunn(df[df['category'] == category], val_col='ct', group_col='browser_id', p_adjust = 'holm')
        results[category] = posthoc

# Print results and mark the significant differences with "✓" and non-significant with "✖"
for category, result in results.items():
    print("Category:", category)
    result = result.applymap(lambda x: round(x,2) if x < 0.05 else round(x,2))
    print(result)
    print('\n')


Category: Tracking requests
                 openwpm_nothing  openwpm_omatic  openwpm_omaticA   
openwpm_nothing              1.0            0.00             0.00  \
openwpm_omatic               0.0            1.00             0.03   
openwpm_omaticA              0.0            0.03             1.00   
openwpm_omaticB              0.0            0.00             0.39   
openwpm_omaticC              0.0            1.00             0.12   
openwpm_omaticD              0.0            0.09             1.00   
openwpm_omaticE              0.0            0.00             0.08   
openwpm_omaticF              0.0            1.00             0.26   

                 openwpm_omaticB  openwpm_omaticC  openwpm_omaticD   
openwpm_nothing             0.00             0.00             0.00  \
openwpm_omatic              0.00             1.00             0.09   
openwpm_omaticA             0.39             0.12             1.00   
openwpm_omaticB             1.00             0.00             0.19   


In [1]:
## Here we calculate kruskal +  η2

import pandas as pd
from scipy.stats import kruskal
import os

path = os.getcwd() + "/datasets-stats/s10_multiple_runs_scope.zip"
df = pd.read_csv(path)

# Replace spaces with underscore in the 'category' column
df['category'] = df['category'].str.replace(' ', '_')

# Create a new column 'browser_category'
df['browser_category'] = df['category'] + "_" + df['browser_id']

# Get the unique browser_categories
browser_categories = df['browser_category'].unique()

# Get the unique measurement_ids
measurement_ids = df['measurement_id'].unique()

results = []

# Perform Kruskal-Wallis test for each browser_category across different measurement_id
for browser_category in browser_categories:
    data = [df[(df['browser_category'] == browser_category) & (df['measurement_id'] == measurement_id)]['ct']
            for measurement_id in measurement_ids]
    H, pval = kruskal(*data)
    # calculate eta squared
    n = sum([len(d) for d in data])
    eta_squared = H / (n - 1)
    results.append((browser_category, H, round(pval,2), eta_squared))

# Convert the results into a DataFrame
results_df = pd.DataFrame(results, columns=['browser_category', 'H-statistic', 'p-value', 'eta-squared'])

print(results_df)


            browser_category  H-statistic  p-value  eta-squared
0       Tracking_requests_#1    13.724815     0.00     0.000142
1             Performance_#1    36.286949     0.00     0.000490
2                 Unknown_#1     5.520514     0.06     0.000067
3      Strictly_Necessary_#1    10.048000     0.01     0.000223
4   Targeting/Advertising_#1     1.924505     0.38     0.000035
5           Functionality_#1     6.378924     0.04     0.000208
6       Tracking_requests_#2     4.237398     0.12     0.000043
7                 Unknown_#2     3.499560     0.17     0.000042
8             Performance_#2    27.969458     0.00     0.000393
9   Targeting/Advertising_#2     6.926378     0.03     0.000135
10     Strictly_Necessary_#2     6.777803     0.03     0.000145
11          Functionality_#2     1.598093     0.45     0.000053
12      Tracking_requests_#3    13.862557     0.00     0.000142
13                Unknown_#3    25.808631     0.00     0.000306
14     Strictly_Necessary_#3     9.52935

In [15]:
# effect size (eta-squared)  results_df
import numpy as np

print("mean: " + str(np.mean(results_df["eta-squared"])))
print("std: " + str(np.std(results_df["eta-squared"])))
print("min: " + str(np.min(results_df["eta-squared"])))
print("max: " + str(np.max(results_df["eta-squared"])))

mean: 0.000287652471833338
std: 0.000247746819896906
min: 1.6382290903112786e-05
max: 0.0013106040584470059
