In [3]:
import pandas as pd
from scipy.stats import wilcoxon
# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.options.display.max_rows = 500

# Read the raw metrics table and preview its first rows.
RAW_METRICS_CSV = '../reports/raw_metrics.csv'
df = pd.read_csv(RAW_METRICS_CSV)
df.head()

Unnamed: 0,Dataset,Sensitive Attribute,Mitigation,accuracy,f1_score,SPD,DI,EOD,AOD,Pipeline
0,compas,race,baseline,0.680527,0.606319,-0.196925,0.539225,-0.221744,-0.167383,base
1,compas,race,baseline,0.663895,0.589331,-0.188922,0.565712,-0.249795,-0.169028,base
2,compas,race,baseline,0.661123,0.581694,-0.110155,0.722155,-0.121066,-0.093272,base
3,compas,race,baseline,0.683992,0.608247,-0.181815,0.565107,-0.152574,-0.148753,base
4,compas,race,baseline,0.686071,0.605057,-0.138029,0.647506,-0.168502,-0.109302,base


In [None]:
# 1) Count (Dataset, Sensitive Attribute, Method) combinations, excluding the baseline.
combos = [
    {'Dataset': dataset, 'Attribute': attr, 'Method': method}
    for (dataset, attr), group in df.groupby(['Dataset', 'Sensitive Attribute'])
    for method in group['Mitigation'].unique()
    if method.lower() != 'baseline'
]
combo_df = pd.DataFrame(combos)
print(f"Number of (Dataset, Attribute, Method) combos: {len(combo_df)}")

# 2) Paired Wilcoxon signed-rank test of every mitigation against the baseline,
#    for each metric within each (Dataset, Sensitive Attribute) group.
#
# Rows are paired through 'fold', the running row index inside each
# (Dataset, Sensitive Attribute, Mitigation) group.
# NOTE(review): this assumes the CSV lists the folds in the same order for every
# mitigation, so that equal 'fold' values refer to the same split -- confirm.
df['fold'] = df.groupby(['Dataset', 'Sensitive Attribute', 'Mitigation']).cumcount()
METRICS = ['accuracy', 'f1_score', 'SPD', 'DI', 'EOD', 'AOD']

results = []
for (dataset, attr), sub in df.groupby(['Dataset', 'Sensitive Attribute']):
    for metric in METRICS:
        # One row per fold, one column per mitigation; a mitigation with fewer
        # folds than the others shows up as NaN in the missing rows.
        pivot = sub.pivot(index='fold', columns='Mitigation', values=metric)

        # Resolve the baseline column with the same case-insensitive rule that
        # is used to skip it below, so e.g. 'Baseline' is found as well.
        baseline_cols = [col for col in pivot.columns if col.lower() == 'baseline']
        if not baseline_cols:
            raise KeyError(
                f"No baseline rows for Dataset={dataset!r}, Sensitive Attribute={attr!r}"
            )
        baseline_col = baseline_cols[0]

        for method in pivot.columns:
            if method.lower() == 'baseline':
                continue

            # Keep only folds where both the baseline and the method have a
            # value. Comparing len(x) with len(y) cannot detect a mismatch,
            # because both series come from the same pivot.
            paired = pivot[[baseline_col, method]].dropna()
            x = paired[baseline_col]
            y = paired[method]

            if paired.empty or (x == y).all():
                # No usable pairs, or every paired difference is zero: the
                # statistic is undefined under zero_method='wilcox' (some SciPy
                # versions raise ValueError here). Record NaN so the row is kept.
                W, p = float('nan'), float('nan')
            else:
                W, p = wilcoxon(x, y, zero_method='wilcox')

            results.append({
                'Dataset': dataset,
                'Attribute': attr,
                'Method': method,
                'Metric': metric,
                'W': W,
                'p-value': p
            })

res_df = pd.DataFrame(results)
# Bonferroni-style adjustment that uses the number of metrics as the family
# size, capped at 1.0.
res_df['p_adj'] = (res_df['p-value'] * len(METRICS)).clip(upper=1.0)
# Report the expected count as an actual product so a mismatch with the number
# of tests performed is visible instead of being masked.
print(
    f"Performed {len(res_df)} Wilcoxon tests "
    f"({len(METRICS)} metrics × {len(combo_df)} combos = {len(METRICS) * len(combo_df)})."
)
res_df

Number of (Dataset, Attribute, Method) combos: 16
Performed 96 Wilcoxon tests (6 metrics × 16 combos = 96).




Unnamed: 0,Dataset,Attribute,Method,Metric,W,p-value,p_adj
0,adult,race,disparate impact remover,accuracy,81.0,0.02747941,0.1648765
1,adult,race,equalized odds postprocessing,accuracy,0.0,5.960464e-08,3.576279e-07
2,adult,race,meta fair classifier,accuracy,0.0,5.960464e-08,3.576279e-07
3,adult,race,prejudice remover,accuracy,111.5,0.2711364,1.0
4,adult,race,reweighing,accuracy,107.0,0.1409152,0.8454913
5,adult,race,disparate impact remover,f1_score,151.0,0.7711594,1.0
6,adult,race,equalized odds postprocessing,f1_score,0.0,5.960464e-08,3.576279e-07
7,adult,race,meta fair classifier,f1_score,8.0,1.490116e-06,8.940697e-06
8,adult,race,prejudice remover,f1_score,160.0,0.957845,1.0
9,adult,race,reweighing,f1_score,83.0,0.03180784,0.190847
