In [1]:
import sys
import os

# Put the parent of the current working directory at the front of the import
# path (presumably so the `src` package imported below can be resolved).
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
sys.path[:0] = [project_root]

import pandas as pd
from src.statistical_testing import perform_wilcoxon

# Raise pandas' display limits so frames with up to 500 rows / 500 columns
# are rendered in full instead of being truncated.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Load the raw metrics CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../reports/raw_metrics.csv')
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Dataset,Sensitive Attribute,Mitigation,accuracy,f1_score,SPD,DI,EOD,AOD,Pipeline
0,adult,race,baseline,0.853721,0.661294,-0.081909,0.599984,-0.040931,-0.036119,base
1,adult,race,baseline,0.852902,0.658589,-0.090835,0.556516,-0.081591,-0.054249,base
2,adult,race,baseline,0.847989,0.649847,-0.085304,0.587117,-0.076511,-0.054843,base
3,adult,race,baseline,0.85454,0.658824,-0.084947,0.574141,-0.028706,-0.029367,base
4,adult,race,baseline,0.849626,0.650986,-0.081713,0.59767,-0.056171,-0.03799,base


In [3]:
# Run the project's Wilcoxon helper on the raw metrics frame loaded above.
# In the recorded run it reports 132 tests (6 metrics × 22 combos).
# NOTE: `df` is reassigned further down in this notebook (to a copy of the
# Wilcoxon results), so re-running this cell out of order would pass the
# wrong frame — reload the raw metrics first.
wilcoxon_results = perform_wilcoxon(df)

Number of (Dataset, Attribute, Method) combos: 22
Performed 132 Wilcoxon tests (6 metrics × 22 combos = 132).




In [4]:
# Spot-check three rows of the results by position (rows 37, 38 and 39).
wilcoxon_results.iloc[37:40]

Unnamed: 0,Dataset,Attribute,Pipeline,Method,Metric,W,p-value,p_adj,significant,significance
37,adult,sex,postprocessing,equalized odds postprocessing,accuracy,0.0,0.0,0.0,True,***
38,adult,sex,inprocessing,meta fair classifier,accuracy,0.0,0.0,0.0,True,***
39,adult,sex,inprocessing,prejudice remover,accuracy,156.0,0.874,1.0,False,


In [5]:
# Show the complete results table; the 500-row display limit set at the top
# of the notebook keeps it from being truncated.
wilcoxon_results

Unnamed: 0,Dataset,Attribute,Pipeline,Method,Metric,W,p-value,p_adj,significant,significance
0,adult,race,preprocessing,disparate impact remover,accuracy,81.0,0.027,0.165,False,
1,adult,race,postprocessing,equalized odds postprocessing,accuracy,0.0,0.0,0.0,True,***
2,adult,race,inprocessing,meta fair classifier,accuracy,0.0,0.0,0.0,True,***
3,adult,race,inprocessing,prejudice remover,accuracy,111.5,0.271,1.0,False,
4,adult,race,postprocessing,reject option classification,accuracy,0.0,0.0,0.0,True,***
5,adult,race,preprocessing,reweighing,accuracy,107.0,0.141,0.845,False,
6,adult,race,preprocessing,disparate impact remover,f1_score,151.0,0.771,1.0,False,
7,adult,race,postprocessing,equalized odds postprocessing,f1_score,0.0,0.0,0.0,True,***
8,adult,race,inprocessing,meta fair classifier,f1_score,8.0,0.0,0.0,True,***
9,adult,race,inprocessing,prejudice remover,f1_score,160.0,0.958,1.0,False,


In [6]:
# Build a two-level lookup: Method -> (Dataset, Attribute) -> per-metric stats.
# Each leaf is a DataFrame indexed by Metric holding the test statistic,
# the raw and adjusted p-values, and the significance flags.
stat_columns = ['W','p-value','p_adj', 'significant', 'significance']
results_nested = {
    method_name: {
        (dataset, attribute): combo_rows.set_index('Metric')[stat_columns]
        for (dataset, attribute), combo_rows in method_rows.groupby(['Dataset','Attribute'])
    }
    for method_name, method_rows in wilcoxon_results.groupby('Method')
}

# Example lookup: meta fair classifier → (adult, sex)
results_nested['meta fair classifier'][('adult','sex')]

Unnamed: 0_level_0,W,p-value,p_adj,significant,significance
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,0.0,0.0,0.0,True,***
f1_score,32.0,0.0,0.001,True,**
SPD,0.0,0.0,0.0,True,***
DI,0.0,0.0,0.0,True,***
EOD,39.0,0.0,0.003,True,**
AOD,9.0,0.0,0.0,True,***


In [7]:
# Same kind of lookup for another combination: prejudice remover on the
# compas dataset with `sex` as the attribute.
results_nested['prejudice remover'][('compas','sex')]

Unnamed: 0_level_0,W,p-value,p_adj,significant,significance
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,0.0,0.0,0.0,True,***
f1_score,0.0,0.0,0.0,True,***
SPD,0.0,0.0,0.0,True,***
DI,0.0,0.0,0.0,True,***
EOD,0.0,0.0,0.0,True,***
AOD,0.0,0.0,0.0,True,***


In [8]:
# Long-format columns carried over from the Wilcoxon results.
columns = ["Dataset", "Attribute", "Pipeline", "Method", "Metric", "W", "p-value", "p_adj", "significant", "significance"]
# NOTE: this rebinds `df`, which earlier in the notebook held the raw metrics
# read from ../reports/raw_metrics.csv. The name is kept so existing cells keep
# working, but the raw metrics must be reloaded before re-running the
# perform_wilcoxon cell.
df = pd.DataFrame(wilcoxon_results, columns=columns)

# Pivot to wide format as one row per (Method, Pipeline, Dataset, Attribute).
# Only W, p-value, p_adj and significance are spread across metrics; the
# boolean `significant` column is not part of `values` and is therefore absent
# from the exported table.
pivot_df = df.pivot_table(
    index=["Method", "Pipeline", "Dataset", "Attribute"],
    columns="Metric",
    values=["W", "p-value", "p_adj", "significance"],
    aggfunc="first"
).reset_index()

# Flatten the two-level column index to "<value>_<metric>" names; the index
# columns restored by reset_index() have an empty second level and keep their
# plain name.
pivot_df.columns = [
    f"{outer}_{inner}" if inner else outer
    for outer, inner in pivot_df.columns.to_flat_index()
]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing (a fresh checkout may not have it).
output_path = "../reports/wilcoxon/wilcoxon_results.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pivot_df.to_csv(output_path, index=False)

In [9]:
# Display the wide table that was written to ../reports/wilcoxon/wilcoxon_results.csv.
pivot_df

Unnamed: 0,Method,Pipeline,Dataset,Attribute,W_AOD,W_DI,W_EOD,W_SPD,W_accuracy,W_f1_score,p-value_AOD,p-value_DI,p-value_EOD,p-value_SPD,p-value_accuracy,p-value_f1_score,p_adj_AOD,p_adj_DI,p_adj_EOD,p_adj_SPD,p_adj_accuracy,p_adj_f1_score,significance_AOD,significance_DI,significance_EOD,significance_SPD,significance_accuracy,significance_f1_score
0,disparate impact remover,preprocessing,adult,race,1.0,1.0,1.0,8.0,81.0,151.0,0.0,0.0,0.0,0.0,0.027,0.771,0.0,0.0,0.0,0.0,0.165,1.0,***,***,***,***,,
1,disparate impact remover,preprocessing,adult,sex,27.0,18.0,22.0,45.0,33.0,13.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.005,0.001,0.0,***,***,***,**,**,***
2,disparate impact remover,preprocessing,compas,race,3.0,9.0,0.0,3.0,33.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,***,***,***,***,**,***
3,disparate impact remover,preprocessing,compas,sex,3.0,1.0,2.0,3.0,23.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,***,***,***,***,***,***
4,equalized odds postprocessing,postprocessing,adult,race,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,***,***,***,***,***,***
5,equalized odds postprocessing,postprocessing,adult,sex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,***,***,***,***,***,***
6,equalized odds postprocessing,postprocessing,compas,race,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,***,***,***,***,***,***
7,equalized odds postprocessing,postprocessing,compas,sex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,***,***,***,***,***,***
8,meta fair classifier,inprocessing,adult,race,10.0,60.0,13.0,6.0,0.0,8.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.028,0.0,0.0,0.0,0.0,***,*,***,***,***,***
9,meta fair classifier,inprocessing,adult,sex,9.0,0.0,39.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.001,***,***,**,***,***,**
