Statistical validation of differences in performance among algorithms across benchmark datasets.
@author: igraugar@vub.be

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import wilcoxon
from scipy.stats import friedmanchisquare
from statsmodels.stats.multitest import multipletests

  import pandas.util.testing as tm


In [2]:
# Load the benchmark results: a csv file with observations in rows and algorithms in columns
results_path = "example.csv"
df = pd.read_csv(results_path)
df.head()

Unnamed: 0,dataset,Alg1,Alg2,Alg3
0,data1,0.131279,0.137068,0.132441
1,data2,0.461496,0.479547,0.570257
2,data3,0.712912,0.724823,0.725016
3,data4,0.513625,0.470073,0.466632
4,data5,0.765309,0.765585,0.767941


In [3]:
# Remove the "dataset" label column if it is present, so that only the
# per-algorithm columns remain (the tests in later cells iterate over every
# column of `df`).
# errors='ignore' makes the cell idempotent: re-running it, or loading a CSV
# that has no "dataset" column, is a no-op instead of raising KeyError.

df = df.drop(columns='dataset', errors='ignore')
df.head()

Unnamed: 0,Alg1,Alg2,Alg3
0,0.131279,0.137068,0.132441
1,0.461496,0.479547,0.570257
2,0.712912,0.724823,0.725016
3,0.513625,0.470073,0.466632
4,0.765309,0.765585,0.767941


In [4]:
# Friedman mean rank test for k related samples
# (every column of `df` is passed to the test as one sample)

friedman_stat, friedman_pvalue = friedmanchisquare(*(df[col] for col in df.columns))
print(f"Friedman chi square stat = {friedman_stat:.2E}, p-value = {friedman_pvalue:.2E}")

# interpretation: H0 is rejected unless the p-value exceeds the significance level
alpha = 0.05
if friedman_pvalue > alpha:
    print("Fail to reject H0")
else:
    print("Reject H0")

Friedman chi square stat = 7.42E+00, p-value = 2.45E-02
Reject H0


In [5]:
# Wilcoxon pairwise test for 2 related samples

alg1 = 'Alg2'
alg2 = 'Alg3'
first_scores, second_scores = df[alg1], df[alg2]

wilcoxon_stat, wilcoxon_pvalue = wilcoxon(first_scores, second_scores)
print(f"Wilcoxon stat = {wilcoxon_stat:.2E}, p-value = {wilcoxon_pvalue:.2E}")

# Number of rows where alg2 is lower / higher than alg1.
# NOTE(review): these are counts of rows, not sums of signed ranks -- confirm
# that this is what the "R-" / "R+" labels are meant to report.
print("R- =", (second_scores < first_scores).sum())
print("R+ =", (second_scores > first_scores).sum())

# interpretation: H0 is rejected unless the p-value exceeds the significance level
alpha = 0.05
if wilcoxon_pvalue > alpha:
    print("Fail to reject H0")
else:
    print("Reject H0")

Wilcoxon stat = 4.18E+02, p-value = 8.45E-03
R- = 37
R+ = 16
Reject H0


In [6]:
# Multiple Wilcoxon tests using the last column as control

control_col = df.columns[-1]
df2 = df.drop(columns=control_col)  # the algorithms that are compared against the control

# Per-algorithm results, in df2 column order; they are reused by the Holm
# correction and the LaTeX table in later cells.
uncorrected_pvalues = []
rnegs = []
rposs = []

# Significance level for the per-comparison interpretation. It does not change
# between iterations, so it is set once here instead of inside the loop.
alpha = 0.05

for c in df2.columns:
    wilcoxon_stat, wilcoxon_pvalue = wilcoxon(df[c], df[control_col])
    print("Wilcoxon test for", c, "vs.", control_col)
    print("Wilcoxon stat = %.2E, p-value = %.2E" % (wilcoxon_stat, wilcoxon_pvalue))
    uncorrected_pvalues.append(wilcoxon_pvalue)

    # Number of rows where the control is lower / higher than algorithm c.
    # NOTE(review): these are counts of rows, not sums of signed ranks -- confirm
    # that this is what the "R-" / "R+" labels are meant to report.
    rneg = (df[control_col] < df[c]).sum()
    rnegs.append(rneg)
    print("R- =", rneg)
    rpos = (df[control_col] > df[c]).sum()
    rposs.append(rpos)
    print("R+ =", rpos)

    # interpretation: H0 is rejected unless the p-value exceeds alpha
    print("Fail to reject H0\n" if wilcoxon_pvalue > alpha else "Reject H0\n")

Wilcoxon test for Alg1 vs. Alg3
Wilcoxon stat = 5.16E+02, p-value = 1.68E-01
R- = 30
R+ = 21
Fail to reject H0

Wilcoxon test for Alg2 vs. Alg3
Wilcoxon stat = 4.18E+02, p-value = 8.45E-03
R- = 37
R+ = 16
Reject H0



In [7]:
# Holm correction of the uncorrected pairwise Wilcoxon p-values.
# alpha is passed explicitly so that the corrected reject/fail decisions use the
# same significance level as the uncorrected tests, rather than silently falling
# back to the default of multipletests if alpha is ever changed.
# `holm` is kept as the returned tuple because it is indexed as holm[0]
# (reject flags) and holm[1] (corrected p-values), here and in the LaTeX cell.
holm = multipletests(uncorrected_pvalues, alpha=alpha, method='holm')
for b,p in zip(holm[0],holm[1]):
    print("Reject H0," if b else "Fail to reject H0,", "corrected p-value = ", "%.2E" % p)

Fail to reject H0, corrected p-value =  1.68E-01
Reject H0, corrected p-value =  1.69E-02


In [8]:
# Print the Friedman, Wilcoxon and Holm results as a LaTeX table

caption = (
    "Friedman $p$-value = %.3f. " % friedman_pvalue
    + "Wilcoxon pairwise test with Holm correction using "
    + control_col
    + " as control algorithm."
)

# The table is collected line by line and joined once at the end.
latex_lines = [
    "\\begin{table}[!ht]",
    "\\centering",
    "\\caption{" + caption + "}",
    "\\label{label1}",
    "\\begin{tabular}{|lccccc|}",
    "\\hline",
    "Algorithm & $p$-value & $R^-$ & $R^+$ & Holm & $H_0$ \\\\",
    "\\hline",
]

# One table row per non-control algorithm, in df2 column order.
for alg, unc, rn, rp, corrp, h in zip(df2.columns, uncorrected_pvalues, rnegs, rposs, holm[1], holm[0]):
    hyp = "Reject" if h else "Fail to reject"
    cells = [alg, "%.2E" % unc, str(rn), str(rp), "%.2E" % corrp, hyp]
    latex_lines.append(" & ".join(cells) + " \\\\")

latex_lines += ["\\hline", "\\end{tabular}", "\\end{table}"]

out = "\n".join(latex_lines)

print(out)

\begin{table}[!ht]
\centering
\caption{Friedman $p$-value = 0.025. Wilcoxon pairwise test with Holm correction using Alg3 as control algorithm.}
\label{label1}
\begin{tabular}{|lccccc|}
\hline
Algorithm & $p$-value & $R^-$ & $R^+$ & Holm & $H_0$ \\
\hline
Alg1 & 1.68E-01 & 30 & 21 & 1.68E-01 & Fail to reject \\
Alg2 & 8.45E-03 & 37 & 16 & 1.69E-02 & Reject \\
\hline
\end{tabular}
\end{table}
