# Rankings and Statistical Significance Tests

In this notebook, we use the tabular results of the accompanying article to compute a ranking of the multi-annotator classification approaches and to perform statistical significance tests regarding the superiority of our approach, annot-mix.

First, we load the tabular results as NumPy arrays.

In [None]:
import numpy as np
from scipy.stats import friedmanchisquare, norm, rankdata
from matplotlib import pyplot as plt

# Experimental results from the benchmark study.
# Names of the compared approaches. Their order defines the column order of the
# result tables below; annot-mix is listed last and is addressed via index -1
# in the significance tests.
approach_names = ["mv-base", "mv-mixup", "crowd-layer", "trace-reg", "conal", "union-net", "madl", "geo-ref-f", "geo-ref-w", "crowd-ar", "annot-mix"]
# Results after the last epoch: one column per approach (same order as
# `approach_names`). Higher values are treated as better, because the ranking
# is computed on the negated table.
# NOTE(review): presumably each row is one dataset (variant) of the benchmark
# and the values are accuracies in percent -- confirm against the article.
res_last = np.array([
    [66.1, 68.2, 69.8, 66.4, 69.0, 68.6, 72.0, 70.2, 70.1, 69.0, 73.8],
    [80.5, 82.8, 85.7, 82.6, 83.7, 85.2, 82.5, 85.4, 85.5, 84.6, 85.8],
    [73.3, 79.5, 79.5, 76.0, 80.6, 80.5, 79.5, 80.7, 80.9, 79.6, 84.6],
    [63.8, 81.3, 77.9, 65.1, 77.9, 81.4, 76.9, 80.5, 79.8, 80.5, 82.4],
    [51.9, 60.0,  4.8, 53.7, 27.4,  1.3, 42.8,  8.1,  8.1,  1.0, 64.7],
    [75.6, 81.3, 56.8, 82.7, 82.5, 66.3, 69.1, 82.4, 73.9, 78.1, 85.1],
    [68.0, 71.7, 36.8, 76.2, 52.8, 43.1, 85.0, 44.5, 44.5, 48.2, 90.1],
    [86.1, 89.0, 91.1, 92.0, 90.1, 90.1, 91.1, 91.8, 91.9, 89.4, 92.3],
    [71.9, 81.3,  0.9, 60.4, 21.3,  0.9, 79.2,  1.6,  1.7,  0.1, 83.6],
    [35.6, 43.0, 31.3, 36.6, 40.9, 30.9, 47.2, 35.2, 34.6, 39.3, 55.2],
    [74.3, 74.4, 85.6, 86.7, 76.1, 86.2, 76.0, 86.7, 82.1, 72.4, 86.2],
])
# Results after the best epoch; same row/column layout as `res_last`.
res_best = np.array([
    [66.8, 67.5, 69.0, 67.8, 69.2, 68.5, 72.4, 70.4, 69.8, 70.4, 73.8],
    [85.5, 85.1, 87.3, 85.8, 87.1, 87.5, 86.5, 87.4, 87.3, 87.3, 86.5],
    [72.8, 79.4, 79.4, 75.5, 80.3, 80.2, 78.8, 80.4, 80.7, 79.2, 84.4],
    [79.1, 82.8, 81.9, 79.2, 79.9, 82.0, 80.5, 81.9, 81.5, 80.4, 83.2],
    [53.2, 60.2,  4.5, 54.3, 27.1,  4.0, 42.7,  7.9,  7.9,  4.3, 64.1],
    [78.4, 81.3, 57.2, 82.8, 82.8, 66.8, 71.2, 82.4, 74.2, 77.8, 84.9],
    [71.5, 71.8, 36.9, 77.9, 52.8, 44.0, 85.1, 44.9, 44.9, 48.2, 90.0],
    [87.4, 88.2, 90.7, 91.4, 90.0, 89.8, 91.1, 91.5, 91.6, 89.1, 91.4],
    [74.9, 81.2,  1.0, 64.5, 21.2,  0.9, 79.0,  1.6,  1.6,  1.0, 83.5],
    [46.9, 46.2, 31.9, 46.9, 41.8, 33.6, 47.7, 35.9, 35.8, 40.1, 55.1],
    [77.1, 76.6, 86.2, 86.2, 79.3, 87.1, 78.0, 87.4, 83.1, 76.3, 87.0],
])

Second, we compute the rankings for performance results after the last and the best epoch.

In [None]:
# Compute rankings.
# Within every row, the approaches are ranked on the negated results, so rank 1
# goes to the highest value (ties share their average rank). Averaging over
# the rows then yields one mean rank per approach.
row_ranks_last = rankdata(np.negative(res_last), axis=1)
ranks_last = row_ranks_last.mean(axis=0)
table_last = np.column_stack((approach_names, ranks_last.round(2)))
print(f"Ranks (last epoch):\n {table_last}")

row_ranks_best = rankdata(np.negative(res_best), axis=1)
ranks_best = row_ranks_best.mean(axis=0)
table_best = np.column_stack((approach_names, ranks_best.round(2)))
print(f"Ranks (best epoch):\n {table_best}")

# Plot rankings.
# One image row per approach; column 0 holds the last-epoch mean ranks and
# column 1 the best-epoch mean ranks. The colour scale spans the observed
# range of mean ranks.
ranks = np.column_stack((ranks_last, ranks_best))
lowest_rank, highest_rank = ranks.min(), ranks.max()
plt.imshow(ranks, cmap="PRGn_r", vmin=lowest_rank, vmax=highest_rank, alpha=1.0)
plt.yticks(np.arange(len(approach_names)))
plt.savefig("ranks.pdf")
plt.show()

Third, we perform the statistical significance tests.

In [None]:
# Set significance level to 5%.
alpha = 0.05

for epoch_type, res, ranks in zip(["last", "best"], [res_last, res_best], [ranks_last, ranks_best]):
    # n rows (repeated measurements) and k columns (compared approaches).
    n, k = res.shape

    # Storage for significance results: one flag per competitor of annot-mix
    # (annot-mix itself is the last entry and is therefore excluded).
    significantly_better = np.zeros(len(approach_names) - 1, dtype=bool)

    # Perform Friedman test as omnibus test for the results. Each column
    # (approach) is passed as one sample.
    f_statistic, p_value = friedmanchisquare(*res.T)

    # The post-hoc comparisons are only carried out if the omnibus test
    # rejects the hypothesis that all approaches perform equally.
    if p_value < alpha:
        # Perform Dunn's post-hoc test for the results: compare the mean rank
        # of every competitor with the mean rank of annot-mix.
        rank_cmp = ranks[-1]
        # Standard error of the difference between two mean ranks under the
        # Friedman null hypothesis is sqrt(k * (k + 1) / (6 * n)).
        rank_diff_std = np.sqrt(k * (k + 1) / (6 * n))
        # Positive z-scores mean that annot-mix has the lower (better) mean rank.
        z_scores = (ranks[:-1] - rank_cmp) / rank_diff_std
        # Two-sided p-values; the absolute value keeps them within [0, 1] and
        # the survival function avoids the cancellation of `1 - cdf`.
        p_values = 2 * norm.sf(np.abs(z_scores))

        # Perform Holm's step down procedure: walk through the p-values in
        # ascending order and compare the i-th smallest one with
        # alpha / (number of remaining hypotheses).
        p_sorted_idx = np.argsort(p_values)
        n_hypotheses = len(p_values)
        for i, idx in enumerate(p_sorted_idx):
            alpha_i = alpha / (n_hypotheses - i)
            if p_values[idx] >= alpha_i:
                # The first non-rejected hypothesis stops the procedure; all
                # hypotheses with larger p-values are retained as well.
                break
            # Only the current hypothesis is rejected. The difference counts
            # in favour of annot-mix only if annot-mix has the better rank.
            significantly_better[idx] = z_scores[idx] > 0

    print(f"Is annot-mix significantly better at the {epoch_type} epoch?")
    print(np.column_stack((approach_names[:-1], significantly_better)))