In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from scipy.stats import spearmanr, kendalltau

def rank_accuracy(ground_truth, predicted_rank):
    """Return 1 when both sequences are exactly equal, otherwise 0.

    Equality is decided by ``np.array_equal``, so the two inputs must have the
    same shape and the same value at every position to score 1.
    """
    if np.array_equal(ground_truth, predicted_rank):
        return 1
    return 0

In [22]:
from scipy.stats import ttest_rel, wilcoxon
def statistical_significance(data_name, model_names, dataset, metric_name, metric_fn):
    """Score every model per ``slug`` group and Wilcoxon-test the first model against the rest.

    Rows of ``dataset`` are grouped by the ``slug`` column. For each model
    column, the values of a group (in row order) are passed to ``metric_fn``
    together with the same values sorted in descending order, giving one score
    per group.

    Parameters
    ----------
    data_name : str
        Label printed in the header line.
    model_names : list of str
        Column names in ``dataset``. The first entry is the baseline that every
        other entry is paired against.
    dataset : pandas.DataFrame
        Must contain a ``slug`` column and one column per model name.
    metric_name : str
        Label printed in the header line. The value ``'ndcg'`` additionally
        switches to the 2-D calling convention described below.
    metric_fn : callable
        Called as ``metric_fn(values, values_sorted_descending)``.

        * ``metric_name == 'ndcg'``: groups with a single row are skipped, and
          both arguments are passed as arrays of shape ``(1, n)``.
        * otherwise: both arguments are 1-D arrays; if the result is a tuple
          only its first element is kept.

    Returns
    -------
    list of (str, list)
        One ``(model_name, scores)`` pair per entry of ``model_names``, in the
        same order.

    Notes
    -----
    Prints a ``data_name + metric_name`` header and, for each non-baseline
    model, the p-value and statistic returned by ``wilcoxon``.

    NOTE(review): scores are handed to ``wilcoxon`` unfiltered. If ``metric_fn``
    can yield NaN for some group, confirm how the installed scipy treats it.
    """
    # Group once; each model column is pulled out of the same grouping below.
    by_slug = dataset.groupby(['slug'])
    wrap_as_2d = metric_name == 'ndcg'

    results = []
    for m in model_names:
        # Collect only this model's column, so no lists are built for the
        # other columns in the file and no index reset is needed.
        grades_per_slug = by_slug[m].agg(list)

        metrics = []
        for grades in grades_per_slug:
            observed = np.asarray(grades)
            ideal = np.asarray(sorted(grades, reverse=True))

            if wrap_as_2d:
                # Single-row groups are left out of the ndcg scores entirely.
                if len(observed) == 1:
                    continue
                score = metric_fn(observed[np.newaxis, :], ideal[np.newaxis, :])
            else:
                score = metric_fn(observed, ideal)
                # Some metric functions return (value, extra...); keep the value.
                if isinstance(score, tuple):
                    score = score[0]
            metrics.append(score)
        results.append((m, metrics))

    print(data_name + ' + ' + metric_name)
    for other_name, other_scores in results[1:]:
        baseline_name, baseline_scores = results[0]
        test = wilcoxon(baseline_scores, other_scores)
        print(baseline_name + ' vs. ' + other_name + '- p-value:{} statistic:{}'.format(test[1], test[0]))

    return results

### Table 5: os_eng, NDCG

In [23]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'bert_regression',
    'bert_classification',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='ndcg',
    metric_fn=ndcg_score,
)

os_eng + ndcg
bertRank vs. bert_regression- p-value:0.7733168776432707 statistic:1763.0
bertRank vs. bert_classification- p-value:0.013745972805509434 statistic:1084.0


  arrays.reset_index(inplace=True)


### Table 5: os_eng, Spearman correlation

In [24]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'bert_regression',
    'bert_classification',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='spearmanr',
    metric_fn=spearmanr,
)

  arrays.reset_index(inplace=True)


os_eng + spearmanr
bertRank vs. bert_regression- p-value:0.000967428450247743 statistic:19.0
bertRank vs. bert_classification- p-value:3.962393646612975e-08 statistic:6.0


### Table 5: os_eng, Kendall's tau

In [25]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'bert_regression',
    'bert_classification',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='kendalltau',
    metric_fn=kendalltau,
)

os_eng + kendalltau
bertRank vs. bert_regression- p-value:0.000967428450247743 statistic:19.0
bertRank vs. bert_classification- p-value:4.1346984212997275e-08 statistic:12.0


  arrays.reset_index(inplace=True)


### Table 5: os_eng, rank accuracy (RA)

In [26]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'bert_regression',
    'bert_classification',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='RA',
    metric_fn=rank_accuracy,
)

os_eng + RA
bertRank vs. bert_regression- p-value:0.000967428450247743 statistic:19.0
bertRank vs. bert_classification- p-value:9.491672398804045e-07 statistic:54.0


  arrays.reset_index(inplace=True)


### Table 4: os_eng, NDCG

In [33]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'word2vec-google-news-300_pairwiseSVM',
    'fasttext-wiki-news-subwords-300_pairwiseSVM',
    'glove-wiki-gigaword-300_pairwiseSVM',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
# Bind to `results` so the name always holds the scores of the cell run last.
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='ndcg',
    metric_fn=ndcg_score,
)

  arrays.reset_index(inplace=True)


os_eng + ndcg
bertRank vs. word2vec-google-news-300_pairwiseSVM- p-value:0.00042382365472279907 statistic:444.0
bertRank vs. fasttext-wiki-news-subwords-300_pairwiseSVM- p-value:0.03034672438851402 statistic:780.0
bertRank vs. glove-wiki-gigaword-300_pairwiseSVM- p-value:1.7273419704794992e-05 statistic:333.0


In [28]:
# Same comparison on the newsela_en data; bertRank is listed first, so it is
# the baseline paired against each other model.
model_names = [
    'bertRank',
    'word2vec-google-news-300_pairwiseSVM',
    'fasttext-wiki-news-subwords-300_pairwiseSVM',
    'glove-wiki-gigaword-300_pairwiseSVM',
]
dataset = pd.read_csv('newsela_en_rank_features.csv')
# Bind to `results` so the name always holds the scores of the cell run last.
results = statistical_significance(
    data_name='newsela_en',
    model_names=model_names,
    dataset=dataset,
    metric_name='ndcg',
    metric_fn=ndcg_score,
)

  arrays.reset_index(inplace=True)


newsela_en + ndcg
bertRank vs. word2vec-google-news-300_pairwiseSVM- p-value:1.1152287953113957e-109 statistic:52553.5
bertRank vs. fasttext-wiki-news-subwords-300_pairwiseSVM- p-value:3.5104135844101953e-75 statistic:90392.5
bertRank vs. glove-wiki-gigaword-300_pairwiseSVM- p-value:1.6933711304352312e-44 statistic:135890.0


### Table 4: os_eng, Spearman correlation

In [32]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'word2vec-google-news-300_pairwiseSVM',
    'fasttext-wiki-news-subwords-300_pairwiseSVM',
    'glove-wiki-gigaword-300_pairwiseSVM',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
# Bind to `results` so the name always holds the scores of the cell run last.
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='spearmanr',
    metric_fn=spearmanr,
)

  arrays.reset_index(inplace=True)


os_eng + spearmanr
bertRank vs. word2vec-google-news-300_pairwiseSVM- p-value:0.0832645166635504 statistic:9.0
bertRank vs. fasttext-wiki-news-subwords-300_pairwiseSVM- p-value:0.007466740134387747 statistic:13.0
bertRank vs. glove-wiki-gigaword-300_pairwiseSVM- p-value:0.2059032107320684 statistic:7.0


### Table 4: os_eng, Kendall's tau

In [30]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'word2vec-google-news-300_pairwiseSVM',
    'fasttext-wiki-news-subwords-300_pairwiseSVM',
    'glove-wiki-gigaword-300_pairwiseSVM',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
# Bind to `results` so the name always holds the scores of the cell run last.
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='kendalltau',
    metric_fn=kendalltau,
)

os_eng + kendalltau
bertRank vs. word2vec-google-news-300_pairwiseSVM- p-value:0.0832645166635504 statistic:9.0
bertRank vs. fasttext-wiki-news-subwords-300_pairwiseSVM- p-value:0.007466740134387747 statistic:13.0
bertRank vs. glove-wiki-gigaword-300_pairwiseSVM- p-value:0.2059032107320684 statistic:7.0


  arrays.reset_index(inplace=True)


### Table 4: os_eng, rank accuracy (RA)

In [31]:
# bertRank is listed first, so it is the baseline paired against each other model.
model_names = [
    'bertRank',
    'word2vec-google-news-300_pairwiseSVM',
    'fasttext-wiki-news-subwords-300_pairwiseSVM',
    'glove-wiki-gigaword-300_pairwiseSVM',
]
dataset = pd.read_csv('os_eng_rank_features.csv')
# Bind to `results` so the name always holds the scores of the cell run last.
results = statistical_significance(
    data_name='os_eng',
    model_names=model_names,
    dataset=dataset,
    metric_name='RA',
    metric_fn=rank_accuracy,
)

os_eng + RA
bertRank vs. word2vec-google-news-300_pairwiseSVM- p-value:0.15729920705028502 statistic:9.0
bertRank vs. fasttext-wiki-news-subwords-300_pairwiseSVM- p-value:0.012554918596966547 statistic:14.0
bertRank vs. glove-wiki-gigaword-300_pairwiseSVM- p-value:0.4142161782425253 statistic:7.0


  arrays.reset_index(inplace=True)
