In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from scikit_posthocs import posthoc_nemenyi_friedman as nemenyi
from scipy.stats import friedmanchisquare

In [2]:
# Location of the benchmark results CSV. The default is the original author's
# local checkout; set the ER_BENCHMARK_RESULTS environment variable to point
# at the file on another machine instead of editing this cell.
RESULTS_CSV = os.environ.get(
    "ER_BENCHMARK_RESULTS",
    "/home/dobraczka/Downloads/git/er-embedding-benchmark/data/all_results.csv",
)
# The first row holds the column names; fields are comma-separated.
df = pd.read_csv(RESULTS_CSV, header=0, sep=",")

# Analysis of Feature Input (for MultiKE)
To check whether using embeddings in conjunction with similarities provides an overall improvement, we average the test_f1 value over all folds and classifiers.

In [3]:
# Strip the "-721_5fold-<n>" suffix so that all folds of a dataset share one
# name. regex=True is required: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the names intact
# and silently keep the folds apart in the groupby below.
df["dataset"] = df["dataset"].str.replace(r"-721_5fold-\d", "", regex=True)

# Mean of every numeric metric per (embed_model, vector_name, dataset), i.e.
# averaged over everything not in the group key. numeric_only=True keeps the
# aggregation from raising on non-numeric columns in pandas >= 2.0.
# (Add "model_name" to the group key to keep the classifiers separate.)
aggregated = df.groupby(["embed_model", "vector_name", "dataset"]).mean(numeric_only=True)

In [4]:
# test_f1 of the MultiKE rows of `aggregated`; what remains of the index after
# selecting the embed_model level is (vector_name, dataset).
one_data_series = aggregated.loc["MultiKE"]["test_f1"]

# Block design: one row per dataset, one column per feature-input variant.
feature_variants = ['OnlyEmb', 'OnlySim', 'OnlySimNormalized', 'SimAndEmb', 'SimAndEmbNormalized']
per_variant_scores = [one_data_series[variant] for variant in feature_variants]
one_data_block_design = pd.DataFrame(per_variant_scores).T
one_data_block_design.columns = feature_variants

In [5]:
# Colour each row (axis=1) with a 5-step cubehelix gradient so the variants
# can be compared within a dataset at a glance.
cm = sns.cubehelix_palette(n_colors=5, as_cmap=True)
styler = one_data_block_design.style
s = styler.background_gradient(axis=1, cmap=cm)
s

Unnamed: 0_level_0,OnlyEmb,OnlySim,OnlySimNormalized,SimAndEmb,SimAndEmbNormalized
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_W_15K_V1,0.785645,0.790195,0.748303,0.83849,0.843858
D_W_15K_V2,0.85083,0.872561,0.826816,0.8989,0.907431
D_Y_15K_V1,0.862925,0.985768,0.967756,0.986407,0.971496
D_Y_15K_V2,0.883976,0.990009,0.976977,0.990036,0.98258
EN_DE_15K_V1,0.869927,0.888634,0.892834,0.963513,0.931831
EN_DE_15K_V2,0.880563,0.928788,0.905889,0.969138,0.940233
EN_FR_15K_V1,0.862289,0.886036,0.852618,0.919756,0.91535
EN_FR_15K_V2,0.8952,0.944003,0.864234,0.943151,0.927239


Using Embeddings seems to improve the results.
Let's test if the difference is significant!

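The Friedman test checks whether there is a significant difference in the rank sums of the feature-input variants across the datasets. If there is, the Nemenyi post-hoc test shows which pairs of variants differ.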

In [6]:
# Friedman test across the feature-input variants (one sample per column of
# the block design, one measurement per dataset).
statistic, pvalue = friedmanchisquare(
    *(one_data_series[variant] for variant in one_data_block_design.columns)
)
if pvalue < 0.05:
    print("There is a significant difference in the rank sums: ", pvalue)
    # Pairwise Nemenyi post-hoc test: matrix of p-values between variants.
    res = nemenyi(one_data_block_design)
else:
    # Always bind `res`, so the last line neither raises NameError on a fresh
    # kernel nor shows a stale result from an earlier run.
    print("No significant difference in the rank sums: ", pvalue)
    res = None
res

There is a significant difference in the rank sums:  6.053820135900641e-05


Unnamed: 0,OnlyEmb,OnlySim,OnlySimNormalized,SimAndEmb,SimAndEmbNormalized
OnlyEmb,-1.0,0.123326,0.9,0.001,0.022374
OnlySim,0.123326,-1.0,0.174719,0.508007,0.9
OnlySimNormalized,0.9,0.174719,-1.0,0.001393,0.035854
SimAndEmb,0.001,0.508007,0.001393,-1.0,0.866947
SimAndEmbNormalized,0.022374,0.9,0.035854,0.866947,-1.0


There is a significant difference (p < 0.05) between using OnlyEmb and SimAndEmb/SimAndEmbNormalized.

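There is also a significant difference (p < 0.05) between using OnlySimNormalized and SimAndEmb/SimAndEmbNormalized. The difference between OnlySim and SimAndEmb/SimAndEmbNormalized is not significant (p ≈ 0.51 and p ≈ 0.90).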

# Averaged over all classifiers, using similarities in conjunction with embeddings gives significantly better results than using embeddings alone or normalized similarities alone; the difference to unnormalized similarities alone is not significant