In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import friedmanchisquare
from scikit_posthocs import posthoc_nemenyi_friedman as nemenyi

In [2]:
df = pd.read_csv("/home/dobraczka/Downloads/git/er-embedding-benchmark/data/all_results.csv",header=0,sep=",")
df["dataset"] = df["dataset"].str.replace(r"-721_5fold-\d","")

In [3]:
# creates a df with mean value of folds and all values per feature variant with all classifiers and datasets as rows
def get_expanded_df(df, removed_dim_name, fixed_value, levels):
    aggregated = df[df[removed_dim_name]==fixed_value].groupby(levels).mean()
    #level_zero = df["model_name"].unique()
    #level_one = df["vector_name"].unique()
    level_zero = df[levels[0]].unique()
    level_one = df[levels[1]].unique()
    variant_cols = []
    row_names = None
    for f in level_one:
        tmp_names = None
        tmp_col = None
        for m in level_zero:
            tmp = aggregated.loc[(m,f)]["test_f1"]
            tmp = tmp.add_suffix("_" + m)
            tmp.name = f
            if tmp_names is None:
                tmp_names = set(tmp.index.to_list())
            else:
                tmp_names = tmp_names | set(tmp.index.to_list())
            if tmp_col is None:
                tmp_col = tmp
            else:
                tmp_col = tmp_col.append(tmp)
        if row_names is None:
            row_names = tmp_names
        else:
            row_names = row_names & tmp_names
        variant_cols.append(tmp_col)
    for i in range(0, len(variant_cols)):
        variant_cols[i] = variant_cols[i].loc[row_names]
    expanded = pd.DataFrame(variant_cols).T
    expanded.sort_index(inplace=True)
    return expanded

In [4]:
def stylize(df):
    cm = sns.cubehelix_palette(5, as_cmap=True)
    return df.style.background_gradient(cmap=cm, axis=1)

In [5]:
def statistical_test(df, features=True):
    #dont know how to better provide the arguments without it being interpreted as single array
    if features:
        statistic, pvalue = friedmanchisquare(df['OnlyEmb'], df['OnlySim'], df['OnlySimNormalized'], df['SimAndEmb'], df['SimAndEmbNormalized'])
    else:
        statistic, pvalue = friedmanchisquare(df.iloc[0],df.iloc[1],df.iloc[2],df.iloc[3],df.iloc[4],df.iloc[5],df.iloc[6],df.iloc[7])
    if pvalue < 0.05:
        print("There is a significant difference in the rank sums: ", pvalue)
        res = nemenyi(df)
        return res
    else:
        print("No significant difference")
        return None

# Analysis of Feature Input for RDGCN
To check if using embeddings in conjunction with similarities provides an improvement overall we sum the test_f1 value over all folds

In [6]:
whole_df = get_expanded_df(df,"embed_model","RDGCN",["model_name","vector_name","dataset"])
stylized = stylize(whole_df)
stylized

Unnamed: 0_level_0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_W_15K_V1_MLP,0.827257,0.79499,0.779592,0.731568,0.73211
D_W_15K_V1_decision tree,0.767036,0.739439,0.759321,0.740447,0.56285
D_W_15K_V1_gaussian naive bayes,0.645727,0.640866,0.631002,0.641253,0.479067
D_W_15K_V1_random forest 100,0.819275,0.75282,0.790552,0.751782,0.684229
D_W_15K_V1_random forest 20,0.797031,0.745846,0.789442,0.748123,0.60121
D_W_15K_V1_random forest 200,0.820802,0.751221,0.791908,0.752006,0.699446
D_W_15K_V1_random forest 50,0.80905,0.749912,0.790832,0.751583,0.655275
D_W_15K_V1_random forest 500,0.827357,0.749413,0.791672,0.752007,0.703812
D_W_15K_V1_svc,0.861579,0.805039,0.679925,0.638477,0.793565
D_W_15K_V2_MLP,0.898289,0.877148,0.849,0.794977,0.810077


Mean row wise rank (the lower the better):

In [7]:
whole_df.apply(lambda x: x.rank(ascending=False), axis=1).mean().sort_values()

SimAndEmb              1.717949
OnlySim                1.995726
SimAndEmbNormalized    3.179487
OnlySimNormalized      3.534188
OnlyEmb                4.572650
dtype: float64

Using embeddings AND similarity seems to improve the results slightly.
Let's test if the difference is significant!

The Friedman test is designed to check if there is a significant difference in the rank sums

In [8]:
res = statistical_test(whole_df)
res

There is a significant difference in the rank sums:  3.616224127070425e-54


Unnamed: 0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
SimAndEmb,-1.0,0.001,0.642759,0.001,0.001
SimAndEmbNormalized,0.001,-1.0,0.001,0.425813,0.001
OnlySim,0.642759,0.001,-1.0,0.001,0.001
OnlySimNormalized,0.001,0.425813,0.001,-1.0,0.001
OnlyEmb,0.001,0.001,0.001,0.001,-1.0


Values below 0.05 show a significant difference. So keeping the mean row wise ranks in mind:
SimAndEmb is significantly better than SimAndEmbNormalized, OnlySimNormalized and OnlyEmb

# Analysis of Feature Input for MultiKE
To check if using embeddings in conjunction with similarities provides an improvement overall we sum the test_f1 value over all folds

In [9]:
whole_df = get_expanded_df(df,"embed_model","MultiKE",["model_name","vector_name","dataset"])
stylized = stylize(whole_df)
stylized

Unnamed: 0_level_0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_W_15K_V1_MLP,0.891242,0.91091,0.825247,0.777011,0.863744
D_W_15K_V1_decision tree,0.795412,0.790338,0.782614,0.751449,0.648228
D_W_15K_V1_gaussian naive bayes,0.70191,0.679272,0.634647,0.679298,0.510797
D_W_15K_V1_random forest 100,0.858879,0.87606,0.839638,0.766699,0.847538
D_W_15K_V1_random forest 20,0.858057,0.846375,0.835402,0.764671,0.761282
D_W_15K_V1_random forest 200,0.856485,0.879976,0.837801,0.76651,0.860692
D_W_15K_V1_random forest 50,0.854638,0.867492,0.838676,0.764615,0.824295
D_W_15K_V1_random forest 500,0.858658,0.881195,0.838674,0.766747,0.870647
D_W_15K_V1_svc,0.871132,0.863103,0.679055,0.697728,0.883578
D_W_15K_V2_MLP,0.958186,0.962412,0.925613,0.843425,0.926455


Mean row wise rank (the lower the better):

In [10]:
whole_df.apply(lambda x: x.rank(ascending=False), axis=1).mean().sort_values()

SimAndEmb              1.777778
OnlySim                2.347222
SimAndEmbNormalized    2.527778
OnlyEmb                4.027778
OnlySimNormalized      4.319444
dtype: float64

Using embeddings AND similarities seems to improve the results.
Let's test if the difference is significant!

The Friedman test is designed to check if there is a significant difference in the rank sums

In [11]:
res = statistical_test(whole_df)
res

There is a significant difference in the rank sums:  9.182111362836687e-30


Unnamed: 0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
SimAndEmb,-1.0,0.035854,0.194648,0.001,0.001
SimAndEmbNormalized,0.035854,-1.0,0.9,0.001,0.001
OnlySim,0.194648,0.9,-1.0,0.001,0.001
OnlySimNormalized,0.001,0.001,0.001,-1.0,0.777213
OnlyEmb,0.001,0.001,0.001,0.777213,-1.0


The advantage of SimAndEmb is more clear here, which is probably due to the missing CSV and Movie datasets

# Analysis of Feature Input for BootEA
To check if using embeddings in conjunction with similarities provides an improvement overall we sum the test_f1 value over all folds

In [12]:
whole_df = get_expanded_df(df,"embed_model","BootEA",["model_name","vector_name","dataset"])
cm = sns.cubehelix_palette(5, as_cmap=True)
stylized = whole_df.style.background_gradient(cmap=cm, axis=1)
stylized

Unnamed: 0_level_0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_W_15K_V1_MLP,0.791804,0.818493,0.768113,0.713135,0.727974
D_W_15K_V1_decision tree,0.494057,0.77657,0.51491,0.748677,0.558257
D_W_15K_V1_gaussian naive bayes,0.782463,0.643331,0.7574,0.643552,0.537136
D_W_15K_V1_random forest 100,0.782081,0.783129,0.750966,0.760654,0.640658
D_W_15K_V1_random forest 20,0.800025,0.778346,0.730087,0.747176,0.594384
D_W_15K_V1_random forest 200,0.788171,0.782841,0.751714,0.775289,0.651368
D_W_15K_V1_random forest 50,0.788775,0.783869,0.74841,0.760121,0.627585
D_W_15K_V1_random forest 500,0.786525,0.781742,0.752484,0.774814,0.658408
D_W_15K_V1_svc,0.803262,0.75734,0.679193,0.660552,0.800042
D_W_15K_V2_MLP,0.95054,0.944617,0.924012,0.847119,0.936732


Mean row wise rank (the lower the better):

In [13]:
whole_df.apply(lambda x: x.rank(ascending=False), axis=1).mean().sort_values()

SimAndEmbNormalized    1.861111
SimAndEmb              2.055556
OnlySimNormalized      3.055556
OnlySim                3.736111
OnlyEmb                4.291667
dtype: float64

Interestingly normalization seems to be beneficial here

The Friedman test is designed to check if there is a significant difference in the rank sums

In [14]:
res = statistical_test(whole_df)
res

There is a significant difference in the rank sums:  1.8923043825176619e-26


Unnamed: 0,SimAndEmb,SimAndEmbNormalized,OnlySim,OnlySimNormalized,OnlyEmb
SimAndEmb,-1.0,0.9,0.001,0.001393,0.001
SimAndEmbNormalized,0.9,-1.0,0.001,0.001,0.001
OnlySim,0.001,0.001,-1.0,0.073738,0.216481
OnlySimNormalized,0.001393,0.001,0.073738,-1.0,0.001
OnlyEmb,0.001,0.001,0.216481,0.001,-1.0


Again the advantage of using similarities AND embeddings is clear here, which again is probably due to the missing CSV and Movie datasets.

# Analysis of Classifier Performance (on SimAndEmb)
Similarly we analyze the different performance of the classifiers

In [15]:
whole_df = get_expanded_df(df,"vector_name","SimAndEmb",["embed_model","model_name","dataset"])
stylized = stylize(whole_df)
stylized

Unnamed: 0_level_0,svc,random forest 20,random forest 50,random forest 100,random forest 200,random forest 500,decision tree,gaussian naive bayes,MLP
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
D_W_15K_V1_BootEA,0.803262,0.800025,0.788775,0.782081,0.788171,0.786525,0.494057,0.782463,0.791804
D_W_15K_V1_MultiKE,0.871132,0.858057,0.854638,0.858879,0.856485,0.858658,0.795412,0.70191,0.891242
D_W_15K_V1_RDGCN,0.861579,0.797031,0.80905,0.819275,0.820802,0.827357,0.767036,0.645727,0.827257
D_W_15K_V2_BootEA,0.945757,0.916522,0.924311,0.926187,0.920948,0.923547,0.531943,0.975445,0.95054
D_W_15K_V2_MultiKE,0.964495,0.923895,0.928638,0.930938,0.931163,0.932135,0.851528,0.669124,0.958186
D_W_15K_V2_RDGCN,0.901117,0.855963,0.864433,0.868989,0.870762,0.871779,0.835656,0.673662,0.898289
D_Y_15K_V1_BootEA,0.918748,0.885218,0.934201,0.950826,0.95151,0.964472,0.609641,0.900606,0.933886
D_Y_15K_V1_MultiKE,0.990512,0.9906,0.991445,0.992025,0.992168,0.992295,0.978265,0.95819,0.992159
D_Y_15K_V1_RDGCN,0.976349,0.982926,0.9891,0.988705,0.990082,0.990402,0.977337,0.948932,0.977117
D_Y_15K_V2_BootEA,0.987733,0.962488,0.984916,0.979826,0.985487,0.986704,0.630832,0.981906,0.992795


In [16]:
whole_df.apply(lambda x: x.rank(ascending=False), axis=1).mean().sort_values()

random forest 500       2.764706
random forest 200       3.470588
MLP                     3.720588
random forest 100       4.117647
random forest 50        4.632353
svc                     5.191176
random forest 20        6.382353
gaussian naive bayes    7.132353
decision tree           7.588235
dtype: float64

In [17]:
res = statistical_test(whole_df,features=False)
res

There is a significant difference in the rank sums:  2.819056165631006e-08


Unnamed: 0,svc,random forest 20,random forest 50,random forest 100,random forest 200,random forest 500,decision tree,gaussian naive bayes,MLP
svc,-1.0,0.662429,0.9,0.769909,0.190233,0.007955,0.009351,0.083732,0.398811
random forest 20,0.662429,-1.0,0.172421,0.018802,0.001,0.001,0.648994,0.9,0.002007
random forest 50,0.9,0.172421,-1.0,0.9,0.689302,0.112008,0.001,0.005253,0.9
random forest 100,0.769909,0.018802,0.9,-1.0,0.9,0.514642,0.001,0.001,0.9
random forest 200,0.190233,0.001,0.689302,0.9,-1.0,0.9,0.001,0.001,0.9
random forest 500,0.007955,0.001,0.112008,0.514642,0.9,-1.0,0.001,0.001,0.877392
decision tree,0.009351,0.648994,0.001,0.001,0.001,0.001,-1.0,0.9,0.001
gaussian naive bayes,0.083732,0.9,0.005253,0.001,0.001,0.001,0.9,-1.0,0.001
MLP,0.398811,0.002007,0.9,0.9,0.9,0.877392,0.001,0.001,-1.0
