In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from scikit_posthocs import posthoc_nemenyi_friedman as nemenyi
from scipy.stats import friedmanchisquare

In [2]:
# Location of the aggregated benchmark results.
# Set the ER_BENCHMARK_RESULTS environment variable to point at your own copy
# of all_results.csv; the fallback is the machine-specific absolute path this
# notebook was originally run with, kept so existing setups still work.
results_csv = os.environ.get(
    "ER_BENCHMARK_RESULTS",
    "/home/dobraczka/Downloads/git/er-embedding-benchmark/data/all_results.csv",
)
df = pd.read_csv(results_csv, header=0, sep=",")

In [3]:
# Peek at the result rows recorded for the dblp-acm dataset.
is_dblp_acm = df["dataset"] == "dblp-acm"
df[is_dblp_acm]

Unnamed: 0,dataset,embed_model,model_name,test_f1,test_precision,test_recall,test_time,train_f1,train_precision,train_recall,train_time,val_f1,val_precision,val_recall,vector_name
2385,dblp-acm,RDGCN,svc,1.000000,1.000000,1.000000,0.346381,1.000000,1.000000,1.000000,2.441623,0.994350,1.000000,0.988764,SimAndEmb
2386,dblp-acm,RDGCN,random forest 20,0.950355,1.000000,0.905405,0.336764,1.000000,1.000000,1.000000,2.812115,0.951708,1.000000,0.907865,SimAndEmb
2387,dblp-acm,RDGCN,random forest 50,0.997743,1.000000,0.995495,0.337436,1.000000,1.000000,1.000000,3.480028,0.995485,1.000000,0.991011,SimAndEmb
2388,dblp-acm,RDGCN,random forest 100,0.997743,1.000000,0.995495,0.346099,1.000000,1.000000,1.000000,4.520582,0.994350,1.000000,0.988764,SimAndEmb
2389,dblp-acm,RDGCN,random forest 200,1.000000,1.000000,1.000000,0.361146,1.000000,1.000000,1.000000,6.753609,0.995485,1.000000,0.991011,SimAndEmb
2390,dblp-acm,RDGCN,random forest 500,0.995475,1.000000,0.990991,0.380361,1.000000,1.000000,1.000000,13.717657,0.990930,1.000000,0.982022,SimAndEmb
2391,dblp-acm,RDGCN,decision tree,0.682493,1.000000,0.518018,0.337474,1.000000,1.000000,1.000000,2.615690,0.611544,1.000000,0.440449,SimAndEmb
2392,dblp-acm,RDGCN,gaussian naive bayes,0.967442,1.000000,0.936937,0.337044,0.996778,1.000000,0.993577,2.365130,0.968714,1.000000,0.939326,SimAndEmb
2393,dblp-acm,RDGCN,MLP,1.000000,1.000000,1.000000,0.415257,1.000000,1.000000,1.000000,5.479762,0.996618,1.000000,0.993258,SimAndEmb
2394,dblp-acm,RDGCN,svc,0.969838,1.000000,0.941441,0.239101,1.000000,1.000000,1.000000,1.758556,0.968714,1.000000,0.939326,SimAndEmbNormalized


# Analysis of Feature Input (for RDGCN,MLP)
To check whether using embeddings in conjunction with similarities provides an overall improvement, we average the test_f1 value of the MLP classifier over all folds of each dataset.

In [4]:
# Strip the "-721_5fold-<digit>" suffix from the dataset names so that rows
# differing only in that suffix share one dataset label and are averaged
# together by the groupby below.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave the suffix in place.
df["dataset"] = df["dataset"].str.replace(r"-721_5fold-\d", "", regex=True)

# Mean metrics of the MLP classifier per (embed_model, vector_name, dataset).
# numeric_only=True keeps string columns such as model_name out of the mean;
# without it pandas >= 2.0 raises a TypeError instead of dropping them.
aggregated = (
    df[df["model_name"] == "MLP"]
    .groupby(["embed_model", "vector_name", "dataset"])
    .mean(numeric_only=True)
)

In [5]:
# test_f1 of the RDGCN embedding model, indexed by (vector_name, dataset).
one_data_series = aggregated.loc["RDGCN"]["test_f1"]

# Block design for the rank tests: one row per dataset (block), one column per
# feature input (treatment), in the fixed order listed here.
feature_inputs = [
    'OnlyEmb',
    'OnlySim',
    'OnlySimNormalized',
    'SimAndEmb',
    'SimAndEmbNormalized',
]
one_data_block_design = pd.DataFrame(
    {name: one_data_series[name] for name in feature_inputs}
)

In [6]:
# Colour every row of the block design with a cubehelix gradient; axis=1 makes
# the shading relative to the other values in the same row (dataset), so the
# feature inputs can be compared per dataset.
cm = sns.cubehelix_palette(n_colors=5, as_cmap=True)
stylized = one_data_block_design.style.background_gradient(
    cmap=cm,
    axis=1,
)
stylized

Unnamed: 0_level_0,OnlyEmb,OnlySim,OnlySimNormalized,SimAndEmb,SimAndEmbNormalized
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_W_15K_V1,0.73211,0.779592,0.731568,0.827257,0.79499
D_W_15K_V2,0.810077,0.849,0.794977,0.898289,0.877148
D_Y_15K_V1,0.945451,0.99005,0.967125,0.977117,0.961866
D_Y_15K_V2,0.944652,0.992493,0.977506,0.979986,0.969672
EN_DE_15K_V1,0.938084,0.920715,0.89562,0.960736,0.950634
EN_DE_15K_V2,0.930118,0.945956,0.915341,0.958919,0.950563
EN_FR_15K_V1,0.907002,0.899598,0.850392,0.932527,0.931306
EN_FR_15K_V2,0.920139,0.909774,0.863111,0.933689,0.943442
ScaDS_imdb_tmdb,0.526009,0.996441,0.994903,0.99239,0.979798
ScaDS_imdb_tvdb,0.277778,0.991543,0.996819,0.997877,0.792363


Adding embeddings to the similarity features (SimAndEmb) seems to improve the results on most datasets.
Let's test whether the difference is significant!

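The Friedman test checks whether there is a significant difference in the rank sums of the feature inputs across the datasets. If there is, the Nemenyi post-hoc test shows which pairs of feature inputs differ.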

In [7]:
# Significance level used for the omnibus Friedman test.
alpha = 0.05

# Run the Friedman test on the columns of the block design (one sample per
# feature input) so that it and the Nemenyi post-hoc test below operate on
# exactly the same data.
samples = [one_data_block_design[name] for name in one_data_block_design.columns]
statistic, pvalue = friedmanchisquare(*samples)

# Default to None so the final expression never raises a NameError (or shows a
# stale table from an earlier run) when the Friedman test is not significant.
res = None
if pvalue < alpha:
    print("There is a significant difference in the rank sums: ", pvalue)
    # Pairwise p-values between all feature inputs.
    res = nemenyi(one_data_block_design)
else:
    print("No significant difference in the rank sums: ", pvalue)
res

There is a significant difference in the rank sums:  5.9615749504829854e-05


Unnamed: 0,OnlyEmb,OnlySim,OnlySimNormalized,SimAndEmb,SimAndEmbNormalized
OnlyEmb,-1.0,0.081608,0.9,0.001,0.069613
OnlySim,0.081608,-1.0,0.451409,0.273542,0.9
OnlySimNormalized,0.9,0.451409,-1.0,0.002353,0.413055
SimAndEmb,0.001,0.273542,0.002353,-1.0,0.305285
SimAndEmbNormalized,0.069613,0.9,0.413055,0.305285,-1.0


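There is a significant difference (p < 0.05) between SimAndEmb and OnlyEmb (p = 0.001) and between SimAndEmb and OnlySimNormalized (p ≈ 0.002). No other pair of feature inputs differs significantly.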