In [1]:
import pandas as pd

# Load the aggregated hyper-parameter tuning results. The "method" column is
# renamed to "model", which is the column name the later cells group by.
df = (
    pd.read_csv("../aggregated_results/hp_tune_results.csv")
    .rename(columns={"method": "model"})
)

## Top 5 best overall hyper-parameter settings for each model

In [2]:
# Average every numeric column over all rows sharing a (model, settings) pair.
summary_df = df.groupby(["model", "settings"], as_index=False).mean(numeric_only=True)

# Per model: rank the settings by mean validation log2pr (best first) and
# print the five highest-ranked rows.
shown_columns = ["settings", "val_log2pr", "test_log2pr"]
for model, grouped in summary_df.groupby("model"):
    ranked = grouped[shown_columns].sort_values("val_log2pr", ascending=False)
    print(f"{model}\n{'='*80}")
    print(ranked.head(5))
    print(f"{'-'*80}\n")

GAT
                                     settings  val_log2pr  test_log2pr
15   hidden-channels=16_num-layers=3_lr=0.001    0.345954     0.344518
20   hidden-channels=16_num-layers=4_lr=0.001    0.340820     0.346242
21   hidden-channels=16_num-layers=4_lr=0.005    0.331128     0.339527
30   hidden-channels=32_num-layers=3_lr=0.001    0.329143     0.333958
11  hidden-channels=128_num-layers=5_lr=0.005    0.328574     0.336260
--------------------------------------------------------------------------------

GCN
                                     settings  val_log2pr  test_log2pr
72   hidden-channels=128_num-layers=5_lr=0.01    1.183741     1.050945
67   hidden-channels=128_num-layers=4_lr=0.01    1.118292     1.025969
63   hidden-channels=128_num-layers=3_lr=0.05    1.081920     0.978773
98    hidden-channels=32_num-layers=4_lr=0.05    1.063569     0.956344
71  hidden-channels=128_num-layers=5_lr=0.005    1.060713     0.968974
----------------------------------------------------------

## Dataset-specific optimal hyper-parameter settings for each model

In [3]:
# Average every numeric column over all rows sharing a (dataset, model, settings) triple.
summary_df = df.groupby(["dataset", "model", "settings"], as_index=False).mean(numeric_only=True)
# For each (dataset, model) pair, find the row whose mean validation log2pr is highest.
# idxmax returns index *labels*, so the rows are selected with .loc rather than the
# positional .iloc; the two only coincide while summary_df keeps its default 0..n-1 index.
idxmax = summary_df.groupby(["dataset", "model"])["val_log2pr"].idxmax().to_numpy()
# Show one row per (dataset, model); the averaged runid column (presumably a run
# identifier, not a metric) is dropped from the displayed table.
summary_df.loc[idxmax].set_index(["dataset", "model"]).drop(columns="runid")

Unnamed: 0_level_0,Unnamed: 1_level_0,settings,train_log2pr,val_log2pr,test_log2pr,train_auroc,val_auroc,test_auroc
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
biogrid_disgenet,GAT,hidden-channels=32_num-layers=4_lr=0.005,0.071413,0.272931,0.268453,0.501352,0.50363,0.502826
biogrid_disgenet,GCN,hidden-channels=128_num-layers=4_lr=0.01,1.130235,0.66079,0.447006,0.65139,0.560282,0.522089
biogrid_disgenet,GIN,hidden-channels=128_num-layers=5_lr=0.1,1.941289,0.834584,0.711348,0.721319,0.581826,0.544529
biogrid_disgenet,GraphSAGE,hidden-channels=16_num-layers=5_lr=0.05,1.340549,0.628621,0.495213,0.648761,0.553994,0.523308
biogrid_disgenet,LabelProp,beta=0.999,4.873875,0.890676,0.818326,1.0,0.572271,0.554035
biogrid_disgenet,N2V-LogReg,dim=128_window-size=16_walk-length=160,2.976724,0.983677,0.772856,0.828483,0.591858,0.560702
biogrid_disgenet,N2V-SVM,dim=64_window-size=16_walk-length=160,2.389784,0.96374,0.71926,0.768072,0.584804,0.554266
biogrid_gobp,GAT,hidden-channels=16_num-layers=4_lr=0.001,0.112141,0.421837,0.344755,0.489818,0.50085,0.498059
biogrid_gobp,GCN,hidden-channels=128_num-layers=4_lr=0.05,2.06054,1.2115,0.808511,0.74313,0.626863,0.571582
biogrid_gobp,GIN,hidden-channels=128_num-layers=5_lr=0.1,3.603953,1.643371,1.055098,0.863974,0.667521,0.598501


## Optimal hyper-parameter settings for each dataset and model

In [4]:
# Average every numeric column over all rows sharing a (model, dataset, settings) triple.
summary_df = (
    df.groupby(["model", "dataset", "settings"], as_index=False)
    .mean(numeric_only=True)
)

# Per (dataset, model) pair: print the five settings with the highest mean
# validation log2pr, best first.
for (dataset, model), grouped in summary_df.groupby(["dataset", "model"]):
    top_settings = (
        grouped[["settings", "val_log2pr", "test_log2pr"]]
        .sort_values("val_log2pr", ascending=False)
        .head(5)
    )
    print(f"{dataset} {model}\n{'='*80}")
    print(top_settings)
    print(f"{'-'*80}\n")

biogrid_disgenet GAT
                                    settings  val_log2pr  test_log2pr
36  hidden-channels=32_num-layers=4_lr=0.005    0.272931     0.268453
15  hidden-channels=16_num-layers=3_lr=0.001    0.272798     0.262367
21  hidden-channels=16_num-layers=4_lr=0.005    0.265389     0.270530
40  hidden-channels=32_num-layers=5_lr=0.001    0.264030     0.242110
26  hidden-channels=16_num-layers=5_lr=0.005    0.262111     0.287674
--------------------------------------------------------------------------------

biogrid_disgenet GCN
                                      settings  val_log2pr  test_log2pr
247   hidden-channels=128_num-layers=4_lr=0.01    0.660790     0.447006
252   hidden-channels=128_num-layers=5_lr=0.01    0.646840     0.466744
253   hidden-channels=128_num-layers=5_lr=0.05    0.646377     0.491894
291   hidden-channels=64_num-layers=4_lr=0.005    0.626296     0.422513
246  hidden-channels=128_num-layers=4_lr=0.005    0.625649     0.419295
------------------------

## Label propagation favors large restart

There is no significant difference in performance for any value of beta between 0.7 and 0.999, so we choose beta = 0.9 as the optimal value.

In [5]:
# Keep only the LabelProp rows and average every numeric column per (dataset, settings).
summary_df = (
    df.query("model == 'LabelProp'")
    .groupby(["dataset", "settings"], as_index=False)
    .mean(numeric_only=True)
)
metric_columns = ["settings", "train_log2pr", "val_log2pr", "test_log2pr"]

# Overall view: mean of the per-dataset means for each setting, best validation score first.
print(f"Overall\n{'='*80}")
overall = summary_df.groupby("settings", as_index=False).mean(numeric_only=True)
print(overall[metric_columns].sort_values("val_log2pr", ascending=False))
print(f"{'-'*80}\n")

# Per-dataset view: the same ranking, restricted to one dataset at a time.
for dataset, grouped in summary_df.groupby("dataset"):
    ranked = grouped[metric_columns].sort_values("val_log2pr", ascending=False)
    print(f"{dataset}\n{'='*80}")
    print(ranked)
    print(f"{'-'*80}\n")

Overall
      settings  train_log2pr  val_log2pr  test_log2pr
10  beta=0.999      5.339612    2.388809     2.222950
9    beta=0.99      5.339612    2.388283     2.222421
8    beta=0.98      5.339612    2.387413     2.220557
7    beta=0.95      5.339612    2.384281     2.217614
6     beta=0.9      5.339612    2.377370     2.211540
5    beta=0.85      5.339612    2.369240     2.203762
4     beta=0.8      5.339612    2.361857     2.196095
3     beta=0.7      5.339612    2.337958     2.174765
2     beta=0.5      5.339457    2.256418     2.101043
1     beta=0.3      5.336457    2.070177     1.941257
0     beta=0.1      5.237425    1.494187     1.446290
--------------------------------------------------------------------------------

biogrid_disgenet
      settings  train_log2pr  val_log2pr  test_log2pr
10  beta=0.999      4.873875    0.890676     0.818326
9    beta=0.99      4.873875    0.890540     0.818069
8    beta=0.98      4.873875    0.889670     0.813908
7    beta=0.95      4.873875 