In [1]:
import os
import json
import pandas as pd
import torch
import numpy as np 
from parse import *

In [2]:
# --- Notebook configuration -------------------------------------------------
# Knowledge-graph datasets whose result directories are scanned.
datasets = [
    'FB15k-237',
    'NELL995',
]
# Only run directories whose name contains one of these are considered.
model_names = [
    'SheafE_Translational',
    'SheafE_Multisection',
]
# Template for a dataset's results directory; formatted with the dataset name.
results_loc = '../data/{}'
# Columns read from each run's complex-query CSV.
complex_metrics = ['1', '10', 'mrr']
# Hyper-parameters that identify one configuration when averaging over runs.
groupby_cols = ['model', 'embdim', 'esdim', 'sec', 'orthogonal']
# Query-structure names kept in the exported tables.
resnames = ['1p', '2p', '3p', '2i', '3i', 'ip', 'pi']
# 'orthogonal' values retained in the exported tables.
orthogonals = [0, 0.1, 0.01]

In [3]:
def infer_params_from_filename(fname):
    """Recover a run's hyper-parameters from its results directory name.

    The name is matched against four naming schemes, tried in order: with
    both ``orthogonal`` and ``seed`` fields, with ``orthogonal`` only, with
    ``seed`` only, and with neither.  The first scheme that matches wins.

    Args:
        fname: Name of a single run directory.

    Returns:
        A dict mapping field name to the value extracted by ``parse`` for
        the first matching scheme, or ``None`` when no scheme matches (in
        which case ``ignoring <fname>`` is printed).
    """
    all_fields = [
        'class', 'model', 'embdim', 'esdim', 'sec', 'norm', 'lbda',
        'orthogonal', 'epochs', 'loss', 'sampler', 'seed', 'date', 'time',
    ]

    def fields_without(*dropped):
        # Field order is preserved; only the named fields are removed.
        return [field for field in all_fields if field not in dropped]

    # (pattern, field names) pairs; each pattern has exactly as many
    # placeholders as its list of field names.
    schemes = (
        ('{}_{}_{:d}embdim_{:d}esdim_{:d}sec_{:d}norm_{:f}lbda_{:f}orthogonal_{:d}epochs_{}loss_{}_{}seed_{:d}-{:d}',
         fields_without()),
        ('{}_{}_{:d}embdim_{:d}esdim_{:d}sec_{:d}norm_{:f}lbda_{:f}orthogonal_{:d}epochs_{}loss_{}_{:d}-{:d}',
         fields_without('seed')),
        ('{}_{}_{:d}embdim_{:d}esdim_{:d}sec_{:d}norm_{:f}lbda_{:d}epochs_{}loss_{}_{}seed_{:d}-{:d}',
         fields_without('orthogonal')),
        ('{}_{}_{:d}embdim_{:d}esdim_{:d}sec_{:d}norm_{:f}lbda_{:d}epochs_{}loss_{}_{:d}-{:d}',
         fields_without('orthogonal', 'seed')),
    )

    for template, field_names in schemes:
        match = parse(template, fname)
        if match is None:
            continue
        return {field_names[pos]: match[pos] for pos in range(len(field_names))}

    # No scheme matched: report the name and signal "skip" to the caller.
    print('ignoring', fname)
    return None
    

In [4]:
# Build one record per run directory that (a) belongs to a known model,
# (b) has a parseable name, and (c) has both a complex-query CSV and a
# results.json file.
results = []
complex_results = []
idx = 0
for dataset in datasets:
    dataset_dirname = results_loc.format(dataset)
    dataset_complex_dirname = os.path.join(dataset_dirname, 'complex')
    subdirs = [entry.name for entry in os.scandir(dataset_dirname) if entry.is_dir()]
    for subdir in subdirs:
        for model_name in model_names:
            if model_name not in subdir:
                continue

            complex_fname = os.path.join(dataset_complex_dirname, subdir + '.csv')
            # Called for every candidate directory so unparseable names are reported.
            params = infer_params_from_filename(subdir)
            pk_result_fname = os.path.join(dataset_dirname, subdir, 'results.json')

            if params is None:
                continue
            if not (os.path.exists(complex_fname) and os.path.exists(pk_result_fname)):
                continue

            with open(pk_result_fname) as json_file:
                pkr = json.load(json_file)

            # One dict per metric column, keyed by the CSV's index labels.
            cr = pd.read_csv(complex_fname, index_col=0)[complex_metrics].to_dict()

            try:
                pk_mrr = pkr['metrics']['inverse_harmonic_mean_rank']['both']['realistic']
                pk_10 = pkr['metrics']['hits_at_k']['both']['realistic']['10']
            except KeyError:
                # old pykeen results format
                pk_mrr = pkr['metrics']['mean_reciprocal_rank']['both']['avg']
                pk_10 = pkr['metrics']['hits_at_k']['both']['avg']['10']

            record = {
                'id': idx,
                'dataset': dataset,
                'gc_mrr': pk_mrr,
                'gc_10': pk_10,
                **params,
                **cr,
            }
            results.append(record)
            idx += 1

# Runs whose name carries no 'orthogonal' / 'seed' field get the defaults 0 / 1234.
df = pd.DataFrame(results).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'),
    orthogonal=lambda frame: frame['orthogonal'].fillna(0),
    seed=lambda frame: frame['seed'].fillna(1234),
)

ignoring SheafE_Multisection_64embdim_2esdim_1sec_2norm_1000epochs_SoftplusLossloss_20210208-1109
ignoring SheafE_Multisection_64embdim_64esdim_64sec_1norm_1000epochs_SoftplusLossloss_20210304-1455
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_1epochs_SoftplusLossloss_neighborhood_20210505-1539
ignoring SheafE_Multisection_64embdim_8esdim_1sec_2norm_1000epochs_SoftplusLossloss_20210208-1109
ignoring SheafE_Translational_64embdim_64esdim_64sec_2norm_1000epochs_SoftplusLossloss_20210301-1444
ignoring SheafE_Multisection_16embdim_16esdim_1sec_2norm_5epochs_SoftplusLossloss_neighborhood_20210504-2133
ignoring SheafE_Multisection_25sec_1000epochs_64embdim_SoftplusLossloss_7seed_20210203-1742
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_1epochs_SoftplusLossloss_neighborhood_20210505-1544
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_100epochs_SoftplusLossloss_neighborhood_20210501-2118
ignoring SheafE_Translational_Identity_Complex_Queries_64embdim_64esdim_1

ignoring SheafE_Multisection_64embdim_64esdim_64sec_2norm_1000epochs_SoftplusLossloss_20210301-2201
ignoring SheafE_Multisection_16embdim_16esdim_1sec_2norm_1epochs_SoftplusLossloss_neighborhood_20210504-1848
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_50epochs_SoftplusLossloss_neighborhood_20210505-1055
ignoring SheafE_Multisection_64embdim_16esdim_1sec_2norm_1000epochs_SoftplusLossloss_20210208-1057
ignoring SheafE_Multisection_16embdim_16esdim_1sec_2norm_500epochs_SoftplusLossloss_neighborhood_20210506-1322
ignoring SheafE_Multisection_64embdim_64esdim_88sec_2norm_1000epochs_SoftplusLossloss_20210301-1108
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_500epochs_SoftplusLossloss_neighborhood_20210506-1414
ignoring SheafE_Multisection_64embdim_64esdim_1sec_2norm_1epochs_SoftplusLossloss_neighborhood_20210503-1033
ignoring SheafE_Multisection_64embdim_64esdim_16sec_2norm_1000epochs_SoftplusLossloss_20210228-1913
ignoring SheafE_Multisection_64embdim_64esdim_1sec_

In [5]:
# Restrict the analysis to 250-epoch runs trained with MarginRankingLoss.
is_250_epochs = df['epochs'] == 250
is_margin_ranking = df['loss'] == 'MarginRankingLoss'
tdf = df[is_250_epochs & is_margin_ranking]
tdf

Unnamed: 0,id,dataset,gc_mrr,gc_10,class,model,embdim,esdim,sec,norm,...,epochs,loss,sampler,date,time,1,10,mrr,orthogonal,seed
2,2,FB15k-237,0.083767,0.119581,SheafE,Multisection,64,64,16,2,...,250,MarginRankingLoss,,2021-05-26,818,"{'1p': 0.014874080438167856, '2p': 8.717724710...","{'1p': 0.03534917533736199, '2p': 0.0006964905...","{'1p': 0.022465451246184414, '2p': 0.000738137...",1.00,1234
5,5,FB15k-237,0.101673,0.258636,SheafE,Translational,64,64,64,2,...,250,MarginRankingLoss,,2021-05-27,2103,"{'1p': 6.195812456887471e-05, '2p': 0.00788861...","{'1p': 0.3079979677735141, '2p': 0.09891649810...","{'1p': 0.10815869263452142, '2p': 0.0408387581...",0.00,11
7,7,FB15k-237,0.211377,0.395709,SheafE,Translational,64,64,1,2,...,250,MarginRankingLoss,,2021-05-27,1849,"{'1p': 0.048034068707429614, '2p': 0.006772002...","{'1p': 0.21378444355408324, '2p': 0.0356017038...","{'1p': 0.101526761133875, '2p': 0.018825811310...",0.00,11
8,8,FB15k-237,0.091088,0.224092,SheafE,Translational,64,64,16,2,...,250,MarginRankingLoss,,2021-05-27,1456,"{'1p': 3.304433310339985e-05, '2p': 0.00216181...","{'1p': 0.09521311529580874, '2p': 0.0173046835...","{'1p': 0.03208190681743316, '2p': 0.0084451183...",0.01,11
11,11,FB15k-237,0.207479,0.346462,SheafE,Multisection,64,32,1,2,...,250,MarginRankingLoss,,2021-05-26,2245,"{'1p': 0.04622489147001847, '2p': 9.7378839856...","{'1p': 0.19835273999479552, '2p': 0.0008615708...","{'1p': 0.09508062333229382, '2p': 0.0008007281...",0.00,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,302,NELL995,0.000189,0.000105,SheafE,Multisection,32,32,1,2,...,250,MarginRankingLoss,,2021-05-27,909,"{'1p': 0.09982503794066756, '2p': 1.2866590610...","{'1p': 0.3639694154720592, '2p': 0.00015225465...","{'1p': 0.1873675777058704, '2p': 0.00013942063...",0.00,33
303,303,NELL995,0.101661,0.189896,SheafE,Multisection,32,16,1,2,...,250,MarginRankingLoss,,2021-05-26,905,"{'1p': 3.866564847125693e-05, '2p': 8.57772707...","{'1p': 0.00020299465447409882, '2p': 0.0001372...","{'1p': 0.00020534029265171244, '2p': 0.0001668...",0.00,1234
304,304,NELL995,0.071934,0.109378,SheafE,Multisection,64,64,16,2,...,250,MarginRankingLoss,,2021-05-26,1033,"{'1p': 2.899923635344269e-05, '2p': 1.07221588...","{'1p': 0.00018366183023847042, '2p': 0.0001458...","{'1p': 0.00020446760381509496, '2p': 0.0001811...",1.00,1234
305,305,NELL995,0.000709,0.000701,SheafE,Translational,32,32,16,2,...,250,MarginRankingLoss,,2021-05-27,302,"{'1p': 0.00021266106659191308, '2p': 0.0006004...","{'1p': 0.00039632289683038345, '2p': 0.0008577...","{'1p': 0.0004097018271520152, '2p': 0.00080331...",10.00,11


In [6]:
# FB15k-237 runs only; the index is reset so a later positional join lines up.
is_fb = tdf['dataset'] == 'FB15k-237'
fb_df = tdf[is_fb].reset_index(drop=True)

In [7]:
# NELL995 runs only, keeping those whose '1p' entry in the '10' metric column
# exceeds 0.001 (presumably to discard degenerate runs -- confirm the intent).
def _has_nontrivial_1p(per_query):
    return per_query['1p'] > 0.001

is_nell = tdf['dataset'] == 'NELL995'
is_nontrivial = tdf['10'].apply(_has_nontrivial_1p)
nell_df = tdf[is_nell & is_nontrivial].reset_index(drop=True)

In [8]:
# Export one summary table per complex-query metric for FB15k-237: the mean
# over runs of each query structure, in percent, for embdim == 64 and the
# selected 'orthogonal' values.
embdim = 64
# to_excel does not create missing directories, so make sure the target exists.
os.makedirs('FB15k-237', exist_ok=True)
for complex_metric in complex_metrics:
    # Each cell of the metric column is a dict; expand it into one column per
    # key.  fb_df has a fresh 0..n-1 index, so the positional join aligns.
    joined = fb_df.join(pd.DataFrame(fb_df[complex_metric].values.tolist()))
    grouped = joined.groupby(groupby_cols)
    # Aggregate only the query-structure columns: a bare grouped.mean() raises
    # TypeError on pandas >= 2.0 because the frame also holds object columns.
    sv = grouped[resnames].mean()
    sv = sv[sv.index.get_level_values('embdim') == embdim].droplevel('embdim',axis=0)
    sv = sv[sv.index.get_level_values('orthogonal').isin(orthogonals)]
    # Scale to percent first and round afterwards, so the exported values carry
    # no floating-point noise; rebinding also avoids writing into a slice.
    sv = (100 * sv).round(2)
    sv.to_excel(f'FB15k-237/{complex_metric}.xlsx')

In [9]:
# Display the table left in `sv` by the preceding export loop, i.e. the one
# built in its final iteration (the last entry of complex_metrics).
sv

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,1p,2p,3p,2i,3i,ip,pi
model,esdim,sec,orthogonal,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Multisection,16,1,0.0,8.62,0.08,0.09,0.57,0.38,0.05,0.05
Multisection,32,1,0.0,9.44,0.08,0.08,0.92,0.58,0.06,0.06
Multisection,64,1,0.0,10.16,0.08,0.08,1.16,0.69,0.06,0.06
Multisection,64,16,0.0,20.28,0.07,0.08,15.95,10.04,0.07,0.06
Multisection,64,16,0.01,16.7,0.07,0.07,10.18,6.86,0.07,0.07
Multisection,64,16,0.1,6.13,0.07,0.07,3.05,2.95,0.07,0.07
Multisection,64,32,0.0,22.89,0.07,0.08,17.05,9.94,0.07,0.07
Multisection,64,32,0.01,9.18,0.07,0.07,3.75,2.93,0.07,0.07
Multisection,64,32,0.1,5.76,0.07,0.07,2.64,2.53,0.07,0.07
Multisection,64,64,0.0,24.7,0.07,0.08,18.74,11.06,0.07,0.07


In [10]:
# Export one summary table per complex-query metric for NELL995: the mean
# over runs of each query structure, in percent, for embdim == 32 and the
# selected 'orthogonal' values.
embdim = 32
# to_excel does not create missing directories, so make sure the target exists.
os.makedirs('NELL995', exist_ok=True)
for complex_metric in complex_metrics:
    # Each cell of the metric column is a dict; expand it into one column per
    # key.  nell_df has a fresh 0..n-1 index, so the positional join aligns.
    joined = nell_df.join(pd.DataFrame(nell_df[complex_metric].values.tolist()))
    grouped = joined.groupby(groupby_cols)
    # Aggregate only the query-structure columns: a bare grouped.mean() raises
    # TypeError on pandas >= 2.0 because the frame also holds object columns.
    sv = grouped[resnames].mean()
    sv = sv[sv.index.get_level_values('embdim') == embdim].droplevel('embdim',axis=0)
    sv = sv[sv.index.get_level_values('orthogonal').isin(orthogonals)]
    # Scale to percent first and round afterwards, so the exported values carry
    # no floating-point noise; rebinding also avoids writing into a slice.
    sv = (100 * sv).round(2)
    sv.to_excel(f'NELL995/{complex_metric}.xlsx')

In [11]:
# Per-configuration means of every numeric column for the most recently built
# `grouped`.  numeric_only=True is required on pandas >= 2.0, where the default
# no longer drops the non-numeric columns silently and raises TypeError instead.
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id,gc_mrr,gc_10,norm,lbda,epochs,time,1p,2p,3p,2i,3i,ip,pi
model,embdim,esdim,sec,orthogonal,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Multisection,32,8,1,0.0,274.25,0.000176,0.000114,2.0,0.0,250.0,1142.5,0.148962,0.000116,0.00058,0.00146,0.001058,8.4e-05,3.6e-05
Multisection,32,16,1,0.0,255.0,0.000194,0.000166,2.0,0.0,250.0,1088.0,0.172117,0.000105,0.000161,0.003329,0.002775,0.000115,3.7e-05
Multisection,32,32,1,0.0,251.75,0.000219,0.000223,2.0,0.0,250.0,1050.5,0.187777,0.000141,0.000192,0.005826,0.004973,0.000174,6.1e-05
Multisection,32,32,8,0.0,280.0,0.000294,0.000327,2.0,0.0,250.0,630.333333,0.363871,0.000203,0.000225,0.053027,0.048521,0.000187,0.000116
Multisection,32,32,8,0.01,236.0,0.000215,0.000228,2.0,0.0,250.0,433.0,0.060893,0.0002,0.000189,0.013406,0.015432,0.000179,0.000133
Multisection,32,32,8,0.1,238.0,0.000196,0.000158,2.0,0.0,250.0,613.0,0.03006,0.000189,0.000197,0.005734,0.006659,0.000186,0.000164
Multisection,32,32,8,1.0,271.0,0.000173,0.000175,2.0,0.0,250.0,751.0,0.029665,0.000201,0.000185,0.004819,0.006195,0.000182,0.000179
Multisection,32,32,8,10.0,229.0,0.000137,7e-05,2.0,0.0,250.0,934.0,0.038304,0.000188,0.000181,0.004939,0.005728,0.000188,0.000179
Multisection,32,32,16,0.0,255.666667,0.000306,0.000345,2.0,0.0,250.0,923.333333,0.384833,0.000203,0.000218,0.054691,0.048447,0.000201,7.6e-05
Multisection,32,32,16,0.01,250.0,0.000182,0.000123,2.0,0.0,250.0,2358.0,0.053038,0.000203,0.000192,0.011444,0.01315,0.000184,0.000126


In [12]:
# Rebuild the percent table from the most recently built `grouped`
# (embdim == 32, selected 'orthogonal' values) and display it.
embdim = 32
# Aggregate only the query-structure columns: a bare grouped.mean() raises
# TypeError on pandas >= 2.0 because the frame also holds object columns.
sv = grouped[resnames].mean()
sv = sv[sv.index.get_level_values('embdim') == embdim].droplevel('embdim',axis=0)
sv = sv[sv.index.get_level_values('orthogonal').isin(orthogonals)]
# Scale to percent first, then round, to avoid floating-point noise and to
# avoid assigning into a filtered slice.
sv = (100 * sv).round(2)
sv

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,1p,2p,3p,2i,3i,ip,pi
model,esdim,sec,orthogonal,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Multisection,8,1,0.0,14.9,0.01,0.06,0.15,0.11,0.01,0.0
Multisection,16,1,0.0,17.21,0.01,0.02,0.33,0.28,0.01,0.0
Multisection,32,1,0.0,18.78,0.01,0.02,0.58,0.5,0.02,0.01
Multisection,32,8,0.0,36.39,0.02,0.02,5.3,4.85,0.02,0.01
Multisection,32,8,0.01,6.09,0.02,0.02,1.34,1.54,0.02,0.01
Multisection,32,8,0.1,3.01,0.02,0.02,0.57,0.67,0.02,0.02
Multisection,32,16,0.0,38.48,0.02,0.02,5.47,4.84,0.02,0.01
Multisection,32,16,0.01,5.3,0.02,0.02,1.14,1.32,0.02,0.01
Multisection,32,16,0.1,2.37,0.02,0.02,0.47,0.55,0.02,0.02
Multisection,32,32,0.0,40.94,0.02,0.02,5.51,5.09,0.02,0.01
