In [1]:
import sys
sys.path.insert(0,'..')
import json

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path


# Log Extraction

We find all log directories that appear both in the data/save and data/log directories. We then extract arguments (args.json) of the run and last epoch macro/micro scores (last row of eval_valid.csv). If the eval_valid.csv file is missing we discard the run altogether. 

The code below assembles a pandas data frame of all run IDs that are names of both log and save directories.

In [2]:
LOGS = Path('../data/log')
SAVES = Path('../data/save')
MAXROWS = 300
TOPROWS = 10
pd.set_option('display.max_rows', MAXROWS)

In [3]:
logLabels = {x.name : x for x in LOGS.iterdir() if x.is_dir() and 'train' in str(x.name)}
saveLabels = {x.name : x for x in SAVES.iterdir() if x.is_dir()}
# reconcile logRun and saveRun Labels to match
logRunLabels = {x.name : L for L,D in logLabels.items() for x in D.iterdir() if x.is_dir()}
saveRunLabelsAll = {x.name : L for L,D in saveLabels.items() for x in D.iterdir() if x.is_dir() if x.name in logRunLabels.keys()}

validKeys = set(logRunLabels.keys()).intersection(set(saveRunLabelsAll.keys()))
logRunLabels = {k: v for k,v in logRunLabels.items() if k in validKeys}
saveRunLabels = {k: v for k,v in saveRunLabelsAll.items() if k in validKeys}

logRunPaths = {x.name :  x for L,D in logLabels.items() for x in D.iterdir() if x.is_dir() and x.name in validKeys }
saveRunPaths =  {x.name :  x for L,D in saveLabels.items() for x in D.iterdir() if x.is_dir() if x.name in validKeys}
runDF = pd.DataFrame([logRunLabels.keys(),logRunLabels.values(),logRunPaths.values()],columns=logRunLabels.keys(),index=['run','label','logPath']).T
#print(logRunLabels)
runDF = runDF.join(pd.DataFrame([saveRunPaths.keys(),saveRunPaths.values()],columns=logRunLabels.keys(),index=['runSave','savePath']).T,how='left').drop(columns = ['runSave'])
logLabels


{'ade_bert_train': PosixPath('../data/log/ade_bert_train'),
 'scierc_bert_train': PosixPath('../data/log/scierc_bert_train'),
 'scierc_rob_train': PosixPath('../data/log/scierc_rob_train'),
 'conll04_bert_train_lvl': PosixPath('../data/log/conll04_bert_train_lvl'),
 'conll04_rob_train_lvl': PosixPath('../data/log/conll04_rob_train_lvl'),
 'conll04_bert_train': PosixPath('../data/log/conll04_bert_train'),
 'docRed_rob_train_lvl': PosixPath('../data/log/docRed_rob_train_lvl'),
 'scierc_elec_train': PosixPath('../data/log/scierc_elec_train'),
 'conll04_elec_train': PosixPath('../data/log/conll04_elec_train'),
 'conll04_rob_train': PosixPath('../data/log/conll04_rob_train'),
 'ade_elec_train': PosixPath('../data/log/ade_elec_train'),
 'docRed_bert_train_lvl': PosixPath('../data/log/docRed_bert_train_lvl'),
 'ade_rob_train': PosixPath('../data/log/ade_rob_train')}

In [4]:
logEvalLabels = {x.name : x for x in LOGS.iterdir() if x.is_dir() and 'test' in str(x.name)}
logEvalRunLabels = {x.name : L for L,D in logEvalLabels.items() for x in D.iterdir() if x.is_dir()}
logEvalRunPaths = {x.name :  x for L,D in logEvalLabels.items() for x in D.iterdir() if x.is_dir() }
runEvalDF = pd.DataFrame([logEvalRunLabels.keys(),logEvalRunLabels.values(),logEvalRunPaths.values()],columns=logEvalRunLabels.keys(),index=['run','label','logPath']).T
runEvalDF

Unnamed: 0,run,label,logPath
2022-03-26_13.17.27.579408,2022-03-26_13.17.27.579408,conll04_bert_test,../data/log/conll04_bert_test/2022-03-26_13.17...
2022-03-26_13.09.08.704869,2022-03-26_13.09.08.704869,conll04_bert_test,../data/log/conll04_bert_test/2022-03-26_13.09...
2022-03-26_13.15.24.855735,2022-03-26_13.15.24.855735,conll04_bert_test,../data/log/conll04_bert_test/2022-03-26_13.15...
2022-03-26_13.18.15.750252,2022-03-26_13.18.15.750252,conll04_bert_test,../data/log/conll04_bert_test/2022-03-26_13.18...
2022-03-26_14.11.45.743272,2022-03-26_14.11.45.743272,conll04_rob_test,../data/log/conll04_rob_test/2022-03-26_14.11....
2022-03-27_14.52.01.153096,2022-03-27_14.52.01.153096,conll04_rob_test,../data/log/conll04_rob_test/2022-03-27_14.52....
2022-03-26_14.14.40.366734,2022-03-26_14.14.40.366734,conll04_rob_test,../data/log/conll04_rob_test/2022-03-26_14.14....
2022-03-26_14.16.29.477572,2022-03-26_14.16.29.477572,conll04_rob_test,../data/log/conll04_rob_test/2022-03-26_14.16....
2022-03-31_10.04.38.767961,2022-03-31_10.04.38.767961,conll04_rob_test,../data/log/conll04_rob_test/2022-03-31_10.04....
2022-03-26_14.15.40.228845,2022-03-26_14.15.40.228845,conll04_rob_test,../data/log/conll04_rob_test/2022-03-26_14.15....


For each of the log directories selected above, we extract the arguments (`args.json`) and the eval_valid.csv. We discard directories that don't contain eval_valid.csv (indicating incomplete runs). We create a data frame with all arguments, run info from the previous data frame and the scores of the last run. There is one row for each Run-ID. 

We show some of the columns below

In [5]:
argList= []
for run in runDF.itertuples():
    with open(run.logPath.joinpath('args.json')) as A:
        D = json.load(A)
    D['label']=run.label
    D['logPath'] = run.logPath
    D['savePath'] = run.savePath
    D['runID'] = run.run
    
    evPath = run.logPath.joinpath('eval_valid.csv')
    if evPath.exists():
        try:
            D.update(pd.read_csv(evPath,sep=';').iloc[-1].to_dict())
            argList.append(D)
        except:
            print('exception ==>',evPath)
#data/log/scierc_rob_train/2022-03-22_16.49.15.916860/eval_valid.csv
argDF = pd.DataFrame.from_dict(argList).drop(columns=['store_predictions',	'store_examples','tokenizer_path']).reset_index()
argDF[['label','runID','ner_f1_macro','rel_f1_macro','rel_nec_f1_macro','ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro','train_batch_size', 'epochs', 'neg_entity_count', 'neg_relation_count', 'lr', 'weight_decay',  'lowercase', 'model_path', 'rel_filter_threshold', 'prop_drop']]
display(argDF)


exception ==> ../data/log/scierc_bert_train/2022-03-31_09.44.48.114125/eval_valid.csv
exception ==> ../data/log/scierc_bert_train/2022-03-31_09.49.08.704424/eval_valid.csv
exception ==> ../data/log/scierc_bert_train/2022-03-31_10.03.55.903883/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.30.40.480713/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.31.33.581041/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.26.09.634505/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.41.47.528621/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.23.59.029523/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.36.02.704537/eval_valid.csv
exception ==> ../data/log/conll04_rob_train_lvl/2022-03-31_10.37.49.452884/eval_valid.csv
exception ==> ../data/log/conll04_rob_train/2022-04-04_17.18.38.033391/eval_valid.csv
exception ==> ../data/log/

Unnamed: 0,index,train_path,valid_path,save_path,init_eval,save_optimizer,train_log_iter,final_eval,train_batch_size,epochs,...,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,iteration,global_iteration,output_hidden_states,hidden_state_from_layer,f_score_beta
0,0,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,81.199707,79.172611,83.333333,81.199707,20.0,0.0,19220.0,,,
1,1,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,81.049563,78.753541,83.483483,81.049563,20.0,0.0,19220.0,,,
2,2,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,82.753623,79.971989,85.735736,82.753623,20.0,0.0,19220.0,,,
3,3,data/datasets/ade/ade_split_7_train.json,data/datasets/ade/ade_split_7_test.json,data/save/,False,False,100,False,4,20,...,80.433311,76.843467,84.375000,80.433311,20.0,0.0,19220.0,,,
4,4,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,83.819242,81.444759,86.336336,83.819242,20.0,0.0,19220.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654,654,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,75.609756,72.390110,79.129129,75.609756,20.0,0.0,19220.0,,,
655,655,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,81.129272,80.294118,81.981982,81.129272,20.0,0.0,19220.0,,,
656,656,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,81.355932,79.884226,82.882883,81.355932,20.0,0.0,19220.0,,,
657,657,data/datasets/ade/ade_split_0_train.json,data/datasets/ade/ade_split_0_test.json,data/save/,False,False,100,False,4,20,...,81.001473,79.479769,82.582583,81.001473,20.0,0.0,19220.0,,,


In [6]:
def best_runs(df=None,groupingLabel = 'label', 
    criteria = ['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro'],
    topN=3):
    '''
    Function to extract best runs from a pandas dataframe of all runs 
    found in the standard directories.
    Argumnets:
        df: Pandas dataframe with results and arguments of each run
        groupingLable (default is 'label'): The column name by which to partition the data frame
            such partitioning will produce different best results for each groupingLable
        criteria: list of column headings denoting metrics which will determine the best runs
        topN: The numner of best N experiments to return per criterion per groupLabel
    '''
    res = pd.DataFrame()
    cols = ['index','criterion','label',	'runID',	'ner_f1_macro',	'rel_f1_macro',	'rel_nec_f1_macro',	'ner_f1_micro',	'rel_f1_micro',	'rel_nec_f1_micro','lr','weight_decay']

    for criterion in criteria:
        DF1=df.groupby('label').apply(lambda x : x.sort_values(by = criterion, ascending = False).head(topN).reset_index(drop = True))
        DF1['criterion']=criterion
        DF1=DF1.drop(columns=['config','types_path','train_path','valid_path','global_iteration','ner_prec_micro','ner_prec_macro','ner_rec_micro', 'ner_rec_macro','rel_prec_micro', 'rel_rec_micro','rel_rec_macro','rel_prec_macro', 'rel_rec_macro', 'rel_nec_prec_micro',
       'rel_nec_rec_micro','logPath','savePath','model_type'])
        #DF1.set_index('criterion', append=True, inplace=True)
        #DF1.reorder_levels([2, 0, 1])
        res=pd.concat([res,DF1],axis=0)
    res.set_index('criterion', append=True, inplace=True)
    res.index.names = ['label', 'seq', 'criterion']
    varCols = [col for col in res.columns if len(set(res[col])) > 1 or col in cols]
    res = res[varCols]

    #  
    return   res.reorder_levels(['criterion','label', 'seq']).drop(columns=['label'])

In [7]:
bestN = best_runs(argDF,
    criteria=['ner_f1_macro','rel_f1_macro','rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro','rel_nec_f1_micro'],
    topN=5)
df = bestN.reset_index()[['runID','label']]
#lookup = [runID : label for runID, label in df.iterrows()}
pd.DataFrame(bestN).to_csv('hyperparm_sel_bestN_train.csv')
bestN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,train_batch_size,epochs,neg_entity_count,neg_relation_count,lr,weight_decay,model_path,eval_batch_size,rel_filter_threshold,...,rel_f1_micro,rel_f1_macro,rel_nec_f1_micro,rel_nec_prec_macro,rel_nec_rec_macro,rel_nec_f1_macro,epoch,output_hidden_states,hidden_state_from_layer,f_score_beta
criterion,label,seq,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ner_f1_macro,ade_bert_train,0,4,4,20,100,100,0.00001,0.02,allenai/scibert_scivocab_cased,1,0.4,...,83.819242,83.819242,83.819242,81.444759,86.336336,83.819242,20.0,,,
ner_f1_macro,ade_bert_train,1,2,4,20,100,100,0.00005,0.01,allenai/scibert_scivocab_cased,1,0.4,...,82.753623,82.753623,82.753623,79.971989,85.735736,82.753623,20.0,,,
ner_f1_macro,ade_bert_train,2,7,4,20,100,100,0.00001,0.02,allenai/scibert_scivocab_cased,1,0.4,...,83.442743,83.442743,83.442743,81.134752,85.885886,83.442743,20.0,,,
ner_f1_macro,ade_bert_train,3,32,4,20,100,100,0.00002,0.02,allenai/scibert_scivocab_cased,1,0.4,...,83.272462,83.272462,83.272462,81.081081,85.585586,83.272462,20.0,,,
ner_f1_macro,ade_bert_train,4,20,4,20,100,100,0.00005,0.00,allenai/scibert_scivocab_cased,1,0.4,...,83.164006,83.164006,83.164006,80.477528,86.036036,83.164006,20.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rel_nec_f1_micro,scierc_rob_train,0,79,4,20,100,100,0.00005,0.20,allenai/biomed_roberta_base,1,0.4,...,46.171429,44.340069,37.714286,38.302292,34.443652,35.433664,20.0,,,
rel_nec_f1_micro,scierc_rob_train,1,82,4,20,100,100,0.00006,0.20,allenai/biomed_roberta_base,1,0.4,...,47.522523,42.825260,37.387387,35.638485,31.356329,32.417352,20.0,,,
rel_nec_f1_micro,scierc_rob_train,2,77,4,20,100,100,0.00005,0.15,allenai/biomed_roberta_base,1,0.4,...,48.081264,45.794171,36.794582,36.507540,32.408808,33.401826,20.0,,,
rel_nec_f1_micro,scierc_rob_train,3,72,4,20,100,100,0.00005,0.10,allenai/biomed_roberta_base,1,0.4,...,46.978335,44.869976,36.488027,34.047894,31.488411,32.329741,20.0,,,


In [8]:
recentSaveRunIDs  = sorted([x.name  for _,D in saveLabels.items() for x in D.iterdir() if x.is_dir() ])[-5:]
bestN_runIDs = set(bestN.runID)
print(recentSaveRunIDs)
saveDirsAll = [x  for _,D in saveLabels.items() for x in D.iterdir() if x.is_dir() ]
saveDirsXXX = [x  for _,D in saveLabels.items() for x in D.iterdir() if x.is_dir() and x.name not in bestN_runIDs  and x.name < recentSaveRunIDs[0]]
saveRidsXXX = sorted([x.name  for x in saveDirsXXX])
print('All save directories:',len(saveDirsAll))
print('Save dirs to remove: ',len(saveDirsXXX))
print('Save Directories to keep:',len(set(saveDirsAll).difference(set(saveDirsXXX))))
print('latest directories:' )
print('maximum deleted ID:', saveRidsXXX[-1] )

['2022-04-06_16.46.06.356213', '2022-04-06_17.09.58.332288', '2022-04-06_17.36.04.102594', '2022-04-06_17.48.40.329911', '2022-04-06_17.59.52.002310']
All save directories: 735
Save dirs to remove:  625
Save Directories to keep: 110
latest directories:
maximum deleted ID: 2022-04-06_02.39.58.676552


In [9]:
sorted(list(set(saveDirsAll).difference(set(saveDirsXXX))),key=lambda x : x.name)[-25:]

[PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_02.30.54.745259'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_04.33.41.768240'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_06.37.09.385758'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_08.40.39.096705'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_10.47.30.574246'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_18.38.54.835873'),
 PosixPath('../data/save/docRed_bert_train_lvl/2022-04-04_18.41.50.524935'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-04_20.29.00.322642'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-04_23.10.02.989790'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-05_01.50.51.744269'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-05_04.27.46.671195'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-05_07.04.03.787270'),
 PosixPath('../data/save/docRed_rob_train_lvl/2022-04-05_09.47.37.728639'),
 Posi

In [10]:
[str(d) for d in sorted(saveDirsXXX,key=lambda x : x.name)]


['../data/save/scierc_bert_train/2022-03-22_10.17.46.145076',
 '../data/save/scierc_bert_train/2022-03-22_10.46.00.854943',
 '../data/save/scierc_rob_train/2022-03-22_13.52.36.286048',
 '../data/save/scierc_rob_train/2022-03-22_14.31.14.577274',
 '../data/save/scierc_rob_train/2022-03-22_16.14.48.128661',
 '../data/save/scierc_rob_train/2022-03-22_16.16.10.286131',
 '../data/save/scierc_rob_train/2022-03-22_16.17.36.028741',
 '../data/save/scierc_rob_train/2022-03-22_16.18.08.564927',
 '../data/save/scierc_rob_train/2022-03-22_16.49.15.916860',
 '../data/save/scierc_rob_train/2022-03-22_17.36.49.251748',
 '../data/save/scierc_rob_train/2022-03-22_17.58.14.707227',
 '../data/save/scierc_elec_train/2022-03-23_09.43.18.960435',
 '../data/save/scierc_elec_train/2022-03-23_11.23.18.939754',
 '../data/save/scierc_rob_train/2022-03-23_18.28.13.297036',
 '../data/save/conll04_bert_train/2022-03-24_10.07.59.138914',
 '../data/save/conll04_bert_train/2022-03-24_10.11.21.546339',
 '../data/save/c

In [11]:
def best_run(df=None,groupingLabel = 'label', 
    maxMetrics = ['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro']):
    '''
    Function to extract best runs from a pandas dataframe of all runs 
    found in the standard directories. 
    '''
    res = pd.DataFrame()
    for metric in maxMetrics:
        idx=df.groupby(by='label')[metric].idxmax()
        df1 = df.loc[idx,['label','runID']+maxMetrics]
        df1['criterion'] = metric
        res = pd.concat([res,df1],axis=0)
    return res

## Best Runs - Training
The best runs are identified below. The label is the one assigned to each run that tracks dataset and model type. The metrics shown are those based on which we make the choice of best run. The column "maximize" indicates which of the list of metrics was used to select the row. 

In [12]:
best = best_run(argDF,maxMetrics=['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro'])
best

Unnamed: 0,label,runID,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro,criterion
4,ade_bert_train,2022-03-27_09.07.07.554048,91.224258,83.819242,83.819242,90.934192,83.819242,83.819242,ner_f1_macro
617,ade_elec_train,2022-03-28_12.03.19.494299,90.838738,81.196013,81.196013,90.461538,81.196013,81.196013,ner_f1_macro
636,ade_rob_train,2022-03-27_22.47.55.820777,91.289223,82.532751,82.532751,90.942529,82.532751,82.532751,ner_f1_macro
161,conll04_bert_train,2022-03-24_17.03.33.337513,85.760877,70.47158,70.47158,88.751406,68.944099,68.944099,ner_f1_macro
86,conll04_bert_train_lvl,2022-04-06_16.29.59.809661,84.451806,72.374453,72.374453,87.719298,71.019108,71.019108,ner_f1_macro
278,conll04_elec_train,2022-03-26_10.39.43.538891,73.74453,62.670404,62.670404,77.066229,61.363636,61.363636,ner_f1_macro
574,conll04_rob_train,2022-03-25_10.31.25.567663,83.998313,67.612002,67.612002,87.148373,67.365269,67.365269,ner_f1_macro
90,conll04_rob_train_lvl,2022-03-31_10.44.39.925787,80.702417,68.092642,67.882115,84.71373,65.853659,65.54878,ner_f1_macro
621,docRed_bert_train_lvl,2022-04-04_18.41.50.524935,99.803744,64.975845,64.975845,99.857737,90.01372,90.01372,ner_f1_macro
217,docRed_rob_train_lvl,2022-04-04_20.29.00.322642,86.264739,22.630432,22.313493,87.547632,59.210526,58.947368,ner_f1_macro


We now look at the columns of this dataframe and identify colummn names that have more than one value. If all the rows have the same value in a column, then the column is not interesting in terms of hyperparameter selection. We then select most significant columns from the original dataframe for the experiments that appear in the best-list.

In [13]:
varCols = [col for col in argDF.columns if len(set(argDF[col])) > 1 ]
lmt = varCols.index('label') # we discard anything on the right of column label (addidional metrics)
#print(varCols)
df = argDF.iloc[list(set(best.index))][['label','runID','ner_f1_macro','rel_f1_macro','rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro']+varCols[:lmt]]\
    .drop(columns=['config','model_type','index','neg_entity_count',	'neg_relation_count','types_path','final_eval','train_path','valid_path'])
df.sort_values(['label','runID'])

Unnamed: 0,label,runID,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro,train_batch_size,epochs,lr,weight_decay,lowercase,model_path,eval_batch_size,rel_filter_threshold,prop_drop
4,ade_bert_train,2022-03-27_09.07.07.554048,91.224258,83.819242,83.819242,90.934192,83.819242,83.819242,4,20,1e-05,0.02,False,allenai/scibert_scivocab_cased,1,0.4,0.1
617,ade_elec_train,2022-03-28_12.03.19.494299,90.838738,81.196013,81.196013,90.461538,81.196013,81.196013,4,20,5e-05,0.02,False,kamalkraj/bioelectra-base-discriminator-pubmed,1,0.4,0.1
603,ade_elec_train,2022-03-28_13.38.42.558541,90.062642,81.231231,81.231231,89.786444,81.231231,81.231231,4,20,5e-05,0.01,False,kamalkraj/bioelectra-base-discriminator-pubmed,1,0.4,0.1
636,ade_rob_train,2022-03-27_22.47.55.820777,91.289223,82.532751,82.532751,90.942529,82.532751,82.532751,4,20,5e-05,0.01,False,allenai/biomed_roberta_base,1,0.4,0.2
120,conll04_bert_train,2022-03-24_16.56.59.416385,85.677381,73.202166,73.202166,88.651685,71.889401,71.889401,4,20,5e-05,0.1,False,bert-base-cased,1,0.4,0.2
161,conll04_bert_train,2022-03-24_17.03.33.337513,85.760877,70.47158,70.47158,88.751406,68.944099,68.944099,4,20,5e-05,0.1,False,bert-base-cased,1,0.5,0.0
194,conll04_bert_train,2022-03-24_19.12.06.248497,83.286697,73.041919,73.041919,87.443439,72.063492,72.063492,4,20,5e-05,0.005,False,bert-base-cased,1,0.5,0.2
144,conll04_bert_train,2022-03-24_19.30.46.210486,83.071676,73.270185,72.950185,86.800895,71.964018,71.664168,4,20,5e-05,0.0,False,bert-base-cased,1,0.4,0.2
86,conll04_bert_train_lvl,2022-04-06_16.29.59.809661,84.451806,72.374453,72.374453,87.719298,71.019108,71.019108,4,20,5e-05,0.1,False,bert-base-cased,1,0.5,0.2
282,conll04_elec_train,2022-03-26_09.53.45.502396,72.407269,63.295298,63.295298,75.737705,61.649783,61.649783,4,20,5e-05,0.05,False,google/electra-base-discriminator,1,0.4,0.2


In [14]:
def model_type(label=None):
    if '_rob_' in label:
        return 'sprob'
    if '_bert_' in label:
        return 'spert'
    if  '_elec' in label:
        return 'spelec'
    if  '_long' in label:
        return 'splong'
    return 'invalidModel'

def dataset_path(label=None):
    if 'conll04' in label:
        return 'data/datasets/conll04/conll04_test.json' 
    if 'ade_' in label:
        return 'data/datasets/ade/ade_split_9_test.json'
    if  'docRed' in label:
        return 'data/datasets/docRED/docRed_test.json'
    if  'scierc' in label:
        return 'data/datasets/scierc/scierc_test.json'
    return 'invalidLabel'

def types_path(label=None):
    if 'conll04' in label:
        return 'data/datasets/conll04/conll04_types.json'
    if 'ade_' in label:
        return 'data/datasets/ade/ade_types.json'
    if  'docRed' in label:
        return 'data/datasets/docRED/docRed_types.json'
    if  'scierc' in label:
        return 'data/datasets/scierc/scierc_types.json'
    return 'invalidLabel'


df = argDF.iloc[list(set(best.index))][['label','runID']]
df['label'] = [lbl.replace('_train','_test') for lbl in df.label]
df['model_type'] = [model_type(lbl) for lbl in df.label]
df['model_path'] =  df['tokenizer_path'] =  [saveRunPaths[id].joinpath('final_model') for id in df.runID]
df['dataset_path'] =   [dataset_path(lbl) for lbl in df.label]
df['types_path']   =   [types_path(lbl) for lbl in df.label]
print(df.shape)
df.sort_values(['label','runID'])

(31, 7)


Unnamed: 0,label,runID,model_type,model_path,tokenizer_path,dataset_path,types_path
4,ade_bert_test,2022-03-27_09.07.07.554048,spert,../data/save/ade_bert_train/2022-03-27_09.07.0...,../data/save/ade_bert_train/2022-03-27_09.07.0...,data/datasets/ade/ade_split_9_test.json,data/datasets/ade/ade_types.json
617,ade_elec_test,2022-03-28_12.03.19.494299,spelec,../data/save/ade_elec_train/2022-03-28_12.03.1...,../data/save/ade_elec_train/2022-03-28_12.03.1...,data/datasets/ade/ade_split_9_test.json,data/datasets/ade/ade_types.json
603,ade_elec_test,2022-03-28_13.38.42.558541,spelec,../data/save/ade_elec_train/2022-03-28_13.38.4...,../data/save/ade_elec_train/2022-03-28_13.38.4...,data/datasets/ade/ade_split_9_test.json,data/datasets/ade/ade_types.json
636,ade_rob_test,2022-03-27_22.47.55.820777,sprob,../data/save/ade_rob_train/2022-03-27_22.47.55...,../data/save/ade_rob_train/2022-03-27_22.47.55...,data/datasets/ade/ade_split_9_test.json,data/datasets/ade/ade_types.json
120,conll04_bert_test,2022-03-24_16.56.59.416385,spert,../data/save/conll04_bert_train/2022-03-24_16....,../data/save/conll04_bert_train/2022-03-24_16....,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json
161,conll04_bert_test,2022-03-24_17.03.33.337513,spert,../data/save/conll04_bert_train/2022-03-24_17....,../data/save/conll04_bert_train/2022-03-24_17....,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json
194,conll04_bert_test,2022-03-24_19.12.06.248497,spert,../data/save/conll04_bert_train/2022-03-24_19....,../data/save/conll04_bert_train/2022-03-24_19....,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json
144,conll04_bert_test,2022-03-24_19.30.46.210486,spert,../data/save/conll04_bert_train/2022-03-24_19....,../data/save/conll04_bert_train/2022-03-24_19....,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json
86,conll04_bert_test_lvl,2022-04-06_16.29.59.809661,spert,../data/save/conll04_bert_train_lvl/2022-04-06...,../data/save/conll04_bert_train_lvl/2022-04-06...,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json
282,conll04_elec_test,2022-03-26_09.53.45.502396,spelec,../data/save/conll04_elec_train/2022-03-26_09....,../data/save/conll04_elec_train/2022-03-26_09....,data/datasets/conll04/conll04_test.json,data/datasets/conll04/conll04_types.json


In [15]:
cmds = []
for i,(label,_, model_type,model_path, tokenizer_path, dataset_path, types_path)in df.iterrows():    
    cmd = 'python spert.py eval --config configs/eval_generic.conf '
    cmd += ' --label %s'% label #label
    cmd += ' --model_type %s'% str(model_type)  
    cmd += ' --model_path %s'% str(model_path)[1:] 
    cmd += ' --tokenizer_path %s'% str(tokenizer_path)[1:]
    cmd += ' --dataset_path %s'% dataset_path 
    cmd += ' --types_path %s'% types_path
    cmds.append(cmd)

len(cmds)


31

In [16]:

with open('../eval_all_bash_stream.sh', 'w') as f:
    for line in cmds:
        f.write(line)
        f.write('\n')

## Best/Mean Runs - Testing

We pick the best weights from each of the selected training runs above. We run a test for each and report the best run.

In [17]:
argEvalList= []
for run in runEvalDF.itertuples():
    with open(run.logPath.joinpath('args.json')) as A:
        D = json.load(A)
    D['label']=run.label
    D['logPath'] = run.logPath
    #D['savePath'] = run.savePath
    D['runID'] = run.run
    
    evPath = run.logPath.joinpath('eval_test.csv')
    if evPath.exists():
        try:
            D.update(pd.read_csv(evPath,sep=';').iloc[-1].to_dict())
            argEvalList.append(D)
        except:
            print('exception ==>',evPath)

argEvalDF = pd.DataFrame.from_dict(argEvalList).reset_index()
argEvalDF[['label','runID','ner_f1_macro','rel_f1_macro','rel_nec_f1_macro','ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro','model_path', 'rel_filter_threshold']]


exception ==> ../data/log/conll04_rob_test/2022-03-27_14.52.01.153096/eval_test.csv
exception ==> ../data/log/conll04_rob_test/2022-03-31_10.04.38.767961/eval_test.csv


Unnamed: 0,label,runID,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro,model_path,rel_filter_threshold
0,conll04_bert_test,2022-03-26_13.17.27.579408,85.789502,73.091352,72.824686,88.642659,72.040302,71.788413,data/models/bert/conll04/2022-03-24_19.12.06.2...,0.5
1,conll04_bert_test,2022-03-26_13.09.08.704869,85.037992,69.378081,69.12167,87.824897,68.064118,67.817509,data/models/bert/conll04/2022-03-24_16.56.59.4...,0.5
2,conll04_bert_test,2022-03-26_13.15.24.855735,85.308104,68.993104,68.733363,88.327206,67.821782,67.574257,data/models/bert/conll04/2022-03-24_17.03.33.3...,0.5
3,conll04_bert_test,2022-03-26_13.18.15.750252,84.703277,70.210056,69.960056,87.885767,68.905473,68.656716,data/models/bert/conll04/2022-03-24_19.30.46.2...,0.5
4,conll04_rob_test,2022-03-26_14.14.40.366734,85.239432,70.327951,69.815883,87.827285,68.948035,68.441065,data/models/roberta/conll04/2022-03-25_05.43.5...,0.5
5,conll04_rob_test,2022-03-26_14.16.29.477572,85.472489,69.16752,69.16752,88.132475,67.478685,67.478685,data/models/roberta/conll04/2022-03-25_15.07.1...,0.5
6,conll04_rob_test,2022-03-26_14.15.40.228845,84.253314,70.866954,70.866954,86.91632,69.444444,69.444444,data/models/roberta/conll04/2022-03-25_06.10.1...,0.5
7,docRed_rob_test_lvl,2022-04-06_10.28.41.079524,86.791883,17.366252,17.282796,87.538242,49.211909,48.78659,data/save/docRed_rob_train_lvl/2022-04-05_04.2...,0.5
8,docRed_rob_test_lvl,2022-04-06_09.52.23.252423,86.928362,19.015714,18.926461,87.533156,50.241056,49.885816,data/save/docRed_rob_train_lvl/2022-04-04_20.2...,0.5
9,docRed_bert_test_lvl,2022-04-01_13.18.25.606821,72.876355,4.426773,4.303077,76.03658,16.670322,16.538715,data/models/bert/docRed/2022-04-01_10.50.12.67...,0.5


### Display Mean of Test Runs by Label

In [18]:
# Averate results of all tested runs:
means= pd.DataFrame(argEvalDF).groupby(by='label').mean(['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro'])
counts= pd.DataFrame(argEvalDF).groupby(by='label').count()
counts=pd.DataFrame(counts['index']).rename(columns={'index':'count'})
means= pd.concat([means[['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro']],counts],axis=1)
display(means)



Unnamed: 0_level_0,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
conll04_bert_test,85.209719,70.418148,70.159944,88.170132,69.207919,68.959224,4
conll04_rob_test,84.988412,70.120808,69.950119,87.62536,68.623721,68.454731,3
docRed_bert_test_lvl,82.156971,22.724061,22.17005,83.794504,44.152314,43.570904,3
docRed_rob_test_lvl,86.860122,18.190983,18.104628,87.535699,49.726482,49.336203,2


In [19]:
filter0 = [x for x in means.index if not x.endswith('lvl')] 
filter1 = [x for x in means.index if  x.endswith('lvl') and not x.startswith('docRed')] 
filter2 = [x for x in means.index if  x.endswith('lvl') and     x.startswith('docRed')] 
cols0 = [c for c in means.columns if  'macro' in c or c == 'count']
cols1 = [c for c in means.columns if  'macro' in c or c == 'count']
cols2 = [c for c in means.columns if  'micro' in c or c == 'count']
res0 = means.loc[filter0][cols0].round(decimals=2)
res1 = means.loc[filter1][cols1].round(decimals=2)
res2 = means.loc[filter2][cols2].round(decimals=2)
display(res0)
display(res1)
display(res2)

res0.to_latex('test_result_avgs_original_datasets.tex')
res1.to_latex('test_result_avgs_level_analysis.tex')
res2.to_latex('test_result_avgs_docRed_dataset.tex')


Unnamed: 0_level_0,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
conll04_bert_test,85.21,70.42,70.16,4
conll04_rob_test,84.99,70.12,69.95,3


Unnamed: 0_level_0,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


Unnamed: 0_level_0,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
docRed_bert_test_lvl,83.79,44.15,43.57,3
docRed_rob_test_lvl,87.54,49.73,49.34,2


### Display best Test Runs by Label

In [20]:
best = best_run(argEvalDF,maxMetrics=['ner_f1_macro','rel_f1_macro',	'rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro'])
#print(best)
varColsTest = [col for col in argEvalDF.columns if len(set(argEvalDF[col])) > 1 or col == 'label']
lmtTest = varColsTest.index('label') # we discard anything on the right of column label (addidional metrics)
#print(varCols)
#argEvalDF.iloc[list(set(best.index))]
df = argEvalDF.iloc[list(set(best.index))][['label','runID','ner_f1_macro','rel_f1_macro','rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro']+varColsTest[:lmtTest]].drop(columns=['tokenizer_path'])
df = df.sort_values(['label','runID'])
df[['label'	,'runID',	'ner_f1_macro',	'rel_f1_macro',	'rel_nec_f1_macro'	,'ner_f1_micro',	'rel_f1_micro',	'rel_nec_f1_micro']]

Unnamed: 0,label,runID,ner_f1_macro,rel_f1_macro,rel_nec_f1_macro,ner_f1_micro,rel_f1_micro,rel_nec_f1_micro
0,conll04_bert_test,2022-03-26_13.17.27.579408,85.789502,73.091352,72.824686,88.642659,72.040302,71.788413
6,conll04_rob_test,2022-03-26_14.15.40.228845,84.253314,70.866954,70.866954,86.91632,69.444444,69.444444
5,conll04_rob_test,2022-03-26_14.16.29.477572,85.472489,69.16752,69.16752,88.132475,67.478685,67.478685
10,docRed_bert_test_lvl,2022-04-02_18.43.26.670247,87.455293,31.041569,30.414249,88.300041,57.079841,56.352007
11,docRed_bert_test_lvl,2022-04-04_13.15.51.696886,86.139266,32.703839,31.792824,87.046892,58.706777,57.821991
8,docRed_rob_test_lvl,2022-04-06_09.52.23.252423,86.928362,19.015714,18.926461,87.533156,50.241056,49.885816
7,docRed_rob_test_lvl,2022-04-06_10.28.41.079524,86.791883,17.366252,17.282796,87.538242,49.211909,48.78659


## Write Training Results (CSV) by Label

In [21]:
for label in sorted(list(set(argDF.label))):
    print(label)
    df = argDF.loc[argDF.label == label][['label','runID','ner_f1_macro','rel_f1_macro','rel_nec_f1_macro', 'ner_f1_micro','rel_f1_micro',	'rel_nec_f1_micro']+varCols[:lmt]].drop(columns=['config','model_type','index','train_path','valid_path','types_path','final_eval','lowercase','model_path'])
    pd.DataFrame(df).to_csv('hyperparm_sel_%s.csv'%label)


ade_bert_train
ade_elec_train
ade_rob_train
conll04_bert_train
conll04_bert_train_lvl
conll04_elec_train
conll04_rob_train
conll04_rob_train_lvl
docRed_bert_train_lvl
docRed_rob_train_lvl
scierc_bert_train
scierc_elec_train
scierc_rob_train
