In [11]:
import os
from os import path
import glob
import pandas as pd
import json
from collections import defaultdict

In [12]:
# Directory holding one perf file per run, and the SLURM job whose runs are analysed.
perf_dir = "/share/data/speech/shtoshni/research/litbank_coref/models/perf/"
slurm_id = "6101199"

# Collect every perf file whose name starts with the job id, then order the files
# numerically by the integer that follows the first underscore in the base name
# (extension stripped), so that e.g. "_10" sorts after "_2".
files = glob.glob(path.join(perf_dir, slurm_id + "*"))
files.sort(key=lambda x: int(path.splitext(path.basename(x))[0].split('_')[1]))

In [13]:
# Parse each perf file into a dict. The `with` block closes every file handle
# as soon as it has been read instead of leaving it to the garbage collector.
model_dict_list = []
for file in files:
    with open(file) as perf_file:
        model_dict_list.append(json.load(perf_file))

In [14]:
def determine_varying_attributes(model_dict_list, ignore_attribs=('train', 'test', 'dev', 'pretrained_mention_model',
                                                                  'conll_data_dir', 'slurm_id', 'best_model_dir', 'data_dir')):
    """Return the attribute names whose values differ across the given model dicts.

    Args:
        model_dict_list: iterable of dicts, each mapping attribute name -> value.
        ignore_attribs: collection of attribute names that are skipped entirely.

    Returns:
        List of attribute names that take more than one distinct value across
        the dicts, in order of first appearance.
    """
    attrib_to_vals = defaultdict(set)
    for model_dict in model_dict_list:
        for attrib, val in model_dict.items():
            if attrib in ignore_attribs:
                continue
            try:
                attrib_to_vals[attrib].add(val)
            except TypeError:
                # Unhashable values (lists / dicts) cannot go into a set directly;
                # compare them through a canonical serialization instead. The tag
                # keeps the surrogate from colliding with a genuine string value.
                canonical = json.dumps(val, sort_keys=True, default=repr)
                attrib_to_vals[attrib].add(('__unhashable__', canonical))

    return [attrib for attrib, vals in attrib_to_vals.items() if len(vals) > 1]

In [15]:
# Hyperparameters that actually differ between the loaded runs.
varying_attribs = determine_varying_attributes(model_dict_list)

# Optional per-metric columns to pull from each run's 'test' entry,
# e.g. ['MUC', 'Bcub', 'CEAFE']; left empty here.
perf_attribs = list()

print(varying_attribs)

['model_dir', 'mem_type', 'num_cells', 'cross_val_split', 'sample_invalid']


### Load all dev and test f-scores

In [16]:
# Collect one record per run and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
perf_records = []
for model_dict in model_dict_list:
    # Start from the hyperparameters that distinguish this run.
    perf_dict = {attrib: model_dict[attrib] for attrib in varying_attribs}

    # Optional per-metric (recall, precision, fscore) triples; '-' when the
    # run's 'test' entry has no such metric.
    for perf_attrib in perf_attribs:
        if perf_attrib in model_dict['test']:
            attrib_dict = model_dict['test'][perf_attrib]
            perf_dict[perf_attrib] = (attrib_dict['recall'], attrib_dict['precision'], attrib_dict['fscore'])
        else:
            perf_dict[perf_attrib] = '-'

    perf_dict['devf'] = model_dict['dev']['fscore']
    perf_dict['fs'] = model_dict['test']['fscore']

    perf_records.append(perf_dict)

# Passing `columns` fixes the column order and keeps the columns present even
# when there are no runs at all.
perf_df = pd.DataFrame(perf_records, columns=(varying_attribs + ['devf', 'fs'] + perf_attribs))

### Filter by maximum dev-scores among hyperparams

In [17]:
# Shorten long hyperparameter column names for display (names not present are ignored).
perf_df = perf_df.rename(columns={"label_smoothing_wt": "ls_wt", "sample_invalid": "samp", "max_training_segments": "segs"})

# For every (memory type, memory size, cross-validation split) group, find the
# row with the highest dev F-score.
idx = perf_df.groupby(['mem_type', 'num_cells', 'cross_val_split'])['devf'].idxmax()

# idxmax returns index *labels*, so the rows must be selected with .loc;
# .iloc only gives the same rows while the index happens to be 0..n-1.
dev_max_df = perf_df.loc[idx]
dev_max_df

Unnamed: 0,model_dir,mem_type,num_cells,cross_val_split,samp,devf,fs
212,/share/data/speech/shtoshni/research/litbank_c...,learned,5,0,0.75,73.4,68.1
156,/share/data/speech/shtoshni/research/litbank_c...,learned,5,1,0.25,70.9,72.3
188,/share/data/speech/shtoshni/research/litbank_c...,learned,5,2,0.50,74.0,71.9
251,/share/data/speech/shtoshni/research/litbank_c...,learned,5,3,1.00,71.9,72.8
165,/share/data/speech/shtoshni/research/litbank_c...,learned,5,4,0.25,74.4,68.4
...,...,...,...,...,...,...,...
23,/share/data/speech/shtoshni/research/litbank_c...,unbounded,20,5,0.75,76.7,73.6
24,/share/data/speech/shtoshni/research/litbank_c...,unbounded,20,6,0.75,75.0,77.9
35,/share/data/speech/shtoshni/research/litbank_c...,unbounded,20,7,1.00,77.9,75.8
26,/share/data/speech/shtoshni/research/litbank_c...,unbounded,20,8,0.75,76.4,76.9


### Get varying memory type and memory size configurations

In [18]:
# Number of selected rows for each (mem_type, num_cells) configuration.
z = dev_max_df.groupby(by=['mem_type', 'num_cells']).size()
z

mem_type   num_cells
learned    5            10
           10           10
           20           10
lru        5            10
           10           10
           20           10
unbounded  20           10
dtype: int64

In [19]:
# z is indexed by (mem_type, num_cells); extract each index level as a list so
# the two lists line up position by position.
multindex = z.index
mem_types, num_cells = (list(multindex.get_level_values(level)) for level in (0, 1))

print(mem_types, num_cells)

['learned', 'learned', 'learned', 'lru', 'lru', 'lru', 'unbounded'] [5, 10, 20, 5, 10, 20, 20]


### Get location of all conll output files

In [30]:
# Which split's predictions to gather ('dev' is the alternative).
# SPLIT = 'dev'
SPLIT = 'test'


# One entry per (mem_type, num_cells) configuration:
# (config tuple, CoNLL paths for splits 0..9, JSONL log paths for splits 0..9).
model_config_to_conll_files = []
for model_config in zip(mem_types, num_cells):
    mem_type, num_cell = model_config

    matches_config = (dev_max_df['mem_type'] == mem_type) & (dev_max_df['num_cells'] == num_cell)
    config_df = dev_max_df.loc[matches_config]

    # Model directory of the selected run for each of the 10 cross-validation splits.
    model_dirs = [
        config_df.loc[config_df['cross_val_split'] == cross_val_split, 'model_dir'].values[0]
        for cross_val_split in range(10)
    ]
    conll_files = [path.join(model_dir, f'{SPLIT}.conll') for model_dir in model_dirs]
    json_files = [path.join(model_dir, f'{SPLIT}.log.jsonl') for model_dir in model_dirs]

    model_config_to_conll_files.append((model_config, conll_files, json_files))

### Concat all Cross Val CoNLLs and JSONLs

In [34]:
def _concat_files(input_files, output_file):
    """Write the contents of input_files, in order, into output_file (overwriting it)."""
    with open(output_file, "w") as output_w:
        for input_file in input_files:
            with open(input_file) as g:
                output_w.writelines(g)


output_dir = "../models/litbank_preds/"
# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the existence check and makedirs().
os.makedirs(output_dir, exist_ok=True)


# (config tuple, path of the concatenated CoNLL file) for every configuration.
model_config_output_file_list = []
for model_config, conll_files, jsonl_files in model_config_to_conll_files:
    conll_output_file = path.join(output_dir, f'{model_config[0]}_{model_config[1]}_{SPLIT}.conll')
    jsonl_output_file = path.join(output_dir, f'{model_config[0]}_{model_config[1]}_{SPLIT}.jsonl')

    model_config_output_file_list.append((model_config, conll_output_file))

    # Stitch the per-split files together in cross-validation split order.
    _concat_files(conll_files, conll_output_file)
    _concat_files(jsonl_files, jsonl_output_file)
                    
        
                    

#### Set up coref evaluation script path and gold CoNLL

In [35]:
import sys
import subprocess
import re

# Location of the perl scorer script used for all metrics.
scorer_path = "/home/shtoshni/Research/litbank_coref/lrec2020-coref/reference-coreference-scorers/scorer.pl"
# Gold CoNLL file for the split selected via SPLIT.
gold_conll = f"/home/shtoshni/Research/litbank_coref/data/litbank/all.{SPLIT}.conll"

def get_coref_score(metric, path_to_scorer, gold=None, preds=None):
    """Run the perl scorer for one metric and return (recall, precision, f1).

    Args:
        metric: metric name, passed to the scorer as its first argument.
        path_to_scorer: path to the scorer perl script.
        gold: path to the gold CoNLL file.
        preds: path to the predicted CoNLL file.

    Returns:
        Tuple of floats (recall, precision, f1) parsed from the third-from-last
        line of the scorer's output.

    Raises:
        ValueError: if the output has fewer than three lines or the expected
            "Coreference: Recall ... Precision ... F1" line cannot be parsed.
        subprocess.CalledProcessError: if the scorer exits with a non-zero status.
    """
    # NOTE(review): preds is passed before gold; the reference scorer's usage is
    # documented as `<metric> <key> <response>` -- confirm this order is intended,
    # since swapping the files swaps recall and precision.
    output = subprocess.check_output(["perl", path_to_scorer, metric, preds, gold]).decode("utf-8")
    lines = output.split("\n")
    if len(lines) < 3:
        raise ValueError(f"Scorer output for metric {metric!r} is too short to contain a summary: {output!r}")

    summary_line = lines[-3]
    # Raw string avoids invalid-escape warnings; the fields are tab-separated.
    matcher = re.search(r"Coreference: Recall: \(.*?\) (.*?)%\tPrecision: \(.*?\) (.*?)%\tF1: (.*?)%", summary_line)
    if matcher is None:
        # Previously this fell through to an UnboundLocalError on `recall`.
        raise ValueError(f"Could not parse scores for metric {metric!r} from scorer output line: {summary_line!r}")

    recall = float(matcher.group(1))
    precision = float(matcher.group(2))
    f1 = float(matcher.group(3))
    return recall, precision, f1

In [None]:
# Emit one LaTeX table row per configuration:
# "\<mem_type> & <num_cells>", then recall / precision / F1 for each metric,
# then the mean of the per-metric F1 scores.
metrics = ['MUC', 'Bcub', 'CEAFE']
for model_config, conll_file in model_config_output_file_list:
    print(f"\\{model_config[0]} & {model_config[1]}", end="")

    fscore_list = []
    for metric in metrics:
        # The scorer is called with the lower-cased metric name.
        scores = get_coref_score(metric.lower(), scorer_path, gold_conll, conll_file)
        recall, precision, fscore = scores
        fscore_list.append(fscore)
        # Printed immediately so partial rows are visible while scoring runs.
        print(f" & {recall:.1f} & {precision:.1f} & {fscore:.1f} ", end="")

    mean_fscore = sum(fscore_list) / len(fscore_list)
    print(f"& {mean_fscore: .1f}")


\learned & 5 & 90.9 & 80.0 & 85.1  & 77.4 & 64.0 & 70.1  & 57.8 & 53.8 & 55.7 &  70.3
\learned & 10 & 90.0 & 84.6 & 87.2  & 78.1 & 70.8 & 74.2 