## For running the T5 vanilla baseline
1. Set up the data files for running T5 (i.e., produce the json files)
1. Run the seq2seq model to produce outputs, saving them in the directory that you will set as `k_t5_outputs_dir` below
    - Train model (produces saved checkpoints)
    - Eval top performing model (load from top checkpoint, produces json outputs)

1. Run the eval code in this notebook on the json output from the model eval

In [5]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../decrypt')
from decrypt.scrape_parse import (
    load_guardian_splits,
    load_guardian_splits_disjoint,
    load_guardian_splits_disjoint_hash
)

import os
import config
from decrypt.common.puzzle_clue import GuardianClue, BaseClue
from decrypt.common import validation_tools as vt
from decrypt.common.util_data import clue_list_tuple_to_train_split_json
from typing import List
import json
import logging
# NOTE(review): the logger returned here is discarded, so this line does not
# configure any handlers or levels; the function below logs through the
# module-level `logging.warning` call instead.
logging.getLogger(__name__)


# Location handed to the load_guardian_splits* loaders below
# (presumably the folder of Guardian puzzle json files - confirm in config).
k_json_folder = config.DataDirs.Guardian.json_folder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1 Produce datasets

In [10]:
def make_dataset(split_type: str, overwrite=False):
    """Export one Guardian train/val/test split as json files.

    Args:
        split_type: One of 'naive_random', 'naive_disjoint' or
            'word_init_disjoint'. Selects both the loader function and the
            export directory.
        overwrite: Forwarded to ``clue_list_tuple_to_train_split_json``.

    Raises:
        ValueError: If ``split_type`` is not one of the supported names.

    A ``FileExistsError`` raised by the export is caught and reported as a
    warning, so re-running the cell on an existing dataset is harmless.
    """
    valid_split_types = ('naive_random', 'naive_disjoint', 'word_init_disjoint')
    # Raise explicitly instead of asserting: asserts are stripped under
    # `python -O`, which would let an unknown name silently fall through to
    # the final (word_init_disjoint) branch below.
    if split_type not in valid_split_types:
        raise ValueError(f'Unknown split_type {split_type!r}; '
                         f'expected one of {valid_split_types}')

    if split_type == 'naive_random':
        load_fn = load_guardian_splits
        tgt_dir = config.DataDirs.DataExport.guardian_naive_random_split
    elif split_type == 'naive_disjoint':
        load_fn = load_guardian_splits_disjoint
        tgt_dir = config.DataDirs.DataExport.guardian_naive_disjoint_split
    else:
        load_fn = load_guardian_splits_disjoint_hash
        tgt_dir = config.DataDirs.DataExport.guardian_word_init_disjoint_split

    # The loaders return a 3-tuple; only the last element (the splits) is used.
    _, _, (train, val, test) = load_fn(k_json_folder)

    os.makedirs(tgt_dir, exist_ok=True)
    # write the output as json
    try:
        clue_list_tuple_to_train_split_json((train, val, test),
                                            comment=f'Guardian data. Split: {split_type}',
                                            export_dir=tgt_dir,
                                            overwrite=overwrite)
    except FileExistsError:
        logging.warning(f'You have already generated the {split_type} dataset.\n'
                        f'It is located at {tgt_dir}\n'
                        f'To regenerate, pass overwrite=True or delete it\n')


# Export the two splits used by the baselines in this notebook.
for split_name in ('naive_random', 'word_init_disjoint'):
    make_dataset(split_name)
# you can also make_dataset('naive_disjoint')

INFO:decrypt.scrape_parse.guardian_load:loading from /Users/jsrozner/jsrozner/cryptic/decrypt/data/puzzles
INFO:decrypt.scrape_parse.guardian_load:Using file glob at /Users/jsrozner/jsrozner/cryptic/decrypt/data/puzzles/cryptic*.json
INFO:decrypt.scrape_parse.guardian_load:Glob has size 5518
INFO:decrypt.scrape_parse.guardian_load:Glob size matches the expected one from Decrypting paper
100%|██████████| 5518/5518 [00:12<00:00, 448.52it/s]
 70%|███████   | 100848/143991 [00:01<00:00, 76506.56it/s]

[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)


100%|██████████| 143991/143991 [00:01<00:00, 130467.53it/s]
100%|██████████| 55783/55783 [00:02<00:00, 24200.43it/s]


removed 1611 exact dupes
142380


INFO:decrypt.scrape_parse.guardian_load:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})
INFO:decrypt.scrape_parse.guardian_load:Clue list length matches Decrypting paper expected length
INFO:decrypt.scrape_parse.guardian_load:Got splits of lenghts [75847, 32628, 33905]
INFO:decrypt.scrape_parse.guardian_load:First three clues of train set:
	[GuardianClue(clue='Sailor boy in his hammock', lengths=[4], soln='abed', soln_with_spaces='abed', idx=34809, dataset=PosixPath('/Users/jsrozner/jsrozner/cryptic/decrypt/data/puzzles'), across_or_down='across', pos=(0, 2), unique_clue_id='cryptic_23048_10-across', type='cryptic', number=23048, id='crosswords/cryptic/23048', creator='Rufus', orig_lengths='4', lengths_punctuation=set()), GuardianClue(clue='With a degree, I leave this subject', lengths=[5], soln='maths', soln_with_spaces='maths', idx=412, dataset=PosixPath('/Users/jsrozner/jsrozner/cryptic/decrypt/data/puzzles'), across_or_down='across', pos=(0, 13), unique_clue_id='crypt

{'idx': 34809, 'input': 'Sailor boy in his hammock (4)', 'target': 'abed'}
{'idx': 412,
 'input': 'With a degree, I leave this subject (5)',
 'target': 'maths'}
{'idx': 116809,
 'input': 'Burrow to cure limb and make sure one gets up (3,3,5)',
 'target': 'set the alarm'}



## 2 Running (training) the model
1. Set up the environment
    1. You should set up wandb for logging (that's where metrics will show up).
    If you try to run without it, wandb will tell you what you need to do to initialize it
1. Train the model
    1. from directory seq2seq, run the commands in the box below
    1. Will produce model checkpoints

# TODO
todo: environment setup

- Choose place for your wandb dir, e.g., `'./wandb' `
- Note that the default arguments are given in args_cryptic. See `--default_train` and `--default_val`
- Note that epochs appear to start at 11. This leaves room for 10 "warmup" epochs used in curricular training, so that the plots in wandb line up across runs

Baseline naive
```python
train_clues.py --default_train=base --name=baseline_naive --project=baseline --wandb_dir='./wandb' --data_dir='../data/clue_json/guardian/naive_random'
```
Baseline disjoint (word initial disjoint)
```python
train_clues.py --default_train=base --name=baseline_disj --project=baseline --wandb_dir='./wandb' --data_dir='../data/clue_json/guardian/word_initial_disjoint'
```

Baseline (naive split), without lengths
```python
train_clues.py --default_train=base --name=baseline_naive_nolens --project=baseline --wandb_dir='./wandb' --data_dir='../data/clue_json/guardian/naive_random' --special=no_lens
```

## 3 Evaluating the model
For training we generate only 5 beams. For eval we are going to generate 100.
1. Select the best model based on num_match_top_sampled
2. Run eval using that model
3. This will produce a file in a new wandb directory that looks like `epoch_11.pth.tar.preds.json` (i.e., a single epoch)

For example,

Baseline naive, if epoch 10 is best (you'll need to set the run_name)
This runs the eval set
```python
train_clues.py --default_val=base --name=baseline_naive_val --data_dir='../data/clue_json/guardian/naive_random' --ckpt_path='./wandb/run_name/files/epoch_10.pth.tar'
```

To test the test set, add `--test`
```python
train_clues.py --default_val=base --name=baseline_naive_val --data_dir='../data/clue_json/guardian/naive_random' --ckpt_path='./wandb/run_name/files/epoch_10.pth.tar' --test
```


Now we evaluate the json that was produced
1. Change the k_t5_outputs_dir value to the location where you have saved the json files. 
    - Recommend copying all of the preds.json files into a common directory and working from that.
    - Alternatively you could modify the code below and pass in a full path name to each of the json outputs (using the wandb directory path)
1. For each t5 model eval that you ran, run `load_and_run()` to get metrics for those outputs
1. The resulting outputs are the values we report in the tables. See `decrypt/common/validation_tools.ModelEval` for more details about the numbers that are produced. Percentages are prefixed by agg_

In [None]:
# Copy your preds.json files to a directory and specify that directory here
# Include the trailing path separator, e.g. 'decrypt/t5_outputs/'.
k_t5_outputs_dir = None
# Deliberate stop: this fails until the placeholder above is replaced.
assert k_t5_outputs_dir is not None

def load_t5(json_out_file: str, pre_truncate=None):
    """Load a T5 predictions json file and evaluate every record in it.

    Args:
        json_out_file: Path to a json file containing a list of 5-item
            records ``(idx, input, target, greedy, sampled)``.
        pre_truncate: Forwarded unchanged to ``vt.eval``.

    Returns:
        A list with one ``vt.ModelPrediction`` per record, each with its
        ``model_eval`` attribute set to the result of ``vt.eval``.

    Raises:
        ValueError: If the same idx occurs more than once in the file.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_out_file, 'r', encoding='utf-8') as f:
        json_blob = json.load(f)

    # eval (and backfill)
    model_outputs = []
    idx_set = set()
    for record in json_blob:
        # `clue_input` rather than `input`, to avoid shadowing the builtin.
        idx, clue_input, tgt, greedy, sampled = record
        # Explicit raise instead of assert so the check survives `python -O`.
        if idx in idx_set:
            raise ValueError(f'Duplicate idx {idx} in {json_out_file}')
        idx_set.add(idx)

        mp = vt.ModelPrediction(idx, clue_input, tgt, greedy, sampled)
        mp.model_eval = vt.eval(mp, pre_truncate=pre_truncate)
        model_outputs.append(mp)

    # Number of predictions loaded (shown in the cell output).
    print(len(model_outputs))
    # can use to verify there is an output for all inputs
    # for i in range(28476):
    #     if i not in idx_set:
    #         print(i)

    return model_outputs



In [4]:
def load_and_run(fname, label=None, filter_fcn=None, pre_truncate=None):
    """Load one predictions file from k_t5_outputs_dir and aggregate its metrics.

    Args:
        fname: File name inside ``k_t5_outputs_dir``, without the '.json'
            extension.
        label: Label passed to ``vt.all_aggregate``; defaults to ``fname``.
        filter_fcn: Optional filter passed to ``vt.all_aggregate``.
        pre_truncate: Forwarded to ``load_t5``.
    """
    if label is None:
        label = fname
    # os.path.join (rather than string concatenation) so the directory works
    # with or without a trailing path separator.
    json_path = os.path.join(k_t5_outputs_dir, fname + '.json')
    data = load_t5(json_path, pre_truncate=pre_truncate)
    vt.all_aggregate(data, label=label, filter_fcn=filter_fcn)

In [6]:
# Example: if your output files live in 'decrypt/t5_outputs/', set
# k_t5_outputs_dir to 'decrypt/t5_outputs/' and pass each file name without
# its '.json' extension, e.g. 'baseline_naive_e12_test' for
# baseline_naive_e12_test.json

# primary models: the two test-set files first, then the two val-set files
for preds_name in (
    'baseline_naive_e12_test',
    'baseline_naive_nolens_e15_test',
    'baseline_naive_e12_val',
    'baseline_naive_nolens_e15_val',
):
    load_and_run(preds_name)


28476
[('agg_filter_len_pre_truncate', 44.504495013344574),
 ('agg_filtered_few', 0.007093692934400899),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.4591234723978087),
 ('agg_in_sample', 0.2807978648686613),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.48454136816968674),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9789858126141312),
 ('agg_top_10_after_filter', 0.3385658098047479),
 ('agg_top_match', 0.1630495856159573),
 ('agg_top_match_len_correct', 0.9998946481247366),
 ('agg_top_match_none', 0.00010535187526337969),
 ('agg_top_match_wordct_correct', 0.9942407641522686),
 ('agg_top_sample_result_len_correct', 0.560858266610479),
 ('agg_top_sample_result_wordct_correct', 0.9846186262115466),
 ('filter_len_pre_truncate', 1267310),
 ('filtered_few', 202),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 13074),
 ('in_sample', 7996),
 ('sample_len', 284760),
 ('sample_len_correct', 137978),
 ('sa

28476
[('agg_filter_len_pre_truncate', 88.33677482792527),
 ('agg_filtered_few', 0.001685630004214075),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.6056679308891698),
 ('agg_in_sample', 0.41515662312122487),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.9134042702626773),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9718253968253968),
 ('agg_top_10_after_filter', 0.4220747295968535),
 ('agg_top_match', 0.21519876387133025),
 ('agg_top_match_len_correct', 0.9998244135412278),
 ('agg_top_match_none', 0.00017558645877229948),
 ('agg_top_match_wordct_correct', 0.9902373928922602),
 ('agg_top_sample_result_len_correct', 0.9383340356791684),
 ('agg_top_sample_result_wordct_correct', 0.9814229526618907),
 ('filter_len_pre_truncate', 2515478),
 ('filtered_few', 48),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 17247),
 ('in_sample', 11822),
 ('sample_len', 284760),
 ('sample_len_correct', 260101),
 ('s

In [None]:
##################
###############

# top curricular: val file first, then test file
for preds_name in ('cw_4_4_e19_val_all', 'cw_4_4_e19_test_all'):
    load_and_run(preds_name)

In [6]:
##
# top curricular disjoint: test file first, then dev file
for preds_name in ('curr_4_4_disj2_test', 'curr_4_4_disj2_dev'):
    load_and_run(preds_name)

28085
[('agg_filter_len_pre_truncate', 84.89104504183727),
 ('agg_filtered_few', 0.053444899412497776),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.386612070500267),
 ('agg_in_sample', 0.1935196724230016),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.861146519494392),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9229944810397009),
 ('agg_top_10_after_filter', 0.2),
 ('agg_top_match', 0.06498130674737404),
 ('agg_top_match_len_correct', 0.991917393626491),
 ('agg_top_match_none', 0.00808260637350899),
 ('agg_top_match_wordct_correct', 0.9389709809506854),
 ('agg_top_sample_result_len_correct', 0.8667616165212747),
 ('agg_top_sample_result_wordct_correct', 0.9304254940359623),
 ('filter_len_pre_truncate', 2384165),
 ('filtered_few', 1501),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 10858),
 ('in_sample', 5435),
 ('sample_len', 280850),
 ('sample_len_correct', 241853),
 ('sample_len_pre_truncat



In [None]:
##################
###############

# todo: josh (should be in another folder)

## primary - anagram subset performance
# Build the filter first, then evaluate with pre_truncate=5.
anag_filter = vt.make_set_filter('anag_direct')
load_and_run('baseline_naive_e12_val', filter_fcn=anag_filter, pre_truncate=5)

In [24]:
# best performing model after curricular train, restricted by the same
# 'anag_direct' set filter
anag_filter = vt.make_set_filter('anag_direct')
load_and_run('ACW_descrambleonly_20', filter_fcn=anag_filter)

28476
With filter check_inclusion
[('agg_filter_len_pre_truncate', 3.8727087576374744),
 ('agg_filtered_few', 1.0),
 ('agg_generate_few', 1.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.3000678886625934),
 ('agg_in_sample', 0.3000678886625934),
 ('agg_sample_len', 5.0),
 ('agg_sample_len_correct', 0.7803801765105227),
 ('agg_sample_len_pre_truncate', 5.0),
 ('agg_sample_wordct_correct', 0.9613713509843856),
 ('agg_top_10_after_filter', 0.3000678886625934),
 ('agg_top_match', 0.21384928716904278),
 ('agg_top_match_len_correct', 0.9758995247793618),
 ('agg_top_match_none', 0.024100475220638153),
 ('agg_top_match_wordct_correct', 0.9694501018329938),
 ('agg_top_sample_result_len_correct', 0.8295994568906992),
 ('agg_top_sample_result_wordct_correct', 0.9755600814663951),
 ('filter_len_pre_truncate', 11409),
 ('filtered_few', 2946),
 ('generate_few', 2946),
 ('generate_none', 0),
 ('in_filtered', 884),
 ('in_sample', 884),
 ('sample_len', 14730),
 ('sample_len_correct', 11495),
 

In [None]:
## disjoint sets

for preds_name in (
    'baseline_disj1_e6_val',   # disjoint 1
    'baseline_disj2_e1_val',   # disjoint 2 (true disjoint)
):
    load_and_run(preds_name)

In [5]:
# disjoint sets (test)

for preds_name in (
    'baseline_disj1_e6_test',   # disjoint 1
    'baseline_disj2_e1_test',   # disjoint 2 (true disjoint)
):
    load_and_run(preds_name)

28521
[('agg_filter_len_pre_truncate', 39.15812909785772),
 ('agg_filtered_few', 0.010378317730794853),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.21976789032642613),
 ('agg_in_sample', 0.07776725921251008),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.40793450440026646),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9815890045931068),
 ('agg_top_10_after_filter', 0.12864205322394026),
 ('agg_top_match', 0.0323621191402826),
 ('agg_top_match_len_correct', 0.9999649381157744),
 ('agg_top_match_none', 3.5061884225658285e-05),
 ('agg_top_match_wordct_correct', 0.9913747764804881),
 ('agg_top_sample_result_len_correct', 0.4201816205602889),
 ('agg_top_sample_result_wordct_correct', 0.9843273377511308),
 ('filter_len_pre_truncate', 1116829),
 ('filtered_few', 296),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 6268),
 ('in_sample', 2218),
 ('sample_len', 285210),
 ('sample_len_correct', 116347),
 ('

In [5]:



# special eval
# train overlap performance
# load_and_run('baseline_naive_e12_val')

# Only the first element of the third return value (the train split) is kept;
# it is used further down to build the train-overlap filters. Everything else
# the loader returns is discarded.
_, _, (train, _, _) = load_guardian_splits(k_json_folder)

INFO:data_util.scrape_parse.guardian_scrape:loading from /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json
INFO:data_util.scrape_parse.guardian_scrape:Using file glob at /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json/cryptic*.json
INFO:data_util.scrape_parse.guardian_scrape:Glob has size 5518
100%|██████████| 5518/5518 [00:22<00:00, 241.82it/s]
INFO:data_util.scrape_parse.guardian_scrape:Params set to not filter
100%|██████████| 143991/143991 [00:05<00:00, 24810.84it/s]
100%|██████████| 55783/55783 [00:06<00:00, 8603.18it/s] 
INFO:data_util.scrape_parse.guardian_scrape:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})


[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)
removed 1611 exact dupes
142380


In [6]:
def make_set_inclusion_filter_fcn(train_set: List[BaseClue]):
    """Build a filter that drops predictions whose target occurs in train_set.

    The ``soln_with_spaces`` of every clue in ``train_set`` is collected once.
    The returned predicate takes a ``vt.ModelPrediction`` and returns False
    (omit) when ``mp.target`` exactly matches one of those solutions, and
    True (keep) otherwise.
    """
    train_solutions = {clue.soln_with_spaces for clue in train_set}

    def filter_fcn(mp: vt.ModelPrediction):
        # False means "omit this prediction".
        return mp.target not in train_solutions
    return filter_fcn

In [7]:
# subset that does not overlap with train (exact match)
# The filter is rebuilt for each file, val first and then test.
for preds_name, run_label in (
    ('baseline_naive_e12_val', 'baseline_naive_e12_val_diff_exact'),
    ('baseline_naive_e12_test', 'baseline_naive_e12_test_diff_exact'),
):
    load_and_run(preds_name,
                 label=run_label,
                 filter_fcn=make_set_inclusion_filter_fcn(train))

28476
With filter filter_fcn
[('agg_filter_len_pre_truncate', 38.64603782619105),
 ('agg_filtered_few', 0.014962892027771128),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.15872635863059611),
 ('agg_in_sample', 0.06308355278908308),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.41554943739525974),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.963574335647594),
 ('agg_top_10_after_filter', 0.09456547761551352),
 ('agg_top_match', 0.030045487191764423),
 ('agg_top_match_len_correct', 0.9998802968637779),
 ('agg_top_match_none', 0.00011970313622216902),
 ('agg_top_match_wordct_correct', 0.9868326550155614),
 ('agg_top_sample_result_len_correct', 0.44613358870002395),
 ('agg_top_sample_result_wordct_correct', 0.96744074694757),
 ('filter_len_pre_truncate', 322849),
 ('filtered_few', 125),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 1326),
 ('in_sample', 527),
 ('sample_len', 83540),
 ('sample_len_co

INFO:data_util.scrape_parse.guardian_scrape:loading from /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json
INFO:data_util.scrape_parse.guardian_scrape:Using file glob at /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json/cryptic*.json
INFO:data_util.scrape_parse.guardian_scrape:Glob has size 5518
100%|██████████| 5518/5518 [00:19<00:00, 287.01it/s]
INFO:data_util.scrape_parse.guardian_scrape:Params set to not filter
100%|██████████| 143991/143991 [00:00<00:00, 637508.78it/s]
100%|██████████| 55783/55783 [00:02<00:00, 22397.92it/s]
INFO:data_util.scrape_parse.guardian_scrape:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})


[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)
removed 1611 exact dupes
142380


In [8]:
###
# prepare for set that does not overlap, fuzzily
##
# this result not in paper
def make_set_inclusion_filter_fcn_fuzz(train_set: List[BaseClue]):
    """Build a filter that drops predictions whose target fuzzily matches train.

    For each clue's ``soln_with_spaces`` the lookup set receives the solution
    itself, the solution with 's' and with 'es' appended, and, when the
    solution ends in 's' / 'es', the solution with that suffix removed.

    The returned predicate returns False (omit) when ``mp.target``, or
    ``mp.target`` with its last one or two characters removed, is in that
    set; otherwise True (keep). Note that the last one or two characters are
    removed whatever they are, not only when they are 's' or 'es'.
    """
    known_forms = set()
    for clue in train_set:
        answer = clue.soln_with_spaces
        known_forms.update((answer, answer + 's', answer + 'es'))
        if answer.endswith('s'):
            known_forms.add(answer[:-1])
            # 'es' endings are a subset of 's' endings, so check them here.
            if answer.endswith('es'):
                known_forms.add(answer[:-2])

    def filter_fcn(mp: vt.ModelPrediction):
        target = mp.target
        candidates = (target, target[:-1], target[:-2])
        # False means "omit this prediction".
        return not any(form in known_forms for form in candidates)
    return filter_fcn

In [9]:
# Same two files as the exact-match run, now with the fuzzy (plural-aware)
# train-overlap filter; the filter is rebuilt for each file.
for preds_name, run_label in (
    ('baseline_naive_e12_val', 'baseline_naive_e12_val_diff_plurals'),
    ('baseline_naive_e12_test', 'baseline_naive_e12_test_diff_plurals'),
):
    load_and_run(preds_name,
                 label=run_label,
                 filter_fcn=make_set_inclusion_filter_fcn_fuzz(train))

28476
With filter filter_fcn
[('agg_filter_len_pre_truncate', 37.30992736077482),
 ('agg_filtered_few', 0.01791767554479419),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.12784503631961258),
 ('agg_in_sample', 0.049878934624697335),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.40508474576271186),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9544471347861179),
 ('agg_top_10_after_filter', 0.07602905569007264),
 ('agg_top_match', 0.022598870056497175),
 ('agg_top_match_len_correct', 0.9998385794995964),
 ('agg_top_match_none', 0.00016142050040355126),
 ('agg_top_match_wordct_correct', 0.9832122679580306),
 ('agg_top_sample_result_len_correct', 0.4314769975786925),
 ('agg_top_sample_result_wordct_correct', 0.9583535108958838),
 ('filter_len_pre_truncate', 231135),
 ('filtered_few', 111),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 792),
 ('in_sample', 309),
 ('sample_len', 61950),
 ('sample_len_c

In [None]:
###
# disjoint
###
# not reported in paper
# Load the splits produced by the hash-based disjoint loader. Only train_disj
# is used in this cell; val_disj is bound but not referenced here.
_, _, (train_disj, val_disj, _) = load_guardian_splits_disjoint_hash(k_json_folder)

# Evaluate the disjoint baseline on the val predictions, omitting targets
# that fuzzily match a train_disj solution.
load_and_run('baseline_disj1_e6_val',
             label='baseline_disj1_e6_val_diff',
             filter_fcn=make_set_inclusion_filter_fcn_fuzz(train_disj))


