## For running the T5 vanilla baseline
1. Run the seq2seq model to produce outputs, saving them in a directory that matches `k_t5_outputs_dir` below
    - Train the model (produces saved checkpoints)
    - Evaluate the top-performing model (load from the top checkpoint; produces JSON outputs)

2. Run the eval code here using the JSON output from the model eval

In [None]:

%load_ext autoreload
%autoreload 2

In [None]:
import config
# Guardian JSON folder from the project config; passed to the
# load_guardian_splits* calls further down.
k_json_folder = config.DataDirs.Guardian.json_folder

# You'll need to specify where you saved the model outputs: set this to the
# directory holding the output JSON files, ending with a path separator
# (e.g. 'decrypt/t5_outputs/').
k_t5_outputs_dir = None
# Deliberately fails until the directory above has been filled in.
assert k_t5_outputs_dir is not None

In [None]:
import sys
sys.path.append('../decrypt')
from decrypt.scrape_parse import (
    load_guardian_splits,
    load_guardian_splits_disjoint_hash
)

In [None]:
from decrypt.common.puzzle_clue import GuardianClue, BaseClue
from decrypt.common import validation_tools as vt
from typing import List
import json
from glob import glob

In [3]:
def load_t5(json_out_file: str, pre_truncate=None):
    """Load one JSON file of T5 outputs and score every record in it.

    Each record in the file is unpacked as
    ``(idx, input, target, greedy, sampled)``, wrapped in a
    ``vt.ModelPrediction`` and scored with ``vt.eval``; the score is stored
    on the prediction's ``model_eval`` attribute. The number of loaded
    records is printed.

    Args:
        json_out_file: Path to the JSON file produced by model eval.
        pre_truncate: Forwarded unchanged to ``vt.eval``.

    Returns:
        The scored ``vt.ModelPrediction`` objects, in file order.

    Raises:
        AssertionError: If the same ``idx`` appears more than once.
    """
    # JSON text is UTF-8; don't depend on the platform's default encoding.
    with open(json_out_file, 'r', encoding='utf-8') as f:
        json_blob = json.load(f)

    # eval (and backfill)
    model_outputs = []
    idx_set = set()
    # `model_input` rather than `input`, so the builtin is not shadowed
    for idx, model_input, tgt, greedy, sampled in json_blob:
        # Raised explicitly (not via ``assert``) so the duplicate check is
        # still performed under ``python -O``; the exception type is the same.
        if idx in idx_set:
            raise AssertionError(
                f'duplicate idx {idx!r} in {json_out_file}')
        idx_set.add(idx)

        mp = vt.ModelPrediction(idx, model_input, tgt, greedy, sampled)
        mp.model_eval = vt.eval(mp, pre_truncate=pre_truncate)
        model_outputs.append(mp)

    print(len(model_outputs))
    # was used to verify we had an output for all inputs
    # for i in range(28476):
    #     if i not in idx_set:
    #         print(i)

    return model_outputs

# def json_from_dir(dir):
#     path = dir + "/epoch_1.*.json"
#     g = glob(path)
#     assert len(g) == 1, path
#     return g[0]



initializing a csv writer


INFO:validation_tools:Opened clue labels file:


In [4]:
def load_and_run(fname, label=None, filter_fcn=None, pre_truncate=None):
    """Load ``<k_t5_outputs_dir>/<fname>.json`` and aggregate its results.

    Args:
        fname: Output file name without the ``.json`` extension.
        label: Label handed to ``vt.all_aggregate``; defaults to ``fname``.
        filter_fcn: Optional filter handed to ``vt.all_aggregate``.
        pre_truncate: Forwarded to ``load_t5`` (and from there to ``vt.eval``).
    """
    import os  # local import keeps this cell self-contained

    if label is None:
        label = fname
    # os.path.join only inserts a separator when k_t5_outputs_dir does not
    # already end with one, so both 'dir' and 'dir/' resolve to the same file.
    json_path = os.path.join(k_t5_outputs_dir, fname + '.json')
    data = load_t5(json_path, pre_truncate=pre_truncate)
    vt.all_aggregate(data, label=label, filter_fcn=filter_fcn)

In [6]:
# for example, if your output files are in
# 'decrypt/t5_outputs/baseline_naive_e12_test.json'
# you would have set k_t5_outputs_dir to be 'decrypt/t5_outputs/'
# and you will run the below

### primary - test
load_and_run('baseline_naive_e12_test')
load_and_run('baseline_naive_nolens_e15_test')     # test set

28476
[('agg_filter_len_pre_truncate', 44.504495013344574),
 ('agg_filtered_few', 0.007093692934400899),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.4591234723978087),
 ('agg_in_sample', 0.2807978648686613),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.48454136816968674),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9789858126141312),
 ('agg_top_10_after_filter', 0.3385658098047479),
 ('agg_top_match', 0.1630495856159573),
 ('agg_top_match_len_correct', 0.9998946481247366),
 ('agg_top_match_none', 0.00010535187526337969),
 ('agg_top_match_wordct_correct', 0.9942407641522686),
 ('agg_top_sample_result_len_correct', 0.560858266610479),
 ('agg_top_sample_result_wordct_correct', 0.9846186262115466),
 ('filter_len_pre_truncate', 1267310),
 ('filtered_few', 202),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 13074),
 ('in_sample', 7996),
 ('sample_len', 284760),
 ('sample_len_correct', 137978),
 ('sa

In [5]:
## primary val
load_and_run('baseline_naive_e12_val')
load_and_run('baseline_naive_nolens_e15_val')     # val set

28476
[('agg_filter_len_pre_truncate', 44.389275179098185),
 ('agg_filtered_few', 0.0064615816828206205),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.45146790279533644),
 ('agg_in_sample', 0.2747225733951398),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.4834843376878775),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9792175867397106),
 ('agg_top_10_after_filter', 0.33122629582806573),
 ('agg_top_match', 0.16002949852507375),
 ('agg_top_match_len_correct', 0.9999648827082456),
 ('agg_top_match_none', 3.5117291754459895e-05),
 ('agg_top_match_wordct_correct', 0.9941002949852508),
 ('agg_top_sample_result_len_correct', 0.5595940441073184),
 ('agg_top_sample_result_wordct_correct', 0.9845132743362832),
 ('filter_len_pre_truncate', 1264029),
 ('filtered_few', 184),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 12856),
 ('in_sample', 7823),
 ('sample_len', 284760),
 ('sample_len_correct', 137677),
 

In [7]:
## no lens - disj2
load_and_run('baseline_disj2_nolens_val')     # val set
load_and_run('baseline_disj2_nolens_test')     # test set



26546
[('agg_filter_len_pre_truncate', 13.30332253446847),
 ('agg_filtered_few', 0.2847509982671589),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.057146086039327956),
 ('agg_in_sample', 0.013033978753861222),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.12870488962555565),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.7729488435169141),
 ('agg_top_10_after_filter', 0.047653130415128454),
 ('agg_top_match', 0.009492955624199502),
 ('agg_top_match_len_correct', 0.9659835756799517),
 ('agg_top_match_none', 0.03401642432004822),
 ('agg_top_match_wordct_correct', 0.7763128154900927),
 ('agg_top_sample_result_len_correct', 0.13414450388005725),
 ('agg_top_sample_result_wordct_correct', 0.7771038951254426),
 ('filter_len_pre_truncate', 353150),
 ('filtered_few', 7559),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 1517),
 ('in_sample', 346),
 ('sample_len', 265460),
 ('sample_len_correct', 34166),
 ('s



In [5]:
# top curricular
load_and_run('cw_4_4_e19_val_all')
load_and_run('cw_4_4_e19_test_all')

28476
[('agg_filter_len_pre_truncate', 88.33677482792527),
 ('agg_filtered_few', 0.001685630004214075),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.6056679308891698),
 ('agg_in_sample', 0.41515662312122487),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.9134042702626773),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9718253968253968),
 ('agg_top_10_after_filter', 0.4220747295968535),
 ('agg_top_match', 0.21519876387133025),
 ('agg_top_match_len_correct', 0.9998244135412278),
 ('agg_top_match_none', 0.00017558645877229948),
 ('agg_top_match_wordct_correct', 0.9902373928922602),
 ('agg_top_sample_result_len_correct', 0.9383340356791684),
 ('agg_top_sample_result_wordct_correct', 0.9814229526618907),
 ('filter_len_pre_truncate', 2515478),
 ('filtered_few', 48),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 17247),
 ('in_sample', 11822),
 ('sample_len', 284760),
 ('sample_len_correct', 260101),
 ('s

In [6]:
##
# top curricular disjoint
load_and_run('curr_4_4_disj2_test')
load_and_run('curr_4_4_disj2_dev')

28085
[('agg_filter_len_pre_truncate', 84.89104504183727),
 ('agg_filtered_few', 0.053444899412497776),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.386612070500267),
 ('agg_in_sample', 0.1935196724230016),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.861146519494392),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9229944810397009),
 ('agg_top_10_after_filter', 0.2),
 ('agg_top_match', 0.06498130674737404),
 ('agg_top_match_len_correct', 0.991917393626491),
 ('agg_top_match_none', 0.00808260637350899),
 ('agg_top_match_wordct_correct', 0.9389709809506854),
 ('agg_top_sample_result_len_correct', 0.8667616165212747),
 ('agg_top_sample_result_wordct_correct', 0.9304254940359623),
 ('filter_len_pre_truncate', 2384165),
 ('filtered_few', 1501),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 10858),
 ('in_sample', 5435),
 ('sample_len', 280850),
 ('sample_len_correct', 241853),
 ('sample_len_pre_truncat



In [21]:
# TODO(josh): should be in another folder

## primary - anagram subset performance
# The filter built by vt.make_set_filter('anag_direct') is handed to
# vt.all_aggregate; pre_truncate=5 is passed through load_t5 to vt.eval.
load_and_run('baseline_naive_e12_val',
             filter_fcn=vt.make_set_filter('anag_direct'),
             pre_truncate=5)

28476
With filter check_inclusion
[('agg_filter_len_pre_truncate', 2.173794976238968),
 ('agg_filtered_few', 1.0),
 ('agg_generate_few', 1.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.1945010183299389),
 ('agg_in_sample', 0.1945010183299389),
 ('agg_sample_len', 5.0),
 ('agg_sample_len_correct', 0.43638832315003395),
 ('agg_sample_len_pre_truncate', 5.0),
 ('agg_sample_wordct_correct', 0.9726408689748812),
 ('agg_top_10_after_filter', 0.1945010183299389),
 ('agg_top_match', 0.13713509843856075),
 ('agg_top_match_len_correct', 0.8988458927359131),
 ('agg_top_match_none', 0.1011541072640869),
 ('agg_top_match_wordct_correct', 0.8920570264765784),
 ('agg_top_sample_result_len_correct', 0.4820095044127631),
 ('agg_top_sample_result_wordct_correct', 0.9769178547182621),
 ('filter_len_pre_truncate', 6404),
 ('filtered_few', 2946),
 ('generate_few', 2946),
 ('generate_none', 0),
 ('in_filtered', 573),
 ('in_sample', 573),
 ('sample_len', 14730),
 ('sample_len_correct', 6428),
 ('sa

In [24]:
# best performing model after curricular train
load_and_run('ACW_descrambleonly_20',
             filter_fcn=vt.make_set_filter('anag_direct'))

28476
With filter check_inclusion
[('agg_filter_len_pre_truncate', 3.8727087576374744),
 ('agg_filtered_few', 1.0),
 ('agg_generate_few', 1.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.3000678886625934),
 ('agg_in_sample', 0.3000678886625934),
 ('agg_sample_len', 5.0),
 ('agg_sample_len_correct', 0.7803801765105227),
 ('agg_sample_len_pre_truncate', 5.0),
 ('agg_sample_wordct_correct', 0.9613713509843856),
 ('agg_top_10_after_filter', 0.3000678886625934),
 ('agg_top_match', 0.21384928716904278),
 ('agg_top_match_len_correct', 0.9758995247793618),
 ('agg_top_match_none', 0.024100475220638153),
 ('agg_top_match_wordct_correct', 0.9694501018329938),
 ('agg_top_sample_result_len_correct', 0.8295994568906992),
 ('agg_top_sample_result_wordct_correct', 0.9755600814663951),
 ('filter_len_pre_truncate', 11409),
 ('filtered_few', 2946),
 ('generate_few', 2946),
 ('generate_none', 0),
 ('in_filtered', 884),
 ('in_sample', 884),
 ('sample_len', 14730),
 ('sample_len_correct', 11495),
 

In [None]:
## disjoint sets

# disjoint 1
load_and_run('baseline_disj1_e6_val')
# disjoint 2 (true disjoint)
load_and_run('baseline_disj2_e1_val')

In [5]:
# disjoint sets (test)

# disjoint 1
load_and_run('baseline_disj1_e6_test')
# disjoint 2 (true disjoint)
load_and_run('baseline_disj2_e1_test')

28521
[('agg_filter_len_pre_truncate', 39.15812909785772),
 ('agg_filtered_few', 0.010378317730794853),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.21976789032642613),
 ('agg_in_sample', 0.07776725921251008),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.40793450440026646),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9815890045931068),
 ('agg_top_10_after_filter', 0.12864205322394026),
 ('agg_top_match', 0.0323621191402826),
 ('agg_top_match_len_correct', 0.9999649381157744),
 ('agg_top_match_none', 3.5061884225658285e-05),
 ('agg_top_match_wordct_correct', 0.9913747764804881),
 ('agg_top_sample_result_len_correct', 0.4201816205602889),
 ('agg_top_sample_result_wordct_correct', 0.9843273377511308),
 ('filter_len_pre_truncate', 1116829),
 ('filtered_few', 296),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 6268),
 ('in_sample', 2218),
 ('sample_len', 285210),
 ('sample_len_correct', 116347),
 ('

In [5]:



# special eval
# train overlap performance
# load_and_run('baseline_naive_e12_val')

# Only the first entry of the third returned item is kept (bound to `train`);
# everything else load_guardian_splits returns is discarded here.
_, _, (train, _, _) = load_guardian_splits(k_json_folder)

INFO:data_util.scrape_parse.guardian_scrape:loading from /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json
INFO:data_util.scrape_parse.guardian_scrape:Using file glob at /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json/cryptic*.json
INFO:data_util.scrape_parse.guardian_scrape:Glob has size 5518
100%|██████████| 5518/5518 [00:22<00:00, 241.82it/s]
INFO:data_util.scrape_parse.guardian_scrape:Params set to not filter
100%|██████████| 143991/143991 [00:05<00:00, 24810.84it/s]
100%|██████████| 55783/55783 [00:06<00:00, 8603.18it/s] 
INFO:data_util.scrape_parse.guardian_scrape:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})


[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)
removed 1611 exact dupes
142380


In [6]:
def make_set_inclusion_filter_fcn(train_set: List[BaseClue]):
    """Build a filter that omits predictions whose target is in train.

    The ``soln_with_spaces`` of every clue in ``train_set`` is collected
    once; the returned predicate compares ``mp.target`` against that
    collection by exact match.

    Returns:
        A function ``filter_fcn(mp)`` that returns False (omit) when
        ``mp.target`` is one of the collected solutions, True otherwise.
    """
    train_solutions = {clue.soln_with_spaces for clue in train_set}

    # True means "keep"; False means "omit"
    def filter_fcn(mp: vt.ModelPrediction):
        return mp.target not in train_solutions
    return filter_fcn

In [7]:
# subset that does not overlap with train (exact match)
load_and_run('baseline_naive_e12_val',
             label='baseline_naive_e12_val_diff_exact',
             filter_fcn=make_set_inclusion_filter_fcn(train))
load_and_run('baseline_naive_e12_test',
             label='baseline_naive_e12_test_diff_exact',
             filter_fcn=make_set_inclusion_filter_fcn(train))

28476
With filter filter_fcn
[('agg_filter_len_pre_truncate', 38.64603782619105),
 ('agg_filtered_few', 0.014962892027771128),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.15872635863059611),
 ('agg_in_sample', 0.06308355278908308),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.41554943739525974),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.963574335647594),
 ('agg_top_10_after_filter', 0.09456547761551352),
 ('agg_top_match', 0.030045487191764423),
 ('agg_top_match_len_correct', 0.9998802968637779),
 ('agg_top_match_none', 0.00011970313622216902),
 ('agg_top_match_wordct_correct', 0.9868326550155614),
 ('agg_top_sample_result_len_correct', 0.44613358870002395),
 ('agg_top_sample_result_wordct_correct', 0.96744074694757),
 ('filter_len_pre_truncate', 322849),
 ('filtered_few', 125),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 1326),
 ('in_sample', 527),
 ('sample_len', 83540),
 ('sample_len_co

INFO:data_util.scrape_parse.guardian_scrape:loading from /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json
INFO:data_util.scrape_parse.guardian_scrape:Using file glob at /Users/jsrozner/jsrozner/cryptic/cryptic-code/data/puzzles/guardian_data/guardian_2020_10_08_json/cryptic*.json
INFO:data_util.scrape_parse.guardian_scrape:Glob has size 5518
100%|██████████| 5518/5518 [00:19<00:00, 287.01it/s]
INFO:data_util.scrape_parse.guardian_scrape:Params set to not filter
100%|██████████| 143991/143991 [00:00<00:00, 637508.78it/s]
100%|██████████| 55783/55783 [00:02<00:00, 22397.92it/s]
INFO:data_util.scrape_parse.guardian_scrape:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})


[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)
removed 1611 exact dupes
142380


In [8]:
###
# prepare for set that does not overlap, fuzzily
##
# this result not in paper
def make_set_inclusion_filter_fcn_fuzz(train_set: List[BaseClue]):
    """Build a filter that omits targets loosely matching a train solution.

    For every clue in ``train_set`` the lookup set receives the solution
    itself, the solution with ``'s'`` and with ``'es'`` appended, and, when
    the solution ends in ``'s'`` / ``'es'``, the solution with that suffix
    removed.

    Returns:
        A function ``filter_fcn(mp)`` that returns False (omit) when
        ``mp.target``, or ``mp.target`` minus its last one or two
        characters, is in the lookup set; True otherwise.
    """
    variants = set()
    for clue in train_set:
        answer = clue.soln_with_spaces
        variants.update((answer, answer + 's', answer + 'es'))
        if answer.endswith('s'):
            variants.add(answer[:-1])
            # anything ending in 'es' necessarily ends in 's' as well
            if answer.endswith('es'):
                variants.add(answer[:-2])

    # True means "keep"; False means "omit"
    def filter_fcn(mp: vt.ModelPrediction):
        tgt = mp.target
        # NOTE: the slices drop the trailing one/two characters whatever
        # they are, not only an 's' / 'es' suffix.
        candidates = (tgt, tgt[:-1], tgt[:-2])
        return not any(cand in variants for cand in candidates)
    return filter_fcn

In [9]:
load_and_run('baseline_naive_e12_val',
             label='baseline_naive_e12_val_diff_plurals',
             filter_fcn=make_set_inclusion_filter_fcn_fuzz(train))
load_and_run('baseline_naive_e12_test',
             label='baseline_naive_e12_test_diff_plurals',
             filter_fcn=make_set_inclusion_filter_fcn_fuzz(train))

28476
With filter filter_fcn
[('agg_filter_len_pre_truncate', 37.30992736077482),
 ('agg_filtered_few', 0.01791767554479419),
 ('agg_generate_few', 0.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.12784503631961258),
 ('agg_in_sample', 0.049878934624697335),
 ('agg_sample_len', 10.0),
 ('agg_sample_len_correct', 0.40508474576271186),
 ('agg_sample_len_pre_truncate', 100.0),
 ('agg_sample_wordct_correct', 0.9544471347861179),
 ('agg_top_10_after_filter', 0.07602905569007264),
 ('agg_top_match', 0.022598870056497175),
 ('agg_top_match_len_correct', 0.9998385794995964),
 ('agg_top_match_none', 0.00016142050040355126),
 ('agg_top_match_wordct_correct', 0.9832122679580306),
 ('agg_top_sample_result_len_correct', 0.4314769975786925),
 ('agg_top_sample_result_wordct_correct', 0.9583535108958838),
 ('filter_len_pre_truncate', 231135),
 ('filtered_few', 111),
 ('generate_few', 0),
 ('generate_none', 0),
 ('in_filtered', 792),
 ('in_sample', 309),
 ('sample_len', 61950),
 ('sample_len_c

In [None]:
###
# disjoint
###
# not reported in paper
# Keeps the first two entries of the third returned item; val_disj is not
# used in the cells visible here.
_, _, (train_disj, val_disj, _) = load_guardian_splits_disjoint_hash(k_json_folder)

# Score the disj1 val outputs with the fuzzy train-overlap filter built
# from train_disj.
load_and_run('baseline_disj1_e6_val',
             label='baseline_disj1_e6_val_diff',
             filter_fcn=make_set_inclusion_filter_fcn_fuzz(train_disj))


