In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../decrypt')
sys.path.append('../')

# Model Analysis section of paper

## 6.1 Meta-linguistic properties
Table 3b is produced from the statistics that are output by baselines/baseline_t5 and
baselines/baseline_knn.

Nothing else is required

## 6.2 Disjointness
This section contains two parts:
1. Subset analysis
2. Analysis on distinct splits

Producing the table rows:
1. Row 1 in Table 3b (naive, entire split) is the same as T5 (lengths) in the main
 results table and is produced by running baselines/baseline_t5.
1. Subset not in train - see below for setup
1. Naive disjoint - run in the same way as baselines/baseline_t5, but on the naive_disjoint split, i.e.
    - Train on the naive disjoint split:
```python
train_clues.py --default_train=base --name=baseline_naive_disjoint --project=baseline --wandb_dir='./wandb' --data_dir='../data/clue_json/guardian/naive_disjoint'
```
    - Eval to get 100 generations (on the best epoch checkpoint), with and without `--test` flag
```python
train_clues.py --default_val=base --name=baseline_naive_disjoint_val --project=baseline --data_dir='../data/clue_json/guardian/naive_disjoint' --ckpt_path='./wandb/run_name/files/epoch_10.pth.tar'
```
    - run the eval script
`vt.load_and_run_t5('decrypt/t5_outputs/baseline_naive_disjoint_val.preds')`
1. Word-initial disjoint split - same as the T5 (lengths) row in the main results table (see baselines/baseline_t5)


In [3]:
# eval on the subset not seen in train
import config
from scrape_parse import load_guardian_splits
from decrypt.common import validation_tools as vt



In [None]:
# Build a predicate that keeps only those input-output pairs whose answer
# was not seen during training.
def make_filter_fn():
    """Return a filter over model predictions based on the Guardian train split.

    The train split is loaded via ``load_guardian_splits`` and the
    ``soln_with_spaces`` of every train clue is collected into a set.

    Returns:
        A function taking a ``vt.ModelPrediction`` that returns ``False``
        (omit) when the prediction's ``target`` is one of the train answers,
        and ``True`` (keep) otherwise.
    """
    _, _, (train_clues, _, _) = load_guardian_splits(config.DataDirs.Guardian.json_folder, verify=True)
    train_answers = {clue.soln_with_spaces for clue in train_clues}

    def filter_fcn(mp: vt.ModelPrediction):
        # False means "omit this prediction"
        return mp.target not in train_answers

    return filter_fcn


# Run the T5 evaluation on the naive-baseline predictions file, passing a
# filter that omits every prediction whose target answer occurs in train.
vt.load_and_run_t5('outputs/naive_baseline.preds',
                   filter_fcn=make_filter_fn())

In [None]:
# NOTE: the following is not reported in the paper.
# Same idea as the exact-match filter, but the overlap with train is fuzzy,
# i.e. simple plural / singular variants of a train answer also count as seen.
def make_set_inclusion_filter_fcn_fuzz():
    """Return a filter that omits predictions fuzzily overlapping with train.

    For every train answer the "seen" set receives the answer itself, the
    answer with ``'s'`` and ``'es'`` appended, and the answer with a trailing
    ``'s'`` / ``'es'`` stripped when it has one.

    Returns:
        A function taking a ``vt.ModelPrediction`` that returns ``False``
        (omit) when the target, or the target minus its last one or two
        characters, is in the "seen" set, and ``True`` (keep) otherwise.
    """
    _, _, (train_clues, _, _) = load_guardian_splits(config.DataDirs.Guardian.json_folder, verify=True)

    seen = set()
    for clue in train_clues:
        answer = clue.soln_with_spaces
        # the answer plus its naive plural forms
        seen.update((answer, answer + 's', answer + 'es'))
        # naive singular forms; anything ending in 'es' also ends in 's'
        if answer.endswith('s'):
            seen.add(answer[:-1])
            if answer.endswith('es'):
                seen.add(answer[:-2])

    def filter_fcn(mp: vt.ModelPrediction):
        # False means "omit this prediction"
        target = mp.target
        return not (target in seen or target[:-1] in seen or target[:-2] in seen)

    return filter_fcn

# Run the T5 evaluation on the same naive-baseline predictions file, this time
# with the fuzzy (plural-aware) train-overlap filter.
vt.load_and_run_t5('outputs/naive_baseline.preds',
                   filter_fcn=make_set_inclusion_filter_fcn_fuzz())

## 6.3 Wordplay minimal task
- dataset
- two descramble tasks
- direct copy

In [None]:
# first prepare the descrambling dataset
from decrypt.scrape_parse.acw_load import get_clean_xd_clues
from sklearn.model_selection import train_test_split
from decrypt.common.util_data import write_json_tuple
import random

# Input: location of the original xd (ACW) clue data, passed to get_clean_xd_clues
k_xd_orig_tsv = config.DataDirs.OriginalData.k_xd_cw
# Outputs: export directories for the random-split and the
# word-initial-disjoint-split descramble datasets
k_descramble_rand = config.DataDirs.DataExport.descramble_random
k_descramble_disj = config.DataDirs.DataExport.descramble_word_init_disjoint

# Produce a version of the ACW dataset that is
# - single words that appear in our dictionary
# - without exact duplicates (but note that some answers will occur multiple times with different clues)
# - filtered to answer words with between 4 and 14 characters
# - downsampled (see the NOTE on the sample size in the function body)
def make_descramble_json(seed=42):
    """Build and export the two descramble datasets (random and disjoint split).

    Clues are loaded with ``get_clean_xd_clues`` from ``k_xd_orig_tsv``,
    filtered by answer length, downsampled, and converted to dicts of the form
    ``{'defn': <clue>, 'target': <lower-cased answer>}``. Two (train, val)
    pairs are then written with ``write_json_tuple``:

    - a random 90/10 split, written to ``k_descramble_rand``
    - a split that is disjoint on the first two characters of the answer,
      written to ``k_descramble_disj``

    Args:
        seed: Seed for the downsampling, the shuffles and the random split.
            It is now actually honoured everywhere (the sampling RNG used to
            be hard-coded to 42, which is still the default).

    Returns:
        None. The datasets are written to disk.
    """
    import hashlib  # function-scope import: only needed for the stable bucket hash

    def stable_bucket(prefix: str, n_buckets: int = 10) -> int:
        # Builtin hash() on str is salted per process (PYTHONHASHSEED), so it
        # would put answers into different buckets on every kernel restart.
        # A digest-based hash keeps the disjoint split reproducible.
        digest = hashlib.sha256(prefix.encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big') % n_buckets

    _, all_clues = get_clean_xd_clues(k_xd_orig_tsv,
                                      remove_if_not_in_dict=True,
                                      do_filter_dupes=True)

    # further filter away anything < 4 chars or > 14 chars
    all_clues_len = [x for x in all_clues if 4 <= len(x.soln) <= 14]
    print(len(all_clues_len))

    # downsample
    rng = random.Random(seed)
    # NOTE(review): the sample size is 10% of the *unfiltered* clue count, as in
    # the original code; kept so that previously generated datasets keep their
    # size. Confirm whether 10% of the length-filtered list was intended.
    # The min() only prevents random.sample from raising when the filtered
    # list is smaller than that.
    n_sample = min(len(all_clues_len), int(len(all_clues) * .1))
    all_clues_10per = rng.sample(all_clues_len, k=n_sample)
    print(len(all_clues_10per))
    if all_clues_10per:
        print(all_clues_10per[0])

    # logic the same as scrape_parse.guardian_load.make_disjoint_split()
    def make_dataset(disj: bool):
        # make json; lower-case the answer without mutating the clue objects
        json_all = [dict(defn=c.clue, target=c.soln.lower())
                    for c in all_clues_10per]

        train, val = [], []     # list of json dicts
        rng.seed(seed)
        if disj:
            # bucket on the first two characters so that train and val never
            # share a word-initial bigram; one bucket in ten goes to val
            for c in json_all:
                if stable_bucket(c['target'][:2]) == 0:
                    val.append(c)
                else:
                    train.append(c)
            rng.shuffle(train)
            rng.shuffle(val)
        else:   # not disj
            train, val = train_test_split(json_all, test_size=.1, random_state=seed)

        return train, val

    write_json_tuple(list(make_dataset(disj=False)),    # train, val
                     comment="",
                     export_dir=k_descramble_rand)

    write_json_tuple(list(make_dataset(disj=True)),     # train, val
                     comment="",
                     export_dir=k_descramble_disj)

# Generate both descramble datasets with the default seed and write them to
# the export directories configured above.
make_descramble_json()

INFO:root:loading xd (ACW) set from /Users/jsrozner/MOUNT/scdt/cryptic_nlp/decrypt_root/data/original/xd/clues.tsv
INFO:root:Reading file into dict: /Users/jsrozner/MOUNT/scdt/cryptic_nlp/decrypt_root/data/generated/twl_dict.txt
0it [00:00, ?it/s]

Initialized a spellchecker
This will fail if you have not downloaded or generated twl_dict.txt


178691it [00:00, 266654.85it/s]
INFO:root:Done reading file: /Users/jsrozner/MOUNT/scdt/cryptic_nlp/decrypt_root/data/generated/twl_dict.txt
INFO:root:Reading file into dict: /Users/jsrozner/MOUNT/scdt/cryptic_nlp/decrypt_root/data/original/us/US.dic
118619it [00:00, 205430.77it/s]
INFO:root:Done reading file: /Users/jsrozner/MOUNT/scdt/cryptic_nlp/decrypt_root/data/original/us/US.dic
INFO:root:Done setting up spellchecker
989205it [01:39, 17980.76it/s]

**TODO:** document how to run training with the descramble setup tokenizer.

In [None]:
# in the collater, produce <scramble> | defn -> target

## 6.4 Wordplay systematic learning
- identify clues with anagram of first name
- two sets: scramble / substitute
- run / evaluate
- average character level overlap

## 6.5 Efrat comparison
- replication - hyperparams / model calls
- create word initial version
- how to run curricular