In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import config
k_json_folder = config.DataDirs.Guardian.json_folder

In [9]:
import sys
sys.path.append('../decrypt')
from decrypt.scrape_parse import (
    load_guardian_splits,
    load_guardian_splits_disjoint_hash
)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from decrypt.common.puzzle_clue import GuardianClue

from sklearn.feature_extraction.text import CountVectorizer
from typing import *
from tqdm import tqdm

from decrypt.common import validation_tools as vt

In [11]:
# no need to lowercase - countvectorizer does this
def load_data(clue_list: List[GuardianClue], add_lens=False):
    """
    Take clue_list and return the X and Y data
    """
    def iter_fcn(clue: GuardianClue):
        if add_lens:
            ret = clue.clue_with_lengths("|")
        else:
            ret = clue.clue
        return ret

    X = [iter_fcn(c) for c in clue_list]
    Y = [c.soln_with_spaces.lower() for c in clue_list]
    return X, Y

def knn_eval(train, val, add_lens, knn_neighbors: int, verify=False):
    """
    :param add_lens: whether to add lengths to the input clues
    :param knn_neighbors: number of neighbors to use when doing "beam search". None => no beam search
    """
    # load data
    train_inputs, train_targets = load_data(train, add_lens)
    test_inputs, test_targets = load_data(val, add_lens)
    print(train_inputs[:2])

    # set up the bag-of-words vectorizer
    # token patter needed for the length specification
    bow_vectorizer = CountVectorizer(token_pattern='[a-z\d()|]+',
                                     ngram_range=(1,1))     # further ngrams degrade performance
    bowVect = bow_vectorizer.fit(train_inputs)

    # show that everything was vectorized correctly
    print(len(bowVect.vocabulary_))
    if verify:
        for w in train_inputs[0].replace(","," ").lower().split(" "):
            if w == '': continue
            print(bowVect.vocabulary_[w])

    bowTrain = bowVect.transform(train_inputs)
    bowTest = bowVect.transform(test_inputs)

    # fit KNN
    # neighbor setting here doesn't matter; can put in call to knn.kneighbors
    knn = KNeighborsClassifier()
    knn.fit(bowTrain, train_targets)

    # predict (runs long)
    # get the nearest neighbors (beam search)
    # this returns in sorted order, so commented code not needed
    # nn_dist, nn_idx = knn.kneighbors(bowTest, n_neighbors=knn_neighbors, return_distance=True)
    # nn_dist_and_idx = zip(nn_dist, nn_idx)
    nn = knn.kneighbors(bowTest, n_neighbors=knn_neighbors, return_distance=False)

    # get the predictions ("greedy")
    pred = knn.predict(bowTest)
    return train_targets, test_targets, nn, pred

In [12]:
def eval_knn(val_set: List[GuardianClue],
             train_targets,
             test_targets,
             nn,
             pred):
    model_outputs = []

    # don't need to check idx set since we have a 1:1 of val_gc to test_tgt
    # nn_list is already sorted
    for val_gc, test_tgt, nn_list, greedy_pred in tqdm(zip(val_set, test_targets, nn, pred)):
        assert val_gc.soln_with_spaces == test_tgt
        neighbor_solns = [train_targets[n] for n in nn_list]

        # nbr set is the list of indices of nearest neighbor
        # we retrieve all the solns for those neighbors (y_train[i])
        mp = vt.ModelPrediction(idx=val_gc.idx,
                            input=val_gc.clue_with_lengths(punct="|"),
                            target=test_tgt,
                            greedy=greedy_pred,
                             sampled=neighbor_solns)

        mp.model_eval = vt.eval(mp)
        model_outputs.append(mp)

    return model_outputs

def aggregate(val, output_tuple):
    model_out = eval_knn(val, *output_tuple)
    vt.all_aggregate(model_out)


In [13]:
# eval on naive val set (with/without lens)
def run_eval_knn(val_or_test: str, naive_or_disj: str, nn=3000):
    assert val_or_test in ["val", "test"]
    assert naive_or_disj in ["naive", "disj"]
    if naive_or_disj == "naive":
        load_fn = load_guardian_splits
    else:
        load_fn = load_guardian_splits_disjoint_hash
    _, _, (train_local, val, test) = load_fn(k_json_folder)
    if val_or_test == "val":
        val_local = val
    else:
        val_local = test

    knn_tuple_random_val_nolens = knn_eval(train_local, val_local, add_lens=False, knn_neighbors=nn)
    aggregate(val_local, knn_tuple_random_val_nolens)

    knn_tuple_random_val_lens = knn_eval(train_local, val_local, add_lens=True, knn_neighbors=nn)
    aggregate(val_local, knn_tuple_random_val_lens)

    return knn_tuple_random_val_nolens, knn_tuple_random_val_lens

In [14]:
# knn_tuple_random_val_nolens, knn_tuple_random_val_lens = eval_naive_val()
# run with nn=3000 to replicate research
knn_tuple_random_val_nolens, knn_tuple_random_val_lens = run_eval_knn(val_or_test="val",
                                                                      naive_or_disj="naive",
                                                                      nn=1)
# %store knn_tuple_random_val_nolens
# %store knn_tuple_random_val_lens





INFO:decrypt.scrape_parse.guardian_load:loading from ../puzzles/
INFO:decrypt.scrape_parse.guardian_load:Using file glob at ../puzzles/cryptic*.json
INFO:decrypt.scrape_parse.guardian_load:Glob has size 5518
INFO:decrypt.scrape_parse.guardian_load:Glob size matches the expected one from Decrypting paper
100%|██████████| 5518/5518 [00:12<00:00, 454.75it/s]
  5%|▌         | 7311/143991 [00:00<00:05, 24179.55it/s]

[("length punct: '", 1),
 ('invalid: clue group', 7687),
 ('invalid: invalid start char (most are continuation clues)', 607),
 ('invalid: number in clue (commonly references another clue)', 7066),
 ('invalid: regexp', 75),
 ('invalid: soln length does not match specified lens (multi box soln)', 56),
 ('invalid: unrecognized char in clue (e.g. html)', 85),
 ('invalid: zero-len clue text after regexp', 15),
 ('length punct: ,', 24644),
 ('length punct: -', 4148),
 ('length punct: .', 8),
 ('length punct: /', 1),
 ('stat: parsed_puzzle', 5518),
 ('stat: total_clues', 143991),
 (1, 119956),
 (2, 20272),
 (3, 2957),
 (4, 686),
 (5, 112),
 (6, 8)]
Total clues: len(puzz_list)


100%|██████████| 143991/143991 [00:00<00:00, 276338.67it/s]
100%|██████████| 55783/55783 [00:02<00:00, 21140.35it/s]


removed 1611 exact dupes
142380


INFO:decrypt.scrape_parse.guardian_load:Counter({1: 118540, 2: 20105, 3: 2929, 4: 686, 5: 112, 6: 8})
INFO:decrypt.scrape_parse.guardian_load:Clue list length matches Decrypting paper expected length
INFO:decrypt.scrape_parse.guardian_load:Got splits of lenghts [85428, 28476, 28476]
INFO:decrypt.scrape_parse.guardian_load:First three clues of train set:
	[GuardianClue(clue='Suffering to grasp edge of plant', lengths=[8], soln='agrimony', soln_with_spaces='agrimony', idx=85002, dataset='../puzzles/', across_or_down='across', pos=(7, 4), unique_clue_id='cryptic_25415_11-across', type='cryptic', number=25415, id='crosswords/cryptic/25415', creator='Chifonie', orig_lengths='8', lengths_punctuation=set()), GuardianClue(clue='Honour Ben and Noel with new order', lengths=[7], soln='ennoble', soln_with_spaces='ennoble', idx=3432, dataset='../puzzles/', across_or_down='down', pos=(7, 8), unique_clue_id='cryptic_24994_18-down', type='cryptic', number=24994, id='crosswords/cryptic/24994', creator

['Suffering to grasp edge of plant', 'Honour Ben and Noel with new order']
31939


28476it [00:02, 9931.91it/s] 


[('agg_filter_len_pre_truncate', 0.16038067144261833),
 ('agg_filtered_few', 1.0),
 ('agg_generate_few', 1.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.035995224048321395),
 ('agg_in_sample', 0.035995224048321395),
 ('agg_sample_len', 1.0),
 ('agg_sample_len_correct', 0.16038067144261833),
 ('agg_sample_len_pre_truncate', 1.0),
 ('agg_sample_wordct_correct', 0.7433277145666526),
 ('agg_top_10_after_filter', 0.035995224048321395),
 ('agg_top_match', 0.035995224048321395),
 ('agg_top_match_len_correct', 0.16038067144261833),
 ('agg_top_match_none', 0.8396193285573816),
 ('agg_top_match_wordct_correct', 0.13864306784660768),
 ('agg_top_sample_result_len_correct', 0.16038067144261833),
 ('agg_top_sample_result_wordct_correct', 0.7433277145666526),
 ('filter_len_pre_truncate', 4567),
 ('filtered_few', 28476),
 ('generate_few', 28476),
 ('generate_none', 0),
 ('in_filtered', 1025),
 ('in_sample', 1025),
 ('sample_len', 28476),
 ('sample_len_correct', 4567),
 ('sample_len_pre_trunc

28476it [00:02, 12824.05it/s]


[('agg_filter_len_pre_truncate', 0.877826941986234),
 ('agg_filtered_few', 1.0),
 ('agg_generate_few', 1.0),
 ('agg_generate_none', 0.0),
 ('agg_in_filtered', 0.058189352437140046),
 ('agg_in_sample', 0.058189352437140046),
 ('agg_sample_len', 1.0),
 ('agg_sample_len_correct', 0.877826941986234),
 ('agg_sample_len_pre_truncate', 1.0),
 ('agg_sample_wordct_correct', 0.9142786908273633),
 ('agg_top_10_after_filter', 0.058189352437140046),
 ('agg_top_match', 0.058189352437140046),
 ('agg_top_match_len_correct', 0.877826941986234),
 ('agg_top_match_none', 0.12217305801376598),
 ('agg_top_match_wordct_correct', 0.8731914594746453),
 ('agg_top_sample_result_len_correct', 0.877826941986234),
 ('agg_top_sample_result_wordct_correct', 0.9142786908273633),
 ('filter_len_pre_truncate', 24997),
 ('filtered_few', 28476),
 ('generate_few', 28476),
 ('generate_none', 0),
 ('in_filtered', 1657),
 ('in_sample', 1657),
 ('sample_len', 28476),
 ('sample_len_correct', 24997),
 ('sample_len_pre_truncate', 

In [28]:
####
# Supplementary to verify KNN works the way we expect
###

## verify how KNN does tokenization
train_inputs, train_targets = load_data(train, True)
print(train_inputs[:2])

# set up the bag-of-words vectorizer
# token patter needed for the length specification
# punctuation not included in token pattern (e.g. , or ') will be split and treated as space
# see below experiment
bow_vectorizer = CountVectorizer(token_pattern='[a-z\d()|]+',
                                 ngram_range=(1,1))     # further ngrams degrade performance
bowVect = bow_vectorizer.fit(train_inputs)

# show that everything was vectorized correctly
print(len(bowVect.vocabulary_))
print()
# for w in train_inputs[0].replace(","," ").lower().split(" "):
# need to replace any punct that occurs
all = []
for idx, w in enumerate(train_inputs[12].replace("'", "").lower().split(" ")):
    print(w)
    try:
        val = bowVect.vocabulary_[w]
        print(val)
        all.append(val)
    except:
        pass
print(sorted(all))

['Suffering to grasp edge of plant (8)', 'Honour Ben and Noel with new order (7)']
32466

0
very
31148
1
sad
24960
2
to
29352
3
find
11477
4
out
20378
5
whats
6
popular
21973
7
on
20096
8
twitter
30162
9
(5|7)
447
[31148, 24960, 29352, 11477, 20378, 21973, 20096, 30162, 447]


In [29]:
sorted(all)

[447, 11477, 20096, 20378, 21973, 24960, 29352, 30162, 31148]

In [26]:
print(train_inputs[12])

Very sad to find out what's popular on Twitter (5|7)


In [25]:
matrix = bow_vectorizer.transform([train_inputs[12]])
print(matrix)
for i,j in matrix:
    print(bowVect.vocabulary_[])

  (0, 447)	1
  (0, 11477)	1
  (0, 20096)	1
  (0, 20378)	1
  (0, 21973)	1
  (0, 24933)	1
  (0, 24960)	1
  (0, 29352)	1
  (0, 30162)	1
  (0, 31148)	1
  (0, 31779)	1


In [30]:
bow_vectorizer.inverse_transform(matrix)

[array(['(5|7)', 'find', 'on', 'out', 'popular', 's', 'sad', 'to',
        'twitter', 'very', 'what'], dtype='<U20')]

In [32]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
for i, c in enumerate(train):
    if len(c.lengths) > 1:
        print(c.idx)
        print(i)
        break
print(train_inputs[12])