In [None]:
# Re-import edited project modules (e.g. the `decrypt` package) before each cell
# runs, so code changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from decrypt import config
k_json_folder = config.DataDirs.Guardian.json_folder

from decrypt.scrape_parse import (
    load_guardian_splits,
    load_guardian_splits_disjoint_hash
)

from sklearn.neighbors import KNeighborsClassifier
from decrypt.common.puzzle_clue import GuardianClue

from sklearn.feature_extraction.text import CountVectorizer
from typing import *
from tqdm import tqdm

from decrypt.common import validation_tools as vt

In [None]:
def load_data(clue_list: List[GuardianClue], add_lens=False):
    """
    Turn a list of clues into parallel lists of model inputs and targets.

    :param clue_list: the clues to convert
    :param add_lens: when truthy each input is ``clue.clue_with_lengths("|")``,
        otherwise it is the plain ``clue.clue`` text
    :return: ``(X, Y)`` - the input strings and the lower-cased
        ``soln_with_spaces`` targets, both in ``clue_list`` order
    """
    # Inputs are deliberately not lower-cased here: CountVectorizer does that.
    if add_lens:
        inputs = [clue.clue_with_lengths("|") for clue in clue_list]
    else:
        inputs = [clue.clue for clue in clue_list]

    targets = [clue.soln_with_spaces.lower() for clue in clue_list]
    return inputs, targets

def knn_eval(train, val, add_lens, knn_neighbors: int, verify=False):
    """
    Fit a bag-of-words KNN on ``train`` and query it with ``val``.

    :param train: clues used to fit the vectorizer and the KNN
    :param val: clues to predict for
    :param add_lens: whether to add lengths to the input clues
    :param knn_neighbors: number of neighbors to use when doing "beam search".
        Forwarded to ``knn.kneighbors``; None => no beam search (sklearn then
        falls back to the classifier's own ``n_neighbors``)
    :param verify: if True, print the vocabulary index of every space-separated
        token of the first training input as a sanity check
    :return: ``(train_targets, test_targets, nn, pred)`` where ``nn`` holds, for
        each val clue, the indices into ``train_targets`` of its nearest
        neighbors and ``pred`` is the classifier's "greedy" prediction
    """
    # load data
    train_inputs, train_targets = load_data(train, add_lens)
    test_inputs, test_targets = load_data(val, add_lens)
    print(train_inputs[:2])

    # set up the bag-of-words vectorizer
    # token pattern needed for the length specification (keeps digits, parens and "|").
    # Raw string: "\d" is a regex class, not a valid Python string escape.
    bow_vectorizer = CountVectorizer(token_pattern=r'[a-z\d()|]+',
                                     ngram_range=(1, 1))     # further ngrams degrade performance
    bowVect = bow_vectorizer.fit(train_inputs)

    # show that everything was vectorized correctly
    print(len(bowVect.vocabulary_))
    if verify:
        for w in train_inputs[0].replace(",", " ").lower().split(" "):
            if w == '':
                continue
            # A naive split can leave punctuation the token pattern strips (e.g. "?"),
            # so such a token is not a vocabulary key; report it instead of raising.
            print(bowVect.vocabulary_.get(w, '<not in vocabulary: {}>'.format(w)))

    bowTrain = bowVect.transform(train_inputs)
    bowTest = bowVect.transform(test_inputs)

    # fit KNN
    # The constructor's default n_neighbors only governs knn.predict below; the
    # beam-search query passes its own n_neighbors to knn.kneighbors.
    knn = KNeighborsClassifier()
    knn.fit(bowTrain, train_targets)

    # predict (runs long)
    # get the nearest neighbors (beam search); the result is used as already
    # sorted, so distances are not requested and no re-sort is done here
    nn = knn.kneighbors(bowTest, n_neighbors=knn_neighbors, return_distance=False)

    # get the predictions ("greedy")
    pred = knn.predict(bowTest)
    return train_targets, test_targets, nn, pred

In [None]:
def eval_knn(val_set: List[GuardianClue],
             train_targets,
             test_targets,
             nn,
             pred):
    """
    Wrap the raw KNN outputs in ``vt.ModelPrediction`` objects and score each one.

    :param val_set: the clues that were evaluated, in the order given to ``knn_eval``
    :param train_targets: training solutions; ``nn`` entries index into this list
    :param test_targets: lower-cased solutions of ``val_set`` (from ``load_data``)
    :param nn: per clue, the indices of its nearest training neighbors
        (used as already sorted)
    :param pred: per clue, the classifier's "greedy" prediction
    :return: list of ``vt.ModelPrediction`` with ``model_eval`` set to ``vt.eval(mp)``
    """
    # zip() would silently truncate on a length mismatch and quietly drop clues
    # from the aggregated metrics, so fail loudly instead.
    assert len(val_set) == len(test_targets) == len(nn) == len(pred)

    model_outputs = []

    # don't need to check idx set since we have a 1:1 of val_gc to test_tgt
    # total= is required for a progress bar: zip() has no length
    for val_gc, test_tgt, nn_list, greedy_pred in tqdm(zip(val_set, test_targets, nn, pred),
                                                       total=len(val_set)):
        # load_data lower-cases the targets, so compare against the lower-cased solution
        assert val_gc.soln_with_spaces.lower() == test_tgt

        # nn_list is the list of indices of the nearest neighbors;
        # retrieve the solutions for those neighbors
        neighbor_solns = [train_targets[n] for n in nn_list]

        mp = vt.ModelPrediction(idx=val_gc.idx,
                                input=val_gc.clue_with_lengths(punct="|"),
                                target=test_tgt,
                                greedy=greedy_pred,
                                sampled=neighbor_solns)

        mp.model_eval = vt.eval(mp)
        model_outputs.append(mp)

    return model_outputs

def aggregate(val, output_tuple):
    """
    Score the KNN outputs for ``val`` and pass them to ``vt.all_aggregate``.

    :param val: the evaluated clues (forwarded to ``eval_knn`` as ``val_set``)
    :param output_tuple: the ``(train_targets, test_targets, nn, pred)`` tuple
        returned by ``knn_eval``
    """
    vt.all_aggregate(eval_knn(val, *output_tuple))


In [None]:
def run_eval_knn(val_or_test: str, naive_or_disj: str, nn=3000):
    """
    Run the KNN baseline on one split, first without and then with clue lengths.

    :param val_or_test: "val" or "test" - which held-out set to evaluate on
    :param naive_or_disj: "naive" uses ``load_guardian_splits``, "disj" uses
        ``load_guardian_splits_disjoint_hash``
    :param nn: number of neighbors retrieved per clue (passed as ``knn_neighbors``)
    :return: ``(no_lens_tuple, lens_tuple)`` - the two ``knn_eval`` results
    """
    assert val_or_test in ["val", "test"]
    assert naive_or_disj in ["naive", "disj"]

    load_fn = load_guardian_splits if naive_or_disj == "naive" else load_guardian_splits_disjoint_hash
    _, _, (train_local, val, test) = load_fn(k_json_folder)
    val_local = val if val_or_test == "val" else test

    # same pipeline twice: inputs without lengths first, then with lengths
    knn_tuples = []
    for with_lens in (False, True):
        knn_tuple = knn_eval(train_local, val_local, add_lens=with_lens, knn_neighbors=nn)
        aggregate(val_local, knn_tuple)
        knn_tuples.append(knn_tuple)

    return tuple(knn_tuples)

In [None]:
# nn=3000 is the setting to use to replicate the research
knn_tuple_random_val_nolens, knn_tuple_random_val_lens = run_eval_knn(
    val_or_test="val",
    naive_or_disj="naive",
    nn=3000,
)

To reproduce the two rows corresponding to KNN in the Main Results table, also run `run_eval_knn` with
- `val_or_test="test"`
- `naive_or_disj="disj"`

Note that for the Main Results Table 2, the metrics we include in the table correspond to
- `agg_top_match`
- `agg_top_10_after_filter`

More details of these metric calculations can be found in `decrypt.common.validation_tools`




In [None]:
####
# Supplementary to verify KNN works the way we expect
###

# `train` only exists as a local inside run_eval_knn, so load it here to keep
# this cell runnable on a fresh kernel. Uses the naive split, as in the run above.
*_ignored, (train, *_other_splits) = load_guardian_splits(k_json_folder)

## verify how KNN does tokenization
train_inputs, train_targets = load_data(train, True)
print(train_inputs[:2])

# set up the bag-of-words vectorizer
# token pattern needed for the length specification
# punctuation not included in token pattern (e.g. , or ') will be split and treated as space
# see below experiment
# Raw string: "\d" is a regex class, not a valid Python string escape.
bow_vectorizer = CountVectorizer(token_pattern=r'[a-z\d()|]+',
                                 ngram_range=(1, 1))     # further ngrams degrade performance
bowVect = bow_vectorizer.fit(train_inputs)

# show that everything was vectorized correctly
print(len(bowVect.vocabulary_))
print()
# need to replace any punct that occurs
vocab_indices = []
for w in train_inputs[12].replace("'", "").lower().split(" "):
    print(w)
    # words still carrying punctuation are not vocabulary keys; skip only those
    # instead of swallowing every exception
    vocab_idx = bowVect.vocabulary_.get(w)
    if vocab_idx is not None:
        print(vocab_idx)
        vocab_indices.append(vocab_idx)
print(sorted(vocab_indices))

print(train_inputs[12])

In [None]:
# Vectorize one example clue and list which vocabulary entries it activates.
matrix = bow_vectorizer.transform([train_inputs[12]])
print(matrix)

# vocabulary_ maps token -> column index; invert it to look tokens up by column
index_to_token = {col: token for token, col in bowVect.vocabulary_.items()}

# walk the stored (column, count) entries of the single-row sparse matrix
entries = matrix.tocoo()
for col, count in zip(entries.col, entries.data):
    print(int(col), index_to_token[int(col)], int(count))

bow_vectorizer.inverse_transform(matrix)

In [None]:
# Print the ASCII punctuation characters (string.punctuation) for reference;
# presumably to compare against the characters the token_pattern keeps.
import string
print(string.punctuation)

In [None]:
# Locate the first training clue with more than one length entry and show
# both its clue idx and its position in `train`.
first_multi = next(
    ((pos, clue) for pos, clue in enumerate(train) if len(clue.lengths) > 1),
    None,
)
if first_multi is not None:
    pos, clue = first_multi
    print(clue.idx)
    print(pos)
print(train_inputs[12])