Load pre-computed embeddings for the training data.

In [1]:
import pickle
import numpy as np

# Read the pickled training-set embeddings from disk. pickle.load can execute
# arbitrary code while unpickling, so this path must only point at a trusted file.
train_embeddings_path = '/data/patch_features_small/train2014_embeddings.p'
with open(train_embeddings_path, 'rb') as embeddings_file:
    embeddings_dict = pickle.load(embeddings_file)

# Pull out the two entries of the pickled dict that this notebook names.
embeddings = embeddings_dict['embeddings']
labels = embeddings_dict['labels']

Load the serialized model and its training log from S3, and read the evaluation data.

In [2]:
from cse547.data import CocoPatchesDataset, OneShotDataLoader
from cse547.models import MultiLayerPerceptron
from cse547.s3 import deserialize_object

# UUID shared by the model and training-log object keys (presumably the ID of
# the training run). Spelled once so the two keys cannot drift apart.
RUN_ID = '103740db-7181-4771-aa0b-cfa7cc407cf8'

# S3 bucket and object keys of the serialized model state and its training log.
S3_BUCKET = 'cse-547'
S3_MODEL_KEY = 'project/train/model_{}.pkl'.format(RUN_ID)
S3_TRAINING_LOG_KEY = 'project/train/training_log_{}.pkl'.format(RUN_ID)

# Local path of the evaluation data.
TEST_DATA_PATH = '/data/patch_features_small/test2014_positive.p'

# Fetch the serialized model state and the training log from S3.
state_dict = deserialize_object(S3_BUCKET, S3_MODEL_KEY)
training_log = deserialize_object(S3_BUCKET, S3_TRAINING_LOG_KEY)

# Take the first item the loader yields (presumably the whole evaluation set in
# one batch, going by the loader's name; confirm). The builtin next() drives the
# iterator protocol on both Python 2.6+ and 3, whereas the .next() method is the
# Python 2 spelling and exists on Python 3 only if the class defines it itself.
eval_data = next(iter(OneShotDataLoader(
    CocoPatchesDataset.from_state_dict_files([TEST_DATA_PATH]))))

# Rebuild the network from dimension 1 of the feature and label tensors
# (presumably the input and output widths; confirm against the constructor) and
# the hyperparameters stored in the training log, then load the trained state.
model = MultiLayerPerceptron(
    eval_data['features'].size()[1],
    eval_data['label'].size()[1],
    training_log['model']['hidden_units'],
    training=False,
    dropout=training_log['model']['dropout'])
model.load_state_dict(state_dict)

Embed the evaluation data.

In [3]:
import torch

# Embed the evaluation features with autograd disabled and store the result as
# a NumPy array next to the features. .detach() is the supported way to obtain
# a tensor cut off from the autograd graph; the legacy .data attribute yields
# the same values but bypasses autograd's in-place modification checks.
with torch.no_grad():
    eval_data['embeddings'] = model.embed(eval_data['features']).detach().numpy()

Compute the 64 nearest training-set neighbors of each evaluation embedding with a [BallTree](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.BallTree.html).

In [4]:
from sklearn.neighbors import BallTree

# Parameters of the neighbor search, named so they are easy to find and change.
BALL_TREE_LEAF_SIZE = 256
NUM_NEIGHBORS = 64

# Index the training embeddings once, then look up the nearest training points
# for every evaluation embedding.
nearest_neighbors = BallTree(embeddings, leaf_size=BALL_TREE_LEAF_SIZE)
# With return_distance=True, query returns a (distances, indices) pair.
neighbors = nearest_neighbors.query(
    X=eval_data['embeddings'],
    k=NUM_NEIGHBORS,
    return_distance=True,
    dualtree=True)

Write the neighbor distances and indices to disk.

In [6]:
# Serialize the BallTree query result to disk using pickle's default protocol.
neighbors_path = '/data/patch_features_small/test2014_neighbors.p'
with open(neighbors_path, 'wb') as neighbors_file:
    pickle.dump(neighbors, neighbors_file)