In [1]:
import numpy as np
from pynndescent import NNDescent
from sklearn.neighbors import NearestNeighbors
from multiprocessing import cpu_count
from metrics_custom import (
    distance_SNN,  
    remove_self_neighbors
)
from generate_data import MFA_model

In [2]:
# Silence numba's pending-deprecation warnings so they do not clutter the cell outputs
import warnings
from numba import NumbaPendingDeprecationWarning
warnings.filterwarnings(
    action='ignore',
    message='',
    category=NumbaPendingDeprecationWarning,
)

In [3]:
# Define some constants

# Number of worker processes: leave two cores free, but always use at least one
num_proc = max(cpu_count() - 2, 1)

# Seed shared by the data generator and the ANN indices below.
# It is drawn at random on every run, so it is printed here: to reproduce a run,
# replace the random draw with the printed value.
seed_rng = np.random.randint(1, high=10000)
print("seed_rng =", seed_rng)

# Number of nearest neighbors queried for each point
K = 20
# `n_neighbors` setting used when building the ANN indices: at least `K + 2`, and never below 20
n_neighbors = max(K + 2, 20)
# `rho` setting passed to the ANN index constructor
rho = 0.5
# Distance metric used for the first (primary) ANN index
metric_primary = 'cosine'

In [4]:
# Draw a training set and a test set from a mixture of factor analysis (MFA) model.
# Seed NumPy's global RNG first so the sampling below follows from `seed_rng`.
np.random.seed(seed_rng)

# Sample sizes of the training and test sets
N = 1000
N_test = 100

# Model configuration
n_components = 10              # number of mixture components
dim = 100                      # dimension of the observed space
dim_latent = 2                 # dimension of the latent space
# NOTE(review): `dim_latent` is not passed to `MFA_model` in this cell; only
# `dim_latent_range` is. Confirm whether `dim_latent` is still needed.
dim_latent_range = (10, 20)

model = MFA_model(
    n_components,
    dim,
    dim_latent_range=dim_latent_range,
    seed_rng=seed_rng,
)

# Sample both sets from the same model (training set first, then test set)
data, labels = model.generate_data(N)
data_test, labels_test = model.generate_data(N_test)

In [5]:
# Build an approximate nearest neighbor (ANN) index over `data` using the primary
# distance metric; the index is queried for nearest neighbors below
params = dict(
    metric=metric_primary,
    n_neighbors=n_neighbors,
    rho=rho,
    random_state=seed_rng,
    n_jobs=num_proc,
    verbose=True,
)
index = NNDescent(data, **params)

Thu Nov 28 21:19:09 2019 Building RP forest with 7 trees
Thu Nov 28 21:19:10 2019 parallel NN descent for 10 iterations
	 0  /  10
	 1  /  10
	 2  /  10


In [6]:
# Find the K nearest neighbors of every point under the primary metric.
# Each point is expected to come back as its own nearest neighbor, so ask for
# `K + 1` neighbors and then drop the self matches.
knn_with_self = index.query(data, k=K + 1)
nn_indices, nn_distances = remove_self_neighbors(*knn_with_self)

# The neighbor index lists are the input representation for the SNN-based index
data_neighbors = nn_indices


In [7]:
%time
# Construct another ANN index based on the SNN distance metric
params = {
    'metric': distance_SNN, 
    'n_neighbors': n_neighbors,
    'rho': rho,
    'random_state': seed_rng,
    'n_jobs': num_proc, 
    'verbose': True
}
index_snn = NNDescent(data_neighbors, **params)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs
Thu Nov 28 21:19:14 2019 Building RP forest with 7 trees
Thu Nov 28 21:19:14 2019 parallel NN descent for 10 iterations
	 0  /  10
	 1  /  10
	 2  /  10
	 3  /  10


In [8]:
# Find the K nearest neighbors of every point under the SNN distance.
# As with the primary metric, ask for `K + 1` neighbors and drop the self matches.
knn_snn_with_self = index_snn.query(data_neighbors, k=K + 1)
nn_indices_snn, nn_distances_snn = remove_self_neighbors(*knn_snn_with_self)

# For the first five points, show which neighbors both metrics agree on
print("Overlap in the nearest neighbors found based on the primary and SNN distance metrics:")
for i in range(5):
    ind = np.intersect1d(
        nn_indices[i, :],
        nn_indices_snn[i, :],
        assume_unique=True,
    )
    print(ind)


Overlap in the nearest neighbors found based on the primary and SNN distance metrics:
[ 65 130 138 153 228 264 271 301 369 608 636]
[  4 134 214 242 360 412 480 619 632 745 755 762 781]
[100 267 282 314 339 364 450 652]
[ 64  68 115 218 485 593 626 641 658 929 970]
[  1  78 214 360 490 619 632 755 781 832]
