In [1]:
import numpy as np
from pynndescent import NNDescent
from sklearn.neighbors import NearestNeighbors
from multiprocessing import cpu_count
from metrics_custom import (
    distance_SNN, 
    neighborhood_membership_vectors
)
from generate_data import MFA_model
from utils import remove_self_neighbors

In [2]:
# Silence numba's pending-deprecation warning for the rest of the session
import warnings
from numba import NumbaPendingDeprecationWarning
warnings.filterwarnings(action='ignore', category=NumbaPendingDeprecationWarning)

In [3]:
# Notebook-wide settings
# Worker count: keep two cores free, but never drop below one
num_proc = max(1, cpu_count() - 2)
# The seed is drawn afresh on every run (an integer in [1, 10000)), so results differ between runs
seed_rng = np.random.randint(1, 10000)
# Number of nearest neighbors wanted per point
K = 20
# Value handed to NNDescent as `n_neighbors`: at least 20, and at least K + 2
n_neighbors = max(20, K + 2)
# Value handed to NNDescent as `rho`
rho = 0.5
# Distance metric of the first (primary) ANN index
metric_primary = 'euclidean'

In [4]:
# Draw synthetic data from a mixture of factor analysis (MFA) model.
# The global numpy RNG is seeded first, so the statement order below matters.
np.random.seed(seed_rng)

n_components = 10  # number of mixture components
dim = 100  # dimension of the observed space
# dimension of the latent space
# NOTE(review): `dim_latent` is not read by any cell visible here; the model receives `dim_latent_range` instead - confirm whether it is still needed
dim_latent = 2
dim_latent_range = (10, 20)
model = MFA_model(
    n_components,
    dim,
    dim_latent_range=dim_latent_range,
    seed_rng=seed_rng,
)

# Sample `N` points from the model, followed by a separate batch of `N_test` points
N = 1000
N_test = 100
data, labels = model.generate_data(N)
data_test, labels_test = model.generate_data(N_test)

In [5]:
# Construct an approximate nearest neighbor (ANN) index to query nearest neighbors
# Keyword arguments for the NNDescent index built on the primary metric
params = dict(
    metric=metric_primary,
    n_neighbors=n_neighbors,
    rho=rho,
    random_state=seed_rng,
    n_jobs=num_proc,
    verbose=True,
)
# Approximate nearest neighbor (ANN) index over `data`, used below for neighbor queries
index = NNDescent(data, **params)

Wed Nov 27 22:04:47 2019 Building RP forest with 7 trees
Wed Nov 27 22:04:47 2019 parallel NN descent for 10 iterations
	 0  /  10
	 1  /  10
	 2  /  10
	 3  /  10


In [6]:
# Fetch `K + 1` neighbors for every point: each point is expected to come back as its own
# nearest neighbor, so one extra is requested and the self neighbor is ignored afterwards
nn_indices, nn_distances = index.query(data, k=K + 1)

In [7]:
# Build the K-neighbor membership vector of every point.
# `data_neighbors` is described as a numpy array of 0s and 1s, with shape `(N, N)` and dtype `np.uint8`
data_neighbors = neighborhood_membership_vectors(nn_indices, N)

# Zero the diagonal in place so that no point is a member of its own neighborhood set
np.fill_diagonal(data_neighbors, val=0)

In [8]:
%time
# Construct another ANN index based on the SNN distance metric
params = {
    'metric': distance_SNN, 
    'n_neighbors': n_neighbors,
    'rho': rho,
    'random_state': seed_rng,
    'n_jobs': num_proc, 
    'verbose': True
}
index_snn = NNDescent(data_neighbors, **params)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.25 µs
Wed Nov 27 22:04:53 2019 Building RP forest with 7 trees
Wed Nov 27 22:04:53 2019 parallel NN descent for 10 iterations
	 0  /  10
	 1  /  10
	 2  /  10
	 3  /  10


In [9]:
# Fetch `K + 1` nearest neighbors of every point under the SNN distance
nn_indices_snn_, nn_distances_snn_ = index_snn.query(data_neighbors, k=K + 1)

# Drop each point from its own neighborhood set
nn_indices_snn, nn_distances_snn = remove_self_neighbors(
    nn_indices_snn_,
    nn_distances_snn_,
)