# Trivial example for minHashNearestNeighbors

Import everything that is needed and define a function to create sparse artificial datasets.

In [2]:
import random

from scipy.sparse import dok_matrix
from scipy.sparse import rand
from scipy.sparse import vstack
from scipy.sparse import csr_matrix

def create_dataset(seed=None,
                   number_of_centroids=None,
                   number_of_instances=None,
                   number_of_features=None,
                   size_of_dataset=None,
                   density=None,
                   fraction_of_density=None
                   ):
    """Create a sparse artificial dataset of perturbed centroids plus noise.

    For every centroid a random sparse row is drawn; ``size_of_dataset``
    copies of it are made and, in each copy,
    ``int(number_of_features * density * fraction_of_density)`` times a
    randomly chosen non-zero position of the centroid is set to 0 and a
    randomly chosen column is set to 1. If ``number_of_instances`` exceeds
    ``number_of_centroids * size_of_dataset``, the remaining rows are filled
    with unrelated random sparse rows ("noise").

    Args:
        seed: Integer seed, or None. When given, the swap positions and the
            noise labels come from a private generator seeded with it, and
            the scipy random states are derived from it (``seed * k`` per
            centroid, ``seed * seed`` for the noise), so the result is
            reproducible. When None, the module-level ``random`` functions
            and scipy's default random state are used.
        number_of_centroids: Number of centroids (classes); expected >= 1.
        number_of_instances: Requested total number of rows.
        number_of_features: Number of columns.
        size_of_dataset: Number of perturbed copies created per centroid.
        density: Fraction of non-zero entries per random row.
        fraction_of_density: Fraction of the expected non-zero entries that
            is swapped in every copy.

    Returns:
        A tuple ``(dataset, y)``: ``dataset`` is a ``csr_matrix`` holding the
        centroid copies followed by the noise rows, and ``y`` is a list with
        one integer label per row. Copies are labelled with their centroid
        index; noise rows get a random label in
        ``[0, number_of_centroids - 1]``.
    """
    # A private generator keeps the result reproducible for a given seed
    # without touching the global random state.
    rng = random if seed is None else random.Random(seed)
    number_of_swapping_elements = int(number_of_features * density * fraction_of_density)

    rows = []
    y = []
    for k in range(number_of_centroids):
        # NOTE: seed * k is 0 for the first centroid whatever the seed is;
        # kept so that existing seeds keep producing the same centroids.
        centroid_state = None if seed is None else seed * k
        centroid = rand(1, number_of_features, density=density,
                        format='lil', random_state=centroid_state)
        nonzero_elements = centroid.nonzero()[1]
        # Bound the index by the real number of non-zeros instead of
        # re-deriving it from number_of_features * density.
        last_nonzero = len(nonzero_elements) - 1
        for _instance in range(size_of_dataset):
            neighbor = centroid.copy()
            if last_nonzero >= 0:
                for _swap in range(number_of_swapping_elements):
                    index = rng.randint(0, last_nonzero)
                    index_swap = rng.randint(0, number_of_features - 1)
                    neighbor[0, nonzero_elements[index]] = 0
                    neighbor[0, index_swap] = 1
            rows.append(neighbor)
            # One label per generated row, so y lines up with the matrix.
            y.append(k)

    size_of_noise = number_of_instances - (number_of_centroids * size_of_dataset)
    if size_of_noise > 0:
        noise_state = None if seed is None else seed * seed
        rows.append(rand(size_of_noise, number_of_features, format='lil',
                         density=density, random_state=noise_state))
        # randint is inclusive on both ends, hence the "- 1": noise rows must
        # only receive labels of existing classes.
        y.extend(rng.randint(0, number_of_centroids - 1)
                 for _noise in range(size_of_noise))

    return csr_matrix(vstack(rows)), y

### What is done:
1. import the library neighbors which contains minHashNearestNeighbors and minHashKNeighborsClassifier
2. create a sparse dataset with:
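  * 5 centroids
  * 100 instances
  * 1000 features
  * 1% non-zero features
  * noise of 20% (`fraction_of_density=0.2`: up to 20% of the non-zero features are randomly swapped in each neighbor)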
3. Create a minHashNearestNeighbors object and set the number of n_neighbors to 4. The default value is 5.
4. fit the dataset
5. Search for neighbors. If n_neighbors is not passed to the query, the value given at initialization is used, or the default value of 5 if none was given. If you do not define "algorithm", "approximate" is the default.

In [3]:
import neighbors

# Build the sparse example data: 5 centroids with 10 neighbors each,
# 100 instances in total, 1000 features, 1% non-zero features and
# fraction_of_density=0.2 as noise. The labels are not needed here.
dataset, _ = create_dataset(
    seed=1,
    number_of_centroids=5,
    number_of_instances=100,
    number_of_features=1000,
    size_of_dataset=10,
    density=0.01,
    fraction_of_density=0.2,
)

# Create the nearest-neighbor object with n_neighbors=4 and fit it.
n_neighbors_minHash = neighbors.MinHashNearestNeighbors(n_neighbors=4)
n_neighbors_minHash.fit(dataset)

# Query the fitted data with fast=True (the approximate search, per the
# original notebook comment).
print(n_neighbors_minHash.kneighbors(fast=True))

# Same query with default arguments (the exact search, per the original
# notebook comment).
print(n_neighbors_minHash.kneighbors())

(array([[ 0.83,  0.93,  0.94,  0.95],
       [ 0.92,  0.94,  0.95,  0.96],
       [ 0.92,  0.94,  0.95,  0.97],
       [ 0.93,  0.94,  0.96,  0.97],
       [ 0.91,  0.92,  0.93,  0.95],
       [ 0.93,  0.94,  0.95,  0.97],
       [ 0.83,  0.9 ,  0.91,  0.92],
       [ 0.91,  0.92,  0.93,  0.96],
       [ 0.9 ,  0.92,  0.93,  0.94],
       [ 0.91,  0.92,  0.94,  0.95],
       [ 0.86,  0.9 ,  0.92,  0.93],
       [ 0.86,  0.92,  0.94,  0.95],
       [ 0.82,  0.84,  0.88,  0.89],
       [ 0.86,  0.9 ,  0.93,  0.94],
       [ 0.81,  0.88,  0.92,  0.93],
       [ 0.82,  0.9 ,  0.93,  0.94],
       [ 0.86,  0.93,  0.95,  0.96],
       [ 0.9 ,  0.92,  0.93,  0.97],
       [ 0.81,  0.89,  0.9 ,  0.94],
       [ 0.84,  0.9 ,  0.92,  0.93],
       [ 0.93,  0.94,  0.95,  0.96],
       [ 0.86,  0.91,  0.94,  0.95],
       [ 0.9 ,  0.95,  0.96,  0.98],
       [ 0.88,  0.93,  0.94,  0.96],
       [ 0.86,  0.91,  0.93,  0.94],
       [ 0.89,  0.9 ,  0.96,  0.97],
       [ 0.91,  0.92,  0.93,  0.94],
