# SpectralClustering and DBSCAN with minHash

In [3]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from scipy.sparse import csr_matrix
import hashlib
from random import randint

# --- Synthetic benchmark configuration ------------------------------------
sparsity_factor = 0.1
samples = 3000
features = 1000
classes = 2
clusters_per_class = 2
scale_ = 1.0

# Dense, labelled data set. y_true holds the generated class labels and is
# used later as the reference labelling when scoring the clusterings.
# NOTE: random_state=None means every run draws a different data set.
data_dense, y_true = make_classification(
    n_samples=samples, n_features=features, n_informative=2,
    n_redundant=2, n_repeated=0, n_classes=classes,
    n_clusters_per_class=clusters_per_class, weights=None, flip_y=0.01,
    class_sep=1.0, hypercube=True, shift=0.0, scale=scale_,
    shuffle=True, random_state=None)

# Number of columns of the hashed (sparse) representation.  It has to be an
# int: `features / sparsity_factor` is a float, and a float modulus yields
# float column indices and loses precision for large hash values.  Computed
# once here instead of once per matrix entry.
n_hashed_columns = int(round(features / sparsity_factor))

# COO-style triplets (row index, column index, value) of the sparse matrix.
instances_list = []
features_list = []
data_list = []
for i, row in enumerate(data_dense):
    for j, value in enumerate(row):
        instances_list.append(i)
        # The target column is derived from the first four characters of the
        # value's string form combined with the original feature index.
        features_list.append(
            hash(str(value)[0:4] + str(j)) % n_hashed_columns)
        data_list.append(value)

# Entries that land on the same (row, column) pair are summed by csr_matrix.
# The shape is given explicitly so the column count does not depend on which
# hash buckets happen to be hit.
data_sparse = csr_matrix((data_list, (instances_list, features_list)),
                         shape=(len(data_dense), n_hashed_columns))

In [6]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Print the docstring of the running module (the interactive session).
print(__doc__)

import time

import numpy as np
# import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
# from sklearn.preprocessing import StandardScaler
# from sklearn.random_projection import SparseRandomProjection

from neighborsMinHash.clustering import MinHashClustering 
# from neighborsMinHash.clustering import MinHashDBSCAN
from neighborsMinHash import MinHash



# Display names, in the same order as `clustering_algorithms` below.
clustering_names = [
    'SpectralClustering', 'MinHashSpectralClustering', 'DBSCAN', 'MinHashDBSCAN']

# original algorithms
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="nearest_neighbors", n_neighbors=10)
dbscan = cluster.DBSCAN(eps=.2, metric="euclidean")

# objects used for algorithms with precomputed minHash
# (one MinHash instance per wrapped estimator)
minHash0 = MinHash(n_neighbors=10)
minHash1 = MinHash(n_neighbors=10)

# Same estimators as above, but configured to consume a precomputed
# affinity / distance matrix instead of building one from the raw features.
spectralMinHash = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="precomputed", n_neighbors=10)

dbscanMinHash = cluster.DBSCAN(eps=.2, metric='precomputed')

# NOTE(review): presumably MinHashClustering feeds the wrapped estimator a
# matrix derived from the MinHash object -- confirm against neighborsMinHash.
minHashClusteringSpectralClustering = MinHashClustering(minHash0, spectralMinHash)
minHashClusteringDBSCAN = MinHashClustering(minHash1, dbscanMinHash)

clustering_algorithms=[spectral, minHashClusteringSpectralClustering, dbscan, minHashClusteringDBSCAN]

# Run every algorithm on the hashed sparse data and report the adjusted Rand
# index against y_true together with the wall-clock time of fit_predict.
X = data_sparse
for name, algorithm in zip(clustering_names, clustering_algorithms):
    t0 = time.time()
    y_pred = algorithm.fit_predict(X)
    t1 = time.time()
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from recent NumPy releases; the builtin gives the same dtype.
    y_pred = y_pred.astype(int)
    # Score limited to two decimals (the str -> float round trip is kept so
    # the printed value is unchanged).
    score = float("{0:.2f}".format(adjusted_rand_score(y_true, y_pred)))
    # A single pre-formatted string prints the same text as the former
    # Python 2 print statement and is also valid on Python 3.
    print("%s :\t%s \t\tComputation time:  %s" % (name, score, t1 - t0))

Automatically created module for IPython interactive environment
SpectralClustering :	-0.0 		Computation time:  18.1502420902
MinHashSpectralClustering :	0.0 		Computation time:  16.2415990829
DBSCAN :	0.0 		Computation time:  6.18379712105
MinHashDBSCAN :	0.0 		Computation time:  2.189920187
