# SpectralClustering and DBSCAN with minHash

In [1]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from scipy.sparse import csr_matrix
from scipy import stats
import hashlib
from random import randint
import numpy as np


sparsity_factor = 0.01
samples = 600
features = 100
classes = 2
clusters_per_class = 2
scale_ = 1.0
threshold = -0.2
data_dense, y_true = make_classification(n_samples=samples, n_features=features, n_informative=2, 
                                 n_redundant=0, n_repeated=0, n_classes=classes, 
                                 n_clusters_per_class=clusters_per_class, weights=None, flip_y=0.01,
                                 class_sep=1.0, hypercube=True, shift=0.0, scale=scale_,
                                 shuffle=True, random_state=1)
print data_dense
instances_list = []
features_list = []
data_list = []
for i in xrange(len(data_dense)):
    average_value_of_list = np.average(data_dense[i])
    variance = np.var(data_dense[i])
#     print "Average: ", average_value_of_list
#     print "var: ", variance
    if average_value_of_list < 0:
        data_with_threshold = stats.threshold(data_dense[i], threshmin=average_value_of_list-variance, threshmax=average_value_of_list+variance, newval=0.0)
    else:
        data_with_threshold = stats.threshold(data_dense[i], threshmin=average_value_of_list-variance, threshmax=average_value_of_list+variance, newval=0.0)

    #     data_with_threshold
    for j in xrange(len(data_dense[i])):
        instances_list.append(i)
#         features_list.append(j)
        hash_object = hashlib.sha256(str(j))
        hex_dig = hash_object.hexdigest()
        features_list.append(int(hex_dig, 16) % (features/sparsity_factor))
#         features_list.append(hash(str(j)) % (features/sparsity_factor))
        data_list.append(data_with_threshold[j])
#         data_list.append(data_dense[i][j])
#     print "\n"
data_sparse = csr_matrix((data_list, (instances_list, features_list)))
data_sparse.eliminate_zeros()
# print data_sparse
print data_sparse.getnnz(1)
# print data_with_threshold

[[-1.34831809  1.2516397  -0.05259496 ...,  0.29353471 -0.50648978
  -1.51936458]
 [-1.64043561 -2.06974177  2.28306064 ..., -0.08949769  0.31851596
   0.19447141]
 [ 0.9258695   1.02981673  0.15674275 ..., -0.05183957 -0.30417372
  -0.80135467]
 ..., 
 [ 1.58906495  0.09964962 -0.52485075 ..., -1.05337874  0.70106004
   1.04552039]
 [-1.94478665 -1.51207025 -1.96563778 ...,  0.92153509 -2.50381084
   0.27253074]
 [-0.72420961 -2.5198995   3.01211113 ...,  1.23337709 -1.15124837
   0.9060618 ]]
[66 64 67 62 64 65 64 59 61 62 65 64 62 52 58 64 62 64 63 65 64 58 63 62 62
 65 63 64 74 66 66 65 69 68 69 62 63 61 57 70 60 68 59 60 63 67 64 60 65 65
 73 59 62 70 68 64 62 58 58 69 70 62 57 59 61 53 61 62 60 65 65 64 60 65 64
 71 71 59 67 71 72 64 70 67 65 58 58 67 61 63 68 58 61 60 62 66 63 62 65 72
 61 57 58 70 68 70 65 67 67 53 58 55 56 54 64 60 69 70 63 69 72 62 70 67 56
 65 64 56 62 60 64 61 67 59 65 65 59 67 63 70 65 63 60 64 64 62 63 61 58 63
 68 64 67 68 58 58 57 58 71 65 61 68 68 62 6

In [2]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score
from pympler import asizeof
neighbors = NearestNeighbors(n_jobs=4)
neighbors.fit(data_dense)
%time neighbors_dense = neighbors.kneighbors(n_neighbors=5, return_distance=False)

neighbors_sparse = NearestNeighbors(n_jobs=4)
neighbors_sparse.fit(data_sparse)
%time neighbors_sparse = neighbors_sparse.kneighbors(n_neighbors=5, return_distance=False)

accuracy_score_ = 0.0
for x, y in zip(neighbors_dense, neighbors_sparse):
    accuracy_score_ += accuracy_score(x, y)
print "Accuracy_approx: ", accuracy_score_ / float(len(neighbors_dense))
# print neighbors_dense
# print neighbors_sparse

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 130 ms
CPU times: user 40 ms, sys: 28 ms, total: 68 ms
Wall time: 148 ms
Accuracy_approx:  0.012


In [None]:
%matplotlib inline
%load_ext memory_profiler
print(__doc__)

import time

import numpy as np
# import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
# from sklearn.preprocessing import StandardScaler
# from sklearn.random_projection import SparseRandomProjection

from neighborsMinHash.clustering import MinHashClustering 
# from neighborsMinHash.clustering import MinHashDBSCAN
from neighborsMinHash import MinHash
from memory_profiler import memory_usage


clustering_names = [
    'SpectralClustering', 'MinHashSpectralClustering', 'DBSCAN', 'MinHashDBSCAN']

# original algorithms
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="nearest_neighbors", n_neighbors=5)
dbscan = cluster.DBSCAN(eps=.2, metric="euclidean")

# objects used for algorithms with precomputed minHash
minHash0 = MinHash(n_neighbors=5)
minHash1 = MinHash(n_neighbors=5)

spectral_precomputed = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="precomputed", n_neighbors=5)

dbscanMinHash = cluster.DBSCAN(eps=.2, metric='precomputed')


minHashClusteringSpectralClustering = MinHashClustering(minHash0, spectral_precomputed)
minHashClusteringDBSCAN = MinHashClustering(minHash1, dbscanMinHash)

clustering_algorithms=[spectral, minHashClusteringSpectralClustering, dbscan, minHashClusteringDBSCAN]
    
X = data_sparse
# print "Size of dense data: ", asizeof.asizeof(data_dense)
# print "Size of sparse data", asizeof.asizeof(data_sparse)
for name, algorithm in zip(clustering_names, clustering_algorithms):
    print "\n"
    t0 = time.time()
    %time %memit y_pred = algorithm.fit_predict(X)
    t1 = time.time()
    
    y_pred = y_pred.astype(np.int)
    print name, ":\tAccuracy: ", float("{0:.2f}".format(adjusted_rand_score(y_true, y_pred)))

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
Automatically created module for IPython interactive environment


peak memory: 648.00 MiB, increment: 0.00 MiB
CPU times: user 396 ms, sys: 44 ms, total: 440 ms
Wall time: 571 ms
SpectralClustering :	Accuracy:  0.0


