***

Perform DBSCAN clustering from vector array or distance matrix.
    
##    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.

Finds core samples of high density and expands clusters from them.
Good for data which contains clusters of similar density.

---
### Parameters:
---
*    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.
        
*    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        
*    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by sklearn.metrics.pairwise_distances for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.
        
        metric *precomputed* to accept precomputed sparse matrix.
        
        
*    metric_params : dict, optional
        Additional keyword arguments for the metric function.
        
*    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        
*    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.
        
*    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
        
*    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.
---
Attributes:
----------
*    core_sample_indices_ : array, shape = [n_core_samples]
        Indices of core samples.
        
*    components_ : array, shape = [n_core_samples, n_features]
        Copy of each core sample found by training.
        
*    labels_ : array, shape = [n_samples]
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.


---
Examples
-------

In [1]:
from sklearn.cluster import DBSCAN
import numpy as np
X = np.array([[1, 2], [2, 2], [2, 3],
              [8, 7], [8, 8], [25, 80]])
clustering = DBSCAN(eps=3, min_samples=2).fit(X)
clustering.labels_
np.array([ 0,  0,  0,  1,  1, -1])
clustering # doctest: +NORMALIZE_WHITESPACE

DBSCAN(algorithm='auto', eps=3, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=2, n_jobs=1, p=None)

In [2]:
DBSCAN(algorithm='auto', eps=3, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=2, n_jobs=None, p=None)

DBSCAN(algorithm='auto', eps=3, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=2, n_jobs=None, p=None)

***
### Generate sample data

In [1]:
print(__doc__)

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

Automatically created module for IPython interactive environment


In [2]:
centers = [[-2, -1], [-0.5, 2], [2.1, 0]]
X, labels_true = make_blobs(n_samples=20000, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

### Compute DBSCAN

In [3]:
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

#### Number of clusters in labels, ignoring noise if present.

In [6]:
#n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

#print('Estimated number of clusters: %d' % n_clusters_)

#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))

#print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))

#print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))

#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(labels_true, labels))

#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(labels_true, labels))

#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(X, labels))

NameError: name 'n_clusters_' is not defined

### Plot result

#### Black removed and is used for noise instead.

In [7]:
import matplotlib.pyplot as plt

unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

NameError: name 'labels' is not defined

***
***

In [10]:
from sklearn.cluster import DBSCAN
import numpy as np
X = np.array([[1, 2], [2, 2], [2, 3],
              [8, 7], [8, 8], [25, 80]])
clustering = DBSCAN(eps=3, min_samples=2).fit(X)
print(clustering.labels_)

[ 0  0  0  1  1 -1]


In [8]:
clustering

DBSCAN(algorithm='auto', eps=3, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=2, n_jobs=1, p=None)