In [1]:
# Load libraries
import numpy as np
import scipy.special

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift, DBSCAN
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA

from bokeh.plotting import figure,output_notebook, show
from bokeh.layouts import gridplot,column, row
from bokeh.models import ColumnDataSource,ColorBar,Plot, VBar,Grid, LinearAxis
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10

# Data Preprocessing

In [2]:
# Load data
iris = datasets.load_iris()
features = iris.data
print(features[:10],features.shape)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]] (150, 4)


In [3]:
# Standardize features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
features_std[:10]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648]])

# Mean Shift Clustering

In [12]:
%time
# Create mean shift object
cluster = MeanShift(n_jobs=-1,cluster_all=False)
# Train model
model = cluster.fit(features_std)

CPU times: total: 0 ns
Wall time: 0 ns


In [13]:
model.labels_

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  0,
        0,  0, -1,  0,  0,  0, -1,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,
        0,  0,  0, -1, -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0, -1, -1,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [14]:
model.cluster_centers_

array([[ 0.50161528, -0.32287436,  0.65393539,  0.65261739],
       [-1.05954571,  0.75811468, -1.2998088 , -1.25401594]])

In [15]:
# Convert cluster labels to strings
cluster_labels_str = [str(label) for label in model.labels_]
cluster_labels_str[:10]

['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

In [16]:
# Reduce dimensionality using PCA
pca = PCA(n_components=2)
data_2d = pca.fit_transform(features_std)

In [17]:
output_notebook()

In [18]:
# Create a Bokeh ColumnDataSource
source = ColumnDataSource(data=dict(x=data_2d[:, 0], y=data_2d[:, 1], color=cluster_labels_str))

# Create a scatter plot with factor_cmap for color mapping
plot = figure(title="Mean Shift Clustering with PCA", x_axis_label='Principal Component 1', 
              y_axis_label='Principal Component 2', width=800, height=600)
plot.scatter(x='x', y='y', size=8, color=factor_cmap('color', palette=Category10[3], factors=["0","1","2"]), source=source)
# Show the plot
show(plot)

# DBScan Clustering

In [19]:
# Create DBSCAN object
cluster = DBSCAN(n_jobs=-1)
# Train model
model = cluster.fit(features_std)

In [20]:
model.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,
        1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1],
      dtype=int64)

In [21]:
model.cluster_centers_

AttributeError: 'DBSCAN' object has no attribute 'cluster_centers_'

In [22]:
# Convert cluster labels to strings
cluster_labels_str = [str(label) for label in model.labels_]
cluster_labels_str[:10]

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

In [23]:
# Create a Bokeh ColumnDataSource
source = ColumnDataSource(data=dict(x=data_2d[:, 0], y=data_2d[:, 1], color=cluster_labels_str))

# Create a scatter plot with factor_cmap for color mapping
plot = figure(title="Mean Shift Clustering with PCA", x_axis_label='Principal Component 1', 
              y_axis_label='Principal Component 2', width=800, height=600)
plot.scatter(x='x', y='y', size=8, color=factor_cmap('color', palette=Category10[3], factors=["0","1","2"]), source=source)
# Show the plot
show(plot)

# End Of Density based clustering