### Поређење алгоритама за кластеризацију на 2Д подацима

In [1]:
import time
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Seed NumPy's global RNG. The generators below that are called without an
# explicit ``random_state`` (make_circles, make_moons, np.random.rand) rely on
# it, so the order of those statements determines the data each one gets.
np.random.seed(0)

n_samples = 1500

# Blobs pushed through a fixed 2x2 linear map (stored as ``an``).
X, y = datasets.make_blobs(n_samples=n_samples, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_an = np.dot(X, transformation)
an = (X_an, y)

# Two concentric circles and two interleaving half-moons, both with noise=.05.
circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
moons = datasets.make_moons(n_samples=n_samples, noise=.05)
# Plain blobs with their own fixed seed.
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
# Uniform random points in the unit square; there are no labels, hence None.
no_structure = np.random.rand(n_samples, 2), None

# NOTE(review): this rebinds the name ``datasets`` from the imported
# ``sklearn.datasets`` module to a plain list of (X, y) pairs. Any later use of
# ``datasets.make_*`` fails until the module is imported again.
datasets = [circles, moons, blobs, an, no_structure]

# Display names used as subplot column titles, one per clustering algorithm.
clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']

In [3]:
def plot_clusters(datasets, clustering_names, clustering_algorithms):
    """Fit each clustering algorithm on each dataset and plot the results.

    The figure is a grid of scatter plots with one row per dataset and one
    column per algorithm. Points are coloured by predicted cluster, cluster
    centres are overlaid when the fitted estimator exposes
    ``cluster_centers_``, and the fit time in seconds is written as a text
    label anchored at the maximum x and just above the minimum y of the
    standardised data.

    Args:
        datasets: Sequence of ``(X, y)`` pairs. Only ``X`` is used; it is
            standardised with ``StandardScaler`` before fitting.
        clustering_names: Column titles, in the same order as
            ``clustering_algorithms``.
        clustering_algorithms: Estimator objects with a ``fit`` method and
            either a ``labels_`` attribute or a ``predict`` method. Every
            estimator is re-fitted in place for every dataset.

    Returns:
        The populated plotly figure.
    """
    fig = make_subplots(
        rows=len(datasets),
        cols=len(clustering_names),
        subplot_titles=clustering_names,
    )
    fig.update_layout(
        autosize=False,
        width=1200,
        height=800,
        margin=go.layout.Margin(l=10, r=10, b=10, t=50, pad=0),
        template="plotly_dark",
        showlegend=False,
    )
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    for row, (X, _) in enumerate(datasets, start=1):
        X = StandardScaler().fit_transform(X)

        # The timing label position depends only on the dataset, so compute it
        # once per row instead of once per algorithm.
        label_x = X[:, 0].max()
        label_y = X[:, 1].min() + 0.1

        # zip() keeps the original pairing of names and estimators (and its
        # truncation to the shorter of the two sequences).
        paired = zip(clustering_names, clustering_algorithms)
        for col, (_, algorithm) in enumerate(paired, start=1):
            # perf_counter() is monotonic and high-resolution, which makes it
            # the appropriate clock for measuring elapsed time.
            started = time.perf_counter()
            algorithm.fit(X)
            elapsed = time.perf_counter() - started

            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            fig.add_trace(
                go.Scatter(
                    x=X[:, 0],
                    y=X[:, 1],
                    mode='markers',
                    marker=dict(
                        size=2.5,
                        color=y_pred,
                        colorscale='Agsunset',
                    ),
                ),
                row=row,
                col=col,
            )

            if hasattr(algorithm, 'cluster_centers_'):
                centers = algorithm.cluster_centers_
                fig.add_trace(
                    go.Scatter(
                        x=centers[:, 0],
                        y=centers[:, 1],
                        mode='markers',
                        marker=dict(
                            size=10,
                            color=np.arange(len(centers)),
                            colorscale='Agsunset',
                            line=dict(width=2, color='darkred'),
                        ),
                    ),
                    row=row,
                    col=col,
                )

            # Single-point text trace carrying the fit time; one text entry
            # for the one (x, y) point.
            fig.add_trace(
                go.Scatter(
                    x=[label_x],
                    y=[label_y],
                    mode="text",
                    showlegend=False,
                    name="",
                    textposition="bottom center",
                    text=[f'{round(elapsed, 2)}s'],
                    hoverinfo='none',
                ),
                row=row,
                col=col,
            )

    return fig

In [4]:
def apply_cluster_algorithm(X, n_clusters=2):
    """Build the list of clustering estimators parameterised for ``X``.

    ``X`` is standardised first; the standardised data is used to estimate
    the MeanShift bandwidth and to build the neighbour graph handed to both
    agglomerative estimators. No estimator is fitted here.

    Args:
        X: Feature matrix to derive the data-dependent parameters from.
        n_clusters: Number of clusters for the estimators that take one.

    Returns:
        A list of eight estimators, in this order: MiniBatchKMeans,
        AffinityPropagation, MeanShift, SpectralClustering, ward-linkage
        AgglomerativeClustering, average-linkage AgglomerativeClustering,
        DBSCAN, Birch.
    """
    scaled = StandardScaler().fit_transform(X)

    # 10-nearest-neighbour graph, averaged with its transpose so that the
    # resulting connectivity matrix is symmetric.
    knn_graph = kneighbors_graph(scaled, n_neighbors=10, include_self=False)
    symmetric_graph = 0.5 * (knn_graph + knn_graph.T)

    mean_shift_bandwidth = cluster.estimate_bandwidth(scaled, quantile=0.3)

    mini_batch_kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters)
    affinity_propagation = cluster.AffinityPropagation(
        damping=.9,
        preference=-200,
    )
    mean_shift = cluster.MeanShift(
        bandwidth=mean_shift_bandwidth,
        bin_seeding=True,
    )
    spectral = cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver='arpack',
        affinity="nearest_neighbors",
    )
    ward = cluster.AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage='ward',
        connectivity=symmetric_graph,
    )
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        n_clusters=n_clusters,
        connectivity=symmetric_graph,
    )
    dbscan = cluster.DBSCAN(eps=.18)
    birch = cluster.Birch(n_clusters=n_clusters)

    return [
        mini_batch_kmeans,
        affinity_propagation,
        mean_shift,
        spectral,
        ward,
        average_linkage,
        dbscan,
        birch,
    ]


# The previous loop rebuilt the estimator list for every dataset and overwrote
# it on each pass, so only the list built from the last dataset was ever kept.
# Build it once from that dataset: same result, none of the discarded work.
# NOTE(review): the MeanShift bandwidth and the connectivity graph inside these
# estimators are therefore derived from the last dataset only, yet the list is
# later applied to every dataset. Confirm whether per-dataset estimators were
# intended.
X, y = datasets[-1]
clustering_algorithms = apply_cluster_algorithm(X)

In [5]:
# Bare expression: the notebook displays the repr of the estimator list.
clustering_algorithms

[MiniBatchKMeans(n_clusters=2),
 AffinityPropagation(damping=0.9, preference=-200),
 MeanShift(bandwidth=1.3271069873041095, bin_seeding=True),
 SpectralClustering(affinity='nearest_neighbors', eigen_solver='arpack',
                    n_clusters=2),
 AgglomerativeClustering(connectivity=<1500x1500 sparse matrix of type '<class 'numpy.float64'>'
 	with 17196 stored elements in Compressed Sparse Row format>),
 AgglomerativeClustering(connectivity=<1500x1500 sparse matrix of type '<class 'numpy.float64'>'
 	with 17196 stored elements in Compressed Sparse Row format>,
                         linkage='average'),
 DBSCAN(eps=0.18),
 Birch(n_clusters=2)]

In [6]:
# Fit every estimator on every dataset, build the comparison grid and show it.
fig = plot_clusters(datasets, clustering_names, clustering_algorithms)
fig.show()

  File "c:\Users\kosti\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\kosti\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\kosti\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\kosti\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 1420, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


## Вредност силуете - *Silhouette score*

Вредност силуете је мера колико је објекат сличан свом кластеру у поређењу са другим кластерима. Ова вредност варира у опсегу [-1, 1], тако да:

- Вредност близу +1 указује на то да је пример далеко од суседних кластера, то јест генерисани кластери су добро сепарирани;
- Вредност близу 0 указује на то да је пример на самој граници одлуке између два суседна кластера или веома близу ње, што значи да се кластери можда преклапају;
- Негативна вредност указује на то да је пример можда додељен погрешном кластеру и да можда постоји сличнији кластер.

## Коефицијент силуете

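Коефицијент силуете се рачуна по следећој формули:

$$s = \frac{b - a}{\max(a, b)}$$

где је $a$ средње растојање примера до осталих примера у његовом кластеру, а $b$ средње растојање примера до примера у најближем суседном кластеру.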


In [9]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

n_samples = 1500

# ---- Figure 1: DBSCAN on standardised, well-separated blobs ----
X, y = blobs = datasets.make_blobs(n_samples=n_samples, random_state=42)
X = StandardScaler().fit_transform(X)

db = DBSCAN(eps=0.25)
db.fit(X)
labels = db.labels_
# Label -1 is excluded from the cluster count and tallied separately as
# out-of-cluster points.
n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_1 = list(labels).count(-1)

df = pd.DataFrame(X, columns=['x', 'y'])
df['cluster'] = labels

# The labels are passed unfiltered, so any -1 label takes part in the score
# like an ordinary cluster label.
score_1 = silhouette_score(X, labels)

subplot_titles = ["DBSCAN кластеризација јасно сепарираних Blob података", "DBSCAN кластеризација Blob података са додатим шумом"]

fig = make_subplots(rows=1, cols=2, subplot_titles=subplot_titles)

fig.add_trace(
    go.Scatter(
        x=df['x'],
        y=df['y'],
        mode='markers',
        marker=dict(color=df['cluster']),
        showlegend=False,
    ),
    row=1,
    col=1,
)

# ---- Figure 2: the same blobs with uniform noise added ----
X, y = blobs = datasets.make_blobs(n_samples=n_samples, random_state=42)
X = StandardScaler().fit_transform(X)

# Shift every coordinate by an independent uniform draw from [-0.4, 0.4).
X += np.random.uniform(-0.4, 0.4, size=X.shape)

db = DBSCAN(eps=0.25)
db.fit(X)
labels = db.labels_
n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_2 = list(labels).count(-1)

df = pd.DataFrame(X, columns=['x', 'y'])
df['cluster'] = labels

score_2 = silhouette_score(X, labels)

fig.add_trace(
    go.Scatter(
        x=df['x'],
        y=df['y'],
        mode='markers',
        marker=dict(color=df['cluster']),
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_layout(
    autosize=False,
    width=1200,
    height=600,
    template="plotly_dark",
)

fig.show()

# Each summary reports the statistics of its own figure. Previously the
# Figure 1 summary printed the Figure 2 cluster and noise counts.
print('******** Фигура 1 ********')
print(f'Silhouette Score: {score_1}')
print(f'Број кластера {n_clusters_1}')
print(f'Број ванграничних тачака: {n_noise_1}')

print('******** Фигура 2 ********')
print(f'Silhouette Score: {score_2}')
print(f'Број кластера {n_clusters_2}')
print(f'Број ванграничних тачака: {n_noise_2}')

******** Фигура 1 ********
Silhouette Score: 0.8441415002720908
Број кластера 3
Врој ванграничних тачака: 3
******** Фигура 2 ********
Silhouette Score: 0.5989892017348671
Број кластера 3
Врој ванграничних тачака: 3
