In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Number of points generated by each make_blobs call below.
n_samples = 1500

# Two Gaussian blobs with different spreads; make_blobs returns (features, labels).
dataset = datasets.make_blobs(
    n_samples=n_samples,
    centers=2,
    center_box=(-7.0, 7.5),
    cluster_std=[1.4, 1.7],
    random_state=42,
)

# A third blob centred at (-4, -3); its generated labels are discarded.
X_2, _ = datasets.make_blobs(
    n_samples=n_samples,
    centers=[[-4, -3]],
    cluster_std=[1.9],
    random_state=170,
)

# Apply a linear map to the third blob so it is stretched/sheared rather than round.
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)

# Stack all points together; the transformed blob is given label 2.
X = np.concatenate((dataset[0], X_2))
y = np.concatenate((dataset[1], np.full(len(X_2), 2)))


In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# First obtain the predicted clusters with a clustering method (K-means, 3 clusters).
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
kmeans_pred = kmeans.labels_

# Now compute the silhouette coefficient for those labels (shown as the cell output).
silhouette_score(X, kmeans_pred, metric='euclidean')


[2 2 1 ... 0 0 0]


0.5131209788437305

In [7]:
# 4.11.1
from sklearn.mixture import GaussianMixture

# Fit a 3-component Gaussian mixture on X and assign each sample to a component.
gm = GaussianMixture(n_components=3, random_state=42).fit(X)
y_pred = gm.predict(X)

# Silhouette coefficient of that assignment (shown as the cell output).
silhouette_score(X, y_pred, metric='euclidean')


0.49893287606943293

In [13]:
# 4.11.2
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN


# K-means
def calc_kmeans_silhouette_score(clusters=3):
    """Cluster the notebook-level ``X`` with K-means and score the result.

    Args:
        clusters: Number of clusters passed to ``KMeans`` (default 3).

    Returns:
        The silhouette coefficient (Euclidean metric) of the fitted labels.
    """
    # random_state is fixed so repeated calls give the same labels.
    model = KMeans(n_clusters=clusters, random_state=42).fit(X)
    return silhouette_score(X, model.labels_, metric='euclidean')


# GaussianMixture
def calc_gaussianmixture_silhouette_score(components=3):
    """Fit a Gaussian mixture on the notebook-level ``X`` and score the result.

    Args:
        components: Number of mixture components passed to
            ``GaussianMixture`` (default 3).

    Returns:
        The silhouette coefficient (Euclidean metric) of the labels
        predicted by the fitted mixture for ``X``.
    """
    # random_state is fixed so repeated calls give the same fit.
    mixture = GaussianMixture(n_components=components, random_state=42).fit(X)
    assigned = mixture.predict(X)
    return silhouette_score(X, assigned, metric='euclidean')


# Agglomerative clustering
def calc_agglomerative_clustering_silhouette_score(clusters=3):
    """Cluster the notebook-level ``X`` agglomeratively and score the result.

    Args:
        clusters: Number of clusters passed to ``AgglomerativeClustering``
            (default 3).

    Returns:
        The silhouette coefficient (Euclidean metric) of the fitted labels.
    """
    ac = AgglomerativeClustering(n_clusters=clusters)
    ac.fit(X)
    # The builtin ``int`` is used because the ``np.int`` alias was removed in
    # NumPy 1.24 and raises AttributeError there; the cast itself is unchanged.
    y_pred = ac.labels_.astype(int)
    return silhouette_score(X=X, labels=y_pred, metric='euclidean')


# DBSCAN
def calc_dbscan_silhouette_score(epsilon=0.9, samples=35):
    """Cluster the notebook-level ``X`` with DBSCAN and score the result.

    Args:
        epsilon: Neighbourhood radius passed to ``DBSCAN`` as ``eps``
            (default 0.9).
        samples: Value passed to ``DBSCAN`` as ``min_samples`` (default 35).

    Returns:
        The silhouette coefficient (Euclidean metric) of the fitted labels.

    Note:
        The labels are scored exactly as DBSCAN produced them. DBSCAN marks
        noise points with label -1, and they are not filtered out here, so
        they count as one more cluster in the score.
    """
    dbscan = DBSCAN(eps=epsilon, min_samples=samples)
    dbscan.fit(X)
    # The builtin ``int`` is used because the ``np.int`` alias was removed in
    # NumPy 1.24 and raises AttributeError there; the cast itself is unchanged.
    y_pred = dbscan.labels_.astype(int)
    return silhouette_score(X=X, labels=y_pred, metric='euclidean')


# Report the silhouette score of every algorithm using its default parameters.
for caption, scorer in (
    ('K-means silhouette_score: ', calc_kmeans_silhouette_score),
    ('GaussianMixture silhouette_score: ', calc_gaussianmixture_silhouette_score),
    ('AgglomerativeClustering silhouette_score: ', calc_agglomerative_clustering_silhouette_score),
    ('DBSCAN silhouette_score: ', calc_dbscan_silhouette_score),
):
    print(caption, scorer())


K-means silhouette_score:  0.5131209788437305


GaussianMixture silhouette_score:  0.49893287606943293


AgglomerativeClustering silhouette_score:  0.4811992210663849


DBSCAN silhouette_score:  0.4454335539277996


In [19]:
# 4.11.3

# For each algorithm, find the cluster count in 2..9 with the highest
# silhouette score. Each *_value holds (cluster count, best score so far);
# (-1, -1) is the starting sentinel.
kmeans_value = (-1, -1)
gaussian_value = (-1, -1)
agglomerative_value = (-1, -1)

for x in range(2, 10):
    # Each scoring call refits a model, so compute every score once per
    # candidate and reuse it for both the comparison and the stored result.
    kmeans_score = calc_kmeans_silhouette_score(x)
    if kmeans_value[1] < kmeans_score:
        kmeans_value = (x, kmeans_score)

    gaussian_score = calc_gaussianmixture_silhouette_score(x)
    if gaussian_value[1] < gaussian_score:
        gaussian_value = (x, gaussian_score)

    agglomerative_score = calc_agglomerative_clustering_silhouette_score(x)
    if agglomerative_value[1] < agglomerative_score:
        agglomerative_value = (x, agglomerative_score)

# The strict "<" above means ties keep the smaller cluster count.
print(kmeans_value[0], " ", gaussian_value[0], " ", agglomerative_value[0])


3   4   4
