In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

n_samples = 1500
dataset = datasets.make_blobs(n_samples=n_samples, centers=2, center_box=(-7.0, 7.5),
                              cluster_std=[1.4, 1.7],
                              random_state=42)
X_2, _ = datasets.make_blobs(n_samples=n_samples, random_state=170, centers=[[-4, -3]], cluster_std=[1.9])
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)
X, y = np.concatenate((dataset[0], X_2)), np.concatenate((dataset[1], np.array([2] * len(X_2))))


In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import v_measure_score

# сначала получим предсказанные кластеры при помощи метода кластеризации
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
kmeans_pred = kmeans.labels_

# теперь посчитаем v_measure
v_measure_score(labels_true=y, labels_pred=kmeans_pred)


0.795592702456356

In [4]:
# 4.14.1
from sklearn.mixture import GaussianMixture


# GaussianMixture
def calc_gaussianmixture_v_measure_score(components=3):
    gm = GaussianMixture(n_components=components, random_state=42)
    gm.fit(X)
    y_pred = gm.predict(X)
    return v_measure_score(labels_true=y, labels_pred=y_pred)

calc_gaussianmixture_v_measure_score()


0.9332760595996924

In [11]:
# 4.14.2
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN


# K-means
def calc_kmeans_v_measure_score(clusters=3, method_init='k-means++', num_init=10):
    kmeans = KMeans(n_clusters=clusters, random_state=42, init=method_init, n_init=num_init)
    kmeans.fit(X)
    kmeans_pred = kmeans.labels_
    return v_measure_score(labels_true=y, labels_pred=kmeans_pred)


# Агломеративная кластеризация
def calc_agglomerative_clustering_v_measure_score(clusters=3, linkage='ward'):
    ac = AgglomerativeClustering(n_clusters=clusters, linkage=linkage) 
    ac.fit(X)
    y_pred = ac.labels_.astype(np.int)
    return v_measure_score(labels_true=y, labels_pred=y_pred)


# DBSCAN
def calc_dbscan_v_measure_score(epsilon=0.9, samples=35):
    dbscan = DBSCAN(eps=epsilon, min_samples=samples)
    dbscan.fit(X)
    y_pred = dbscan.labels_.astype(np.int)
    return v_measure_score(labels_true=y, labels_pred=y_pred)


print('K-means v_measure_score: ', calc_kmeans_v_measure_score())
print('GaussianMixture v_measure_score: ', calc_gaussianmixture_v_measure_score())
print('AgglomerativeClustering v_measure_score: ', calc_agglomerative_clustering_v_measure_score())
print('DBSCAN v_measure_score: ', calc_dbscan_v_measure_score())


K-means v_measure_score:  0.795592702456356
GaussianMixture v_measure_score:  0.9332760595996924


AgglomerativeClustering v_measure_score:  0.703747024360433
DBSCAN v_measure_score:  0.7732549110297919


In [9]:
# 4.14.3

print('K-means v_measure_score: ', calc_kmeans_v_measure_score(method_init='k-means++', num_init=1))
print('K-means v_measure_score: ', calc_kmeans_v_measure_score(method_init='random', num_init=1))


K-means v_measure_score:  0.7911556191133551
K-means v_measure_score:  0.5990620007488798


In [10]:
# 4.14.4
from sklearn.cluster import MiniBatchKMeans


# MiniBatchKMeans
def calc_kmeans_mini_batch_v_measure_score(clusters=3, method_init='k-means++', num_init=10):
    kmeans_mini_batch = MiniBatchKMeans(n_clusters=clusters, random_state=42, init=method_init, n_init=num_init)
    kmeans_mini_batch.fit(X)
    kmeans_mini_batch_pred = kmeans.labels_
    return v_measure_score(labels_true=y, labels_pred=kmeans_mini_batch_pred)


print('K-means v_measure_score: ', calc_kmeans_v_measure_score(num_init=1))
print('MiniBatchKMeans v_measure_score: ', calc_kmeans_mini_batch_v_measure_score(num_init=1))


K-means v_measure_score:  0.7911556191133551
MiniBatchKMeans v_measure_score:  0.795592702456356


In [12]:
# 4.14.5

print('AgglomerativeClustering v_measure_score: ', calc_agglomerative_clustering_v_measure_score(linkage='ward'))
print('AgglomerativeClustering v_measure_score: ', calc_agglomerative_clustering_v_measure_score(linkage='complete'))
print('AgglomerativeClustering v_measure_score: ', calc_agglomerative_clustering_v_measure_score(linkage='average'))
print('AgglomerativeClustering v_measure_score: ', calc_agglomerative_clustering_v_measure_score(linkage='single'))


AgglomerativeClustering v_measure_score:  0.703747024360433


AgglomerativeClustering v_measure_score:  0.4105143541946565


AgglomerativeClustering v_measure_score:  0.5391616325942709
AgglomerativeClustering v_measure_score:  0.0008842106330108959


In [14]:
# 4.14.6
from sklearn.neighbors import kneighbors_graph

connectivity = kneighbors_graph(X, n_neighbors=6, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)

ac = AgglomerativeClustering(n_clusters=3, connectivity=connectivity)
ac.fit(X)
y_pred = ac.labels_.astype(np.int)

print(v_measure_score(labels_true=y, labels_pred=y_pred))
print(calc_agglomerative_clustering_v_measure_score(clusters=3))


0.902634398342219


0.703747024360433


In [17]:
# 4.14.7

print('DBSCAN v_measure_score [eps=0.9, min_samples=35]: ', calc_dbscan_v_measure_score(epsilon=0.9, samples=35))
print('DBSCAN v_measure_score [eps=0.8, min_samples=35]: ', calc_dbscan_v_measure_score(epsilon=0.8, samples=35))


DBSCAN v_measure_score [eps=0.9, min_samples=35]:  0.7732549110297919
DBSCAN v_measure_score [eps=0.8, min_samples=35]:  0.7063100442576922


In [19]:
# 4.14.8
def calc_dbscan_without_noise_v_measure_score(epsilon=0.9, samples=35):
    dbscan = DBSCAN(eps=epsilon, min_samples=samples)
    dbscan.fit(X)
    y_pred = dbscan.labels_.astype(np.int)
    return v_measure_score(labels_true=y[y_pred != -1], labels_pred=y_pred[y_pred != -1])


calc_dbscan_without_noise_v_measure_score()


0.9667281300681958

In [22]:
# 4.14.9
from sklearn.preprocessing import StandardScaler

print(calc_agglomerative_clustering_v_measure_score())

X_st = StandardScaler().fit_transform(X)
ac = AgglomerativeClustering(n_clusters=3) 
ac.fit(X_st)
y_pred = ac.labels_.astype(np.int)
print(v_measure_score(labels_true=y, labels_pred=y_pred))


0.703747024360433


0.9079037199053294


In [23]:
# 4.14.10
from sklearn.preprocessing import MinMaxScaler

print(calc_agglomerative_clustering_v_measure_score())

X_minmax = MinMaxScaler().fit_transform(X)
ac = AgglomerativeClustering(n_clusters=3) 
ac.fit(X_minmax)
y_pred = ac.labels_.astype(np.int)
print(v_measure_score(labels_true=y, labels_pred=y_pred))


0.703747024360433


0.8929241488344335
