In [None]:
# 2020-10-25 created by Akson

In [None]:
# Code9.1
# Prepare the data: five 2-D Gaussian blobs of differing spread.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# One row per blob centre.
blob_centers = np.array([
    [0.2, 2.3],
    [-1.5, 2.3],
    [-2.8, 1.8],
    [-2.8, 2.8],
    [-2.8, 1.3],
])
# Standard deviation of each blob, in the same order as blob_centers.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])

# random_state pins the generated samples so the plot is reproducible.
X, y = make_blobs(
    n_samples=2000,
    centers=blob_centers,
    cluster_std=blob_std,
    random_state=7,
)

# Colour each point by the blob it was generated from.
plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
plt.show()

In [None]:
# Code9.2
# Cluster the data with K-means.

from sklearn.cluster import KMeans

k = 5
# Seeded so that the cluster ids and the order of the centres are the same on
# every run; K-means initialisation is random otherwise.
kmeans = KMeans(n_clusters = k, random_state = 42)
y_pred = kmeans.fit_predict(X)

print(y_pred)
# Identity check: is the array returned by fit_predict the very same object
# that the estimator stores in labels_?
print(y_pred is kmeans.labels_)
print(kmeans.cluster_centers_)
print()

In [None]:
# Code9.3
# Mini-batch K-means on the same data.

from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
# Left as the cell's last expression so the fitted estimator is displayed.
minibatch_kmeans.fit(X)

In [None]:
# Code9.4
# Load the image used for the colour-segmentation example.

from matplotlib.image import imread

# Path is relative to the notebook's working directory.
image = imread('../image/ladybug/ladybug.png')
plt.imshow(image)
plt.show()
# Print the array shape of the loaded image.
print(image.shape)

In [None]:
# Code9.5
# Compress (colour-quantise) the image with K-means.

# Take the channel count from the image instead of assuming 3: an RGBA image
# reshaped with (-1, 3) either raises or silently mixes channels of
# neighbouring pixels. A 2-D (greyscale) image is treated as one channel.
n_channels = image.shape[-1] if image.ndim == 3 else 1

# One row per pixel. NOTE: this rebinds X, which earlier cells use for the
# blob dataset.
X = image.reshape(-1, n_channels)
# Seeded so the chosen colours are the same on every run.
kmeans = KMeans(n_clusters = 5, random_state = 42).fit(X)
# Replace every pixel by the centre of the cluster it was assigned to, then
# restore the original image layout.
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

plt.imshow(segmented_img)
plt.show()

In [None]:
# Code9.6
# Use K-means as a preprocessing step: load the digits data and split it.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_digits, y_digits = load_digits(return_X_y = True)
# random_state pins the shuffle, so the train/test membership (and therefore
# every score computed from it) is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state = 42)

In [None]:
# Code9.7
# Compare plain logistic regression with a K-means + logistic regression pipeline.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline: logistic regression on the raw features.
log_reg = LogisticRegression(multi_class = "ovr", solver = "lbfgs", max_iter = 5000, random_state = 42)
log_reg.fit(X_train, y_train)
print('Logistic score: %s' % log_reg.score(X_test, y_test))

# Pipeline: the K-means step transforms the features before they reach the
# classifier. K-means is seeded like the classifier so that the comparison
# does not change from run to run because of random initialisation.
pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters = 50, random_state = 42)),
    ('log_reg', LogisticRegression(multi_class = "ovr", solver = "lbfgs", max_iter = 5000, random_state = 42)),
])

pipeline.fit(X_train, y_train)
print('With kmeans score: %s' % pipeline.score(X_test, y_test))


In [None]:
# Code9.8
# Search for the best number of clusters.

from sklearn.model_selection import GridSearchCV

# Try every n_clusters from 2 to 99 for the pipeline's 'kmeans' step.
param_grid = {"kmeans__n_clusters": range(2, 100)}
grid_clf = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,  # use all available cores
)
grid_clf.fit(X_train, y_train)

print(grid_clf.best_params_)
print(grid_clf.score(X_test, y_test))

In [None]:
# Code9.9
# Semi-supervised learning with K-means.

# Baseline: train on only the first n_labeled samples.
n_labeled = 50
log_reg = LogisticRegression(multi_class = "ovr", solver = "lbfgs", max_iter = 5000, random_state = 42)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
print('Only use 50 digit to train: %s' % log_reg.score(X_test, y_test))

# Split the training set into k clusters.
k = 50
kmeans = KMeans(n_clusters = k, random_state = 42)
# fit_transform gives one row per sample and one column per cluster
# (distance of the sample to that cluster's centre).
X_digits_dist = kmeans.fit_transform(X_train)
# For each cluster (column), the index of the training sample closest to its centre.
representative_digit_idx = np.argmin(X_digits_dist, axis = 0)
X_representative_digits = X_train[representative_digit_idx]

# Show the representative images, ten per row; round the row count up so the
# grid still has room when k is not a multiple of 10.
n_rows = (k + 9) // 10
plt.figure(figsize=(8, 2))
for index, X_representative_digit in enumerate(X_representative_digits):
    plt.subplot(n_rows, 10, index + 1)
    plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
    plt.axis('off')
plt.show()

# Label the representative images. The labels are looked up from y_train
# (standing in for a human annotator) rather than typed in by hand: which
# samples end up as representatives depends on the train/test split and on the
# clustering, so a fixed hand-typed list goes stale whenever either changes
# and would then attach wrong labels to the images.
y_representative_digits = y_train[representative_digit_idx]

log_reg = LogisticRegression(multi_class = "ovr", solver = "lbfgs", max_iter = 5000, random_state = 42)
log_reg.fit(X_representative_digits, y_representative_digits)
print('with kmeans labeld: %s' % log_reg.score(X_test, y_test))

# Label propagation: every sample inherits the label of its cluster's representative.
y_train_propagated = np.empty(len(X_train), dtype = np.int32)
for i in range(k):
    y_train_propagated[kmeans.labels_ == i] = y_representative_digits[i]

log_reg = LogisticRegression(multi_class = "ovr", solver = "lbfgs", max_iter = 5000, random_state = 42)
log_reg.fit(X_train, y_train_propagated)
print('with kmeans propagated: %s' % log_reg.score(X_test, y_test))

In [None]:
# Code9.10
# DBSCAN

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

# Seeded so the noisy moons (and therefore the labels and core-sample count
# printed below) are the same on every run.
X, y = make_moons(n_samples = 1000, noise = 0.05, random_state = 42)
dbscan = DBSCAN(eps = 0.2, min_samples = 5)
dbscan.fit(X)

# Cluster label assigned to each sample.
print(dbscan.labels_)
# Number of core samples found.
print(len(dbscan.core_sample_indices_))
# The core samples themselves.
print(dbscan.components_)



In [None]:
# Code9.11
# Prepare the data for the Gaussian mixture examples.

# Two blobs, pushed through a fixed 2x2 linear map.
X1, y1 = make_blobs(
    n_samples=1000,
    centers=((4, -4), (0, 0)),
    random_state=42,
)
X1 = np.dot(X1, np.array([[0.374, 0.95], [0.732, 0.598]]))

# A third blob, translated by (6, -8).
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + np.array([6, -8])

# Stack both parts into one dataset (rows of X1 first, then X2).
X = np.concatenate((X1, X2), axis=0)
y = np.concatenate((y1, y2), axis=0)

In [None]:
# Code9.12
# Gaussian mixture model

from sklearn.mixture import GaussianMixture

# Seeded so that the fitted parameters, the order of the components and the
# samples drawn below are the same on every run.
gm = GaussianMixture(n_components = 3, n_init = 10, random_state = 42)
gm.fit(X)

# Parameters estimated by the Gaussian mixture model: weights, means, covariances.
print(gm.weights_)
print(gm.means_)
print(gm.covariances_)

# Whether the algorithm converged.
print(gm.converged_)
# Number of iterations.
print(gm.n_iter_)
# Hard clustering of the original dataset (one component id per sample).
print(gm.predict(X))
# Soft clustering of the original dataset (per-component probabilities).
print(gm.predict_proba(X))

# The fitted model can also be used to generate new instances.
X_new, y_new = gm.sample(6)
print(X_new)
print(y_new)

In [None]:
# Code9.13
# Anomaly detection with the fitted Gaussian mixture.

# Per-sample score from the model (scikit-learn documents score_samples as
# the log-likelihood of each sample).
densities = gm.score_samples(X)
# Cut-off at the 4th percentile of the scores.
density_threshold = np.percentile(densities, 4)
# Samples scoring strictly below the cut-off are reported as anomalies.
anomalies = X[np.less(densities, density_threshold)]

print(densities)
print(density_threshold)
print(anomalies)

In [None]:
# Code9.14
# Information criteria of the fitted mixture on X: BIC first, then AIC,
# one per line.

print(gm.bic(X), gm.aic(X), sep="\n")

In [None]:
# Code9.15
# Bayesian Gaussian mixture model

from sklearn.mixture import BayesianGaussianMixture

# Seeded so the fitted weights (and their order) are the same on every run.
bgm = BayesianGaussianMixture(n_components = 10, n_init = 10, random_state = 42)
bgm.fit(X)

# Component weights rounded to two decimals.
print(np.round(bgm.weights_, 2))