# Обучение без учителя
Мы не знаем "правильный ответ". Результат очень зависит от конкретного метода.

## Кластеризация
Поиск групп "похожих" объектов. Напоминает классификацию, но классы заранее не заданы.
![image.png](https://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_001.png)


### Искусственный пример - просто двухмерные пятна

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

# Three synthetic 2-D Gaussian blobs that share one covariance but have different means.
# Each blob gets its own seed: with a single shared seed all three samples are the
# same noise pattern merely shifted to another mean, i.e. they are not independent.
cov = [[1, 0], [0, 20]]
a = pd.DataFrame(multivariate_normal([16, 120], cov, seed=1).rvs(100), columns=['x1', 'x2'])
b = pd.DataFrame(multivariate_normal([10, 100], cov, seed=2).rvs(100), columns=['x1', 'x2'])
c = pd.DataFrame(multivariate_normal([1, 150], cov, seed=3).rvs(100), columns=['x1', 'x2'])
# Ground-truth label of each blob (kept only for plotting, never used for fitting).
a['class'] = 'a'
b['class'] = 'b'
c['class'] = 'c'
# ignore_index gives the 300 rows a unique 0..299 index instead of 0..99 repeated three times.
abc = pd.concat([a, b, c], ignore_index=True)

In [None]:
# Feature matrix for clustering: every column except the ground-truth 'class' label.
Xa = abc.drop(columns=['class'])
# Show five random rows as a sanity check.
abc.sample(5)

#### KMeans

<img src="https://ds055uzetaobb.cloudfront.net/brioche/uploads/y4KGN92h7r-screen-shot-2016-05-05-at-43007-pm.png?width=1200">

In [None]:
import seaborn as sns
# Ground truth: points coloured by the class they were generated from.
fg = sns.relplot(x='x1', y='x2', hue='class', data=abc)

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with 3 clusters; random_state is fixed so the run is repeatable.
km = KMeans(n_clusters=3, random_state=1)
km.fit(Xa)
km

In [None]:
# Cluster index assigned to every training row.
km.labels_

In [None]:
# Store the assignment next to the data so it can be used for colouring.
abc['cluster'] = km.labels_

In [None]:
import matplotlib.pyplot as plt
# Points coloured by assigned cluster, with the fitted centres overlaid in red.
plt.scatter(x='x1', y='x2', c='cluster', marker='.', data=abc)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], c='red', marker='o')

In [None]:
from sklearn.metrics import pairwise_distances_argmin, silhouette_score

# Visualise how k-means evolves: panel 0 shows the hand-picked starting centres,
# panel i shows the result when the algorithm is capped at max_iter=i.
n = 7
fig, plots = plt.subplots(1, n + 1, figsize=(4 * n, 4), sharex=True, sharey=True)

# Explicit starting centres. The string options 'random' / 'k-means++' are also
# valid for KMeans, but panel 0 below indexes `initial` as an array.
initial = np.array([[0, 90], [15, 90], [15, 160]])

# Panel 0: starting centres plus the assignment of each point to its nearest one.
start_panel, *step_panels = plots
start_panel.scatter(initial[:, 0], initial[:, 1], c='red', marker='o')
labels0 = pairwise_distances_argmin(abc[['x1', 'x2']].values, initial)
start_panel.scatter(x='x1', y='x2', c=labels0, marker='.', data=abc)

# Quality metrics collected for every iteration cap.
inertia, score = [], []
for i, panel in enumerate(step_panels, start=1):
  km = KMeans(n_clusters=3, n_init=1, init=initial, max_iter=i, random_state=5).fit(Xa)
  centres = km.cluster_centers_
  panel.scatter(x='x1', y='x2', c=km.labels_, marker='.', data=abc)
  panel.scatter(centres[:, 0], centres[:, 1], c='red', marker='o')
  # The title is the number of iterations actually run.
  panel.set_title(km.n_iter_)
  inertia.append(km.inertia_)
  score.append(silhouette_score(Xa, km.labels_))

In [None]:
# Inertia of the fitted model for each max_iter cap (index 0 corresponds to max_iter=1).
print(inertia)
plt.plot(inertia)

In [None]:
# Mean silhouette score for the same sequence of models.
print(score)
plt.plot(score)

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
# Per-point silhouette value for the labels currently stored in abc.cluster.
abc['score'] = silhouette_samples(Xa, abc.cluster)
# Cluster centres as extra rows tagged 'center' so they can be drawn in the same plot.
# NOTE(review): `km` here is whichever model was fitted last, while abc.cluster was
# written by an earlier cell - confirm both describe the same clustering.
centers = pd.DataFrame(km.cluster_centers_, columns=['x1', 'x2'])
centers['class'] = 'center'
centers['cluster'] = 'center'
centers  # not displayed: only the last expression of a cell is rendered
# Colour encodes the silhouette value on a fixed (-1, 1) scale; marker style encodes the cluster.
sns.relplot(x='x1', y='x2', hue='score', style='cluster', hue_norm=(-1, 1), palette="icefire", data=pd.concat([abc, centers]))

#### MeanShift

<img src="https://image.slidesharecdn.com/icdmtrackingandpursuit-140523110059-phpapp02/95/implementing-camshift-on-a-mobile-robot-for-person-tracking-and-pursuiticdm-9-638.jpg?cb=1400843848">

In [None]:
# Current state of the data frame before re-clustering.
abc.sample(5)

In [None]:
from sklearn.cluster import estimate_bandwidth
# Bandwidth estimated from the data; printed for reference only.
estimate_bandwidth(Xa)

In [None]:
from sklearn.cluster import MeanShift

# MeanShift with default parameters: the number of clusters is not specified.
ms = MeanShift()
# Overwrites the earlier 'cluster' column with the MeanShift assignment.
abc['cluster'] = ms.fit_predict(Xa)
abc['cluster'].values

In [None]:
# One row per cluster found.
ms.cluster_centers_

In [None]:
# Centres as extra rows tagged 'center' so they can be plotted together with the points.
centers = pd.DataFrame(ms.cluster_centers_, columns=['x1', 'x2'])
centers['class'] = 'center'
centers['cluster'] = 'center'
centers

In [None]:
import seaborn as sns
# Colour = assigned cluster, marker style = true class (or 'center').
sns.relplot(x='x1', y='x2', hue='cluster', style='class', data=pd.concat([abc, centers]))

In [None]:
# Visualise MeanShift under an increasing iteration cap.
n=7
fig, plots = plt.subplots(1, n+1, figsize=(4*n, 4), sharex=True, sharey=True)
# Every data point is used as a seed.
initial = abc[['x1', 'x2']].values

# The loop starts at 1, so plots[0] is never drawn on and stays empty.
for i in range(1, n+1):
  # Panel i uses max_iter=i-1, so the first drawn panel runs with max_iter=0.
  ms = MeanShift(max_iter=i-1, bandwidth=4, seeds=initial).fit(Xa)
  plots[i].scatter(x='x1', y='x2', c=ms.labels_, marker='.', data=abc)
  plots[i].scatter(ms.cluster_centers_[:, 0], ms.cluster_centers_[:, 1], c='red', marker='o')
  # Title: iterations performed and the shape of the centre array (centre count, 2).
  plots[i].set_title(f"{ms.n_iter_}, {ms.cluster_centers_.shape}")

**почему так?**

In [None]:
# Same plot with fixed limits: both axes now span 80 data units,
# so x1 and x2 are compared on the same numeric range.
sns.relplot(x='x1', y='x2', hue='cluster', style='class', data=pd.concat([abc, centers]))\
  .set(xlim=(-20, 60),ylim=(80, 160))


#### Препроцессинг, pipeline




In [None]:
# Rebuild the feature matrix: drop the label and the columns added by earlier cells.
Xa = abc.drop(columns=['class', 'cluster', 'score'])
abc.sample(5)

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

# Scaling and clustering chained into one estimator: the scaler's output feeds KMeans.
nkm = Pipeline([('prep', RobustScaler()), ('clusterer', KMeans(3))])
nkm.fit(Xa)

In [None]:
# Pipeline steps are addressable by name; the scaled data keeps the input shape.
nkm['prep'].transform(Xa).shape

In [None]:
# Transposing yields one row per feature, unpacked into two new columns.
abc['nx1'], abc['nx2'] = nkm['prep'].transform(Xa).transpose()

In [None]:
# Labels assigned by the KMeans step during fit.
abc['cluster']=nkm['clusterer'].labels_

In [None]:
abc.head()

In [None]:
# These centres live in the scaled space, hence the nx1/nx2 column names.
centers = pd.DataFrame(nkm['clusterer'].cluster_centers_, columns=['nx1', 'nx2'])
centers['class'] = 'center'
centers['cluster'] = 'center'
centers

In [None]:
# Clusters and centres drawn in the scaled coordinates.
sns.relplot(x='nx1', y='nx2', hue='cluster', style='class', data=pd.concat([abc, centers]))

In [None]:
# The pipeline applies the scaler before delegating to KMeans.predict.
nkm.predict(Xa)

In [None]:
# Delegates to the last step's transform after scaling
# (for KMeans, scikit-learn documents this as distances to the cluster centres).
nkm.transform(Xa)

### Нелинейный пример

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

# Two noisy concentric rings, generated in polar coordinates and then
# converted to Cartesian features x1, x2.
Na = 2000
Nb = 2000

# Class 'a': radius around 130; class 'b': radius around 40 (both with std 10).
# The angle is uniform on [0, 2*pi).
a = pd.DataFrame({
    'distance': 130 + 10 * np.random.randn(Na),
    'angle': np.random.rand(Na) * 2 * np.pi,
})
b = pd.DataFrame({
    'distance': 40 + 10 * np.random.randn(Nb),
    'angle': np.random.rand(Nb) * 2 * np.pi,
})
for frame, label in ((a, 'a'), (b, 'b')):
    frame['class'] = label

ring = pd.concat([a, b])
# Polar -> Cartesian.
ring['x1'] = np.cos(ring.angle) * ring.distance
ring['x2'] = np.sin(ring.angle) * ring.distance

# Only the Cartesian coordinates are given to the clustering models.
Xr = ring[['x1', 'x2']]
ring.sample(5)

In [None]:
# Ground truth: inner and outer ring coloured by class.
sns.relplot(x='x1', y='x2', hue='class', data=ring)

In [None]:
# Plain k-means with two clusters on the Cartesian coordinates.
model = KMeans(2)
model.fit(Xr)
model

In [None]:
# Colour = cluster found by k-means, marker size = true class.
ring['cluster']=model.labels_
sns.relplot(x='x1', y='x2', hue='cluster', size='class', data=ring)

#### Задание
1. Применить другую модель, подходящую для кластеризации нелинейного примера.

### MNIST
На какие кластеры разбивается множество рукописных цифр?

In [None]:
import pandas as pd
# The files have no header row. Later cells use column 0 as the digit label
# and the remaining columns as the pixels of a 28x28 image.
# NOTE(review): the /content/sample_data paths look Colab-specific - adjust when running elsewhere.
mnist_train = pd.read_csv('/content/sample_data/mnist_train_small.csv', header=None)
mnist_test = pd.read_csv('/content/sample_data/mnist_test.csv', header=None)

In [None]:
# Pixel columns only (.loc slicing by label: columns 1 through the last).
X_mnist = mnist_train.loc[:, 1:]

In [None]:
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
# Ten clusters, matching the number of digit classes.
mnist_model = MiniBatchKMeans(10)
mnist_model.fit(X_mnist)

In [None]:
# Pair every true label ('target') with the cluster the model assigned.
mnist_cluster = mnist_train[[0]].rename(columns={0: 'target'})
mnist_cluster['cluster'] = mnist_model.labels_
mnist_cluster

In [None]:
# Constant column of ones, so that summing per group counts rows.
mnist_cluster['number'] = 1
mnist_cluster.groupby(['target', 'cluster']).sum()

In [None]:
# Contingency table: rows are true digits, columns are cluster ids, cells are row counts
# (pairs that never occur are filled with 0).
# The aggregation is given by name: passing np.sum as a callable is deprecated in recent pandas.
cluster_counts = mnist_cluster.pivot_table('number', index='target', columns='cluster', aggfunc='sum', fill_value=0.)
cluster_counts

In [None]:
from matplotlib import pyplot as plt

# For every (target, cluster) pair with more than `min_count` members,
# draw one row of `n_` example digits from that pair.
n_ = 10
min_count = 300
for cluster in range(10):
  for target in range(10):
    count = cluster_counts.loc[target, cluster]
    if count <= min_count:
      continue
    # Rows are selected only for pairs that are actually drawn.
    sample = X_mnist[(mnist_cluster.target==target) & (mnist_cluster.cluster==cluster)].iloc[:n_]
    fig, axes = plt.subplots(1, n_, figsize=(16, 4))
    # suptitle labels the whole row; plt.title would only title the last subplot.
    fig.suptitle(f'{target} {cluster} {count}')
    # zip stops at the shorter sequence, so a short sample cannot raise IndexError.
    for ax, pixels in zip(axes, sample.values):
      ax.imshow(pixels.reshape((28, 28)), cmap='gray')


## Кластеризация на графах

### Графы с весом на рёбрах (расстоянием)

In [None]:
import networkx as nx
# Small weighted graph. nx.Graph is always undirected, so no flag is needed
# (a `directed=False` keyword would merely be stored as a graph attribute).
g = nx.Graph()
g.add_edge('A', 'B', weight=0)
g.add_edge('B', 'C', weight=1)
g.add_edge('C', 'D', weight=1)
g.add_edge('A', 'C', weight=3)
# Node positions are computed once and reused so nodes and edge labels line up.
pos = nx.spring_layout(g)
nx.draw_networkx(g, pos)
nx.draw_networkx_edge_labels(g, pos)
# Cheapest A -> D route by summed 'weight': A-B-C-D costs 2, A-C-D costs 4.
nx.shortest_path(g, 'A', 'D', weight='weight')

In [None]:
# The graph as a table of edges, one row per edge with its attributes.
nx.to_pandas_edgelist(g)

In [None]:
# Sparse adjacency matrix shown densely. networkx 3 removed to_scipy_sparse_matrix
# in favour of to_scipy_sparse_array; fall back to the old name on older releases.
to_sparse = getattr(nx, 'to_scipy_sparse_array', None) or nx.to_scipy_sparse_matrix
to_sparse(g).todense()

In [None]:
# The same adjacency information as a labelled DataFrame.
nx.to_pandas_adjacency(g)

### Кластеры на графе

In [None]:
# Two triangles (0-1-2 and A-B-C) whose edges carry d=1, joined by one edge with d=0.1.
g = nx.from_edgelist([(0, 1, {'d': 1}),
                      (0, 2, {'d': 1}),
                      (1, 2, {'d': 1}),
                      ('A', 'B', {'d': 1}),
                      ('A', 'C', {'d': 1}),
                      ('B', 'C', {'d': 1}),
                      (0, 'A', {'d': 0.1})])
pos=nx.spring_layout(g)
nx.draw_networkx_edge_labels(g, pos)
nx.draw_networkx(g, pos)

In [None]:
# Adjacency matrix using the 'd' attribute as the entry for each edge.
X = nx.to_pandas_adjacency(g, weight='d')
X

In [None]:
from sklearn.cluster import AffinityPropagation
# affinity='precomputed': the matrix passed to fit is used as-is instead of being
# computed from feature vectors.
# NOTE(review): the entries are therefore read as similarities (larger = closer),
# so d=0.1 is the weakest link - confirm that is the intended reading of 'd'.
model = AffinityPropagation(affinity='precomputed')
# Recomputed so the cell does not depend on X being unmodified.
X = nx.to_pandas_adjacency(g, weight='d')
model.fit(X)
# Cluster label appended as an extra column for display.
X['cluster'] = model.labels_
X

### Кластеризация областей изображения

In [None]:
# Pixels of training row 10 (the label column is skipped), reshaped to a 28x28 image.
img = mnist_train.iloc[10, 1:].values.reshape((28, 28))
plt.imshow(img, cmap='gray')

In [None]:
from sklearn.feature_extraction import img_to_graph, grid_to_graph
# Connectivity of a 3x4 pixel grid as a dense 12x12 matrix.
grid_to_graph(3, 4).toarray()

In [None]:
# Sparsity pattern of the same connectivity matrix.
plt.spy(grid_to_graph(3, 4))

In [None]:
# Tiny 3x2 example image turned into a pixel graph (6 pixels -> 6x6 matrix).
X = img_to_graph([[0, 0], [0, 0], [11, 12]])
pd.DataFrame(X.toarray())

In [None]:
X

In [None]:
# Rescale the stored values so that 12 maps to 0 and 0 maps to 1.
X.data = (12 - X.data)/12
pd.DataFrame(X.toarray())

In [None]:
# Pixel graph of the real digit image.
img_to_graph(img)

In [None]:
# Spread of the stored graph values, for reference when choosing a scale.
np.std(img_to_graph(img).data)

In [None]:
X = img_to_graph(img)
# Exponential decay turns the stored values into a similarity in (0, 1]:
# a value of 0 maps to 1, larger values map closer to 0.
X.data = np.exp(- X.data/50)
X.toarray().shape

In [None]:
X.toarray()[:5, :5]

In [None]:
# A 50x50 window of the similarity matrix.
plt.imshow(X.toarray()[300:350, 300:350])

In [None]:
from sklearn.cluster import SpectralClustering
# X is passed as a ready-made affinity matrix; 4 clusters are requested.
model = SpectralClustering(4, affinity='precomputed')
model.fit(X)
# One label per pixel, reshaped back into the image layout.
plt.imshow(model.labels_.reshape((28, 28)))

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Raw pixel graph, used here as a precomputed pairwise matrix.
X = img_to_graph(img)
# `connectivity` restricts merges to neighbouring pixels of the 28x28 grid.
agglo_kwargs = dict(linkage='average', connectivity=grid_to_graph(28, 28))
# Newer scikit-learn renamed `affinity` to `metric` and later removed the old name;
# older releases only know `affinity`. Try the new name first and fall back.
try:
  model = AgglomerativeClustering(4, metric='precomputed', **agglo_kwargs)
except TypeError:
  model = AgglomerativeClustering(4, affinity='precomputed', **agglo_kwargs)
model.fit(X.toarray())
# One label per pixel, reshaped back into the image layout.
plt.imshow(model.labels_.reshape((28, 28)))

## Уменьшение размерности

### Простой пример - кошки

In [None]:
import numpy as np
# Synthetic cats: three observed features driven by two hidden factors
# (genes in [0, 1) and age in [0, 12)) plus small Gaussian noise.
N = 500
genes = np.random.rand(N)
age = 12 * np.random.rand(N)
# Third hidden factor, switched off here; use np.random.randn(N) to enable it.
luck = np.zeros(N)

cats = pd.DataFrame({
    'weight': 2 * genes + 2 * age / 12 + 0.5 * luck + 0.1 * np.random.randn(N),
    'lives': 9 + 2 * genes - 3 * age / 12 + luck + 0.1 * np.random.randn(N),
    'fur': 2 * genes + 0.5 * age / 12 + 0.2 * luck + 0.1 * np.random.randn(N),
})

In [None]:
cats

In [None]:
# Reference point and directions used to draw the hidden factors in feature space
# (order of components: weight, lives, fur).
mid = np.array([2, 8.5, 1.25])   # feature values at genes=0.5, age=6, luck=0, without noise
dg = np.array([2, 2, 2])         # coefficients of `genes` in the three features
da = np.array([2, -3, 0.5])      # coefficients of `age/12` in the three features

In [None]:
import seaborn as sns
# 2-D view: weight against lives, coloured by fur.
sns.relplot(x='weight', y='lives', hue='fur', data=cats)

In [None]:
import plotly.graph_objs as go
def _segment_from_mid(direction, colour):
  """Return a 3-D line trace from `mid` to `mid + direction` drawn in `colour`."""
  tip = mid + direction
  return go.Scatter3d(x=[mid[0], tip[0]],
                      y=[mid[1], tip[1]],
                      z=[mid[2], tip[2]],
                      marker=dict(size=1, color=colour),
                      line=dict(color=colour, width=6))


# Red: direction of the `dg` vector; green: direction of the `da` vector.
vg = _segment_from_mid(dg, "red")
va = _segment_from_mid(da, "green")


In [None]:
import plotly.express as px
# 3-D scatter of the cats (marker size = age, colour = genes) with the two factor directions overlaid.
px.scatter_3d(cats, x='weight', y='lives', z='fur', size=age, color=genes).add_trace(vg).add_trace(va)

In [None]:
# Pairwise correlations of the three features, as a heatmap and as a table.
sns.heatmap(cats.corr(), vmin=-1, vmax=1)
cats.corr()

In [None]:
from sklearn.decomposition import PCA
# Keep two principal components of the three features.
pca_cats = PCA(2)
pca_cats.fit(cats)

In [None]:
# One row per component, one column per original feature.
pca_cats.components_

In [None]:
# Share of the total variance captured by each kept component.
pca_cats.explained_variance_ratio_

In [None]:
# Draw the fitted principal directions on top of the data.
c = pca_cats.components_


def _component_trace(axis, colour):
  """Return a 3-D line trace from `mid` to `mid + axis` drawn in `colour`."""
  end = mid + axis
  return go.Scatter3d(x=[mid[0], end[0]],
                      y=[mid[1], end[1]],
                      z=[mid[2], end[2]],
                      marker=dict(size=1, color=colour),
                      line=dict(color=colour, width=6))


v0 = _component_trace(c[0], "red")
v1 = _component_trace(c[1], "green")
# A third trace (v2, "blue", from c[2]) can be added the same way
# once the PCA is fitted with three components.
px.scatter_3d(cats, x='weight', y='lives', z='fur', size=age, color=genes).add_trace(v0).add_trace(v1)

In [None]:
# Coordinates of every cat in the space of the two principal components.
space = pca_cats.transform(cats)
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.jointplot(x=space[:, 0], y=space[:, 1])

In [None]:
# Map the 2-D coordinates back to the three original features.
reconstructed = pd.DataFrame(pca_cats.inverse_transform(space), columns=['weight', 'lives', 'fur'])
reconstructed

In [None]:
# Reconstructed points together with the two principal directions.
px.scatter_3d(reconstructed, x='weight', y='lives', z='fur', size=age, color=genes).add_trace(v0).add_trace(v1)#.add_trace(v2)#.add_trace(v2)

### MNIST

In [None]:
# Column 0 holds the digit label.
Y_mnist = mnist_train.loc[:, 0]
print(X_mnist.shape, Y_mnist.shape)

In [None]:
from sklearn.decomposition import PCA
# Compress the pixel columns to 200 principal components.
model = PCA(200)
model.fit(X_mnist)

In [None]:
model.transform(X_mnist).shape

In [None]:
# Components as columns named '0'..'199', plus the digit label in 'y'.
pc = pd.DataFrame(model.transform(X_mnist), columns=[str(i) for i in range(200)])
pc['y']=Y_mnist
pc.head()

In [None]:
from matplotlib import pyplot as plt
# Show the first four principal components as 28x28 images, one figure each.
for i in range(4):
  # Open the figure before drawing: calling plt.figure() after imshow
  # leaves an extra empty figure behind the last image.
  plt.figure()
  plt.imshow(model.components_[i].reshape((28, 28)), cmap='gray')

In [None]:
# The last kept component (index 199), for comparison with the first ones.
plt.imshow(model.components_[199].reshape((28, 28)), cmap='gray')

In [None]:
# Variance share of each of the first 10 components.
plt.plot(model.explained_variance_ratio_[:10])

In [None]:
# Cumulative variance share over the first 30 components.
plt.plot(model.explained_variance_ratio_.cumsum()[:30])

In [None]:
# 2000 random digits in the plane of the first two components, coloured by label.
sns.relplot(data=pc.sample(2000), x='0', y='1', hue='y')

In [None]:
from sklearn.manifold import TSNE
# t-SNE is run on a random subset of 5000 rows; column 0 (the label) is excluded from the features.
sample = mnist_train.sample(5000)
embedding = TSNE().fit_transform(sample.loc[:, 1:])
embedding.shape

In [None]:
# Embedded points coloured by the true digit of the sampled rows.
sns.relplot(x=embedding[:, 0], y=embedding[:, 1], palette='deep', hue=sample.loc[:, 0])

### Autoencoder
<img src="https://d1m75rqqgidzqn.cloudfront.net/wp-data/2020/04/29201743/Blog_info_29-04-2020-R-01-1024x438.png" width=750px/>

In [None]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, concatenate

def build_autoencoder():
  """Build a dense autoencoder for flattened 28x28 images and its encoder part.

  Layer sizes are 784 -> 128 -> 10 -> 128 -> 784. Every Dense layer uses ReLU
  activation and an L1 kernel regularizer.

  Returns:
    A tuple (model, encoder). `model` maps the input to the 784-unit output
    layer. `encoder` takes the same input and stops at the 10-unit layer named
    'classes'; both models are built from the same layer objects.
  """
  # The shape is passed as a tuple: a bare int is not accepted by every Keras version.
  inp = Input(shape=(28*28,), name='inp')
  enc = Dense(128, activation='relu', kernel_regularizer='l1', name='enc')(inp)
  # 10-unit bottleneck.
  classes = Dense(10, activation='relu', kernel_regularizer='l1', name='classes')(enc)
  dec = Dense(128, activation='relu', kernel_regularizer='l1', name='dec')(classes)
  out = Dense(28*28, activation='relu', kernel_regularizer='l1', name='out')(dec)
  model = tf.keras.Model(inputs=inp, outputs=out)
  encoder = tf.keras.Model(inputs=inp, outputs=classes)
  return model, encoder


In [None]:
from tensorflow.keras.utils import plot_model
# Full autoencoder and its encoder part; the diagram includes tensor shapes.
model, encoder = build_autoencoder()
plot_model(model, show_shapes=True)

In [None]:
plot_model(encoder, show_shapes=True)

In [None]:
# Pixel columns only (column 0 is excluded).
X = mnist_train.loc[:, 1:]

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
callbacks = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, restore_best_weights=True)]
# No optimizer is given, so the Keras default is used.
model.compile(loss='mean_squared_error')
# Input and target are both X (reconstruction task); 20% of the rows are held out for validation.
# %time is an IPython magic that reports how long the statement took.
%time history = model.fit(X, X, verbose=0, batch_size=1024, epochs=512, validation_split=0.2, callbacks=callbacks)

In [None]:
# Metrics recorded per epoch during training.
pd.DataFrame(history.history).plot()

In [None]:
# Original image of training row i.
i = 4
img = mnist_train.iloc[i, 1:].values.reshape((28, 28))
plt.imshow(img, cmap='gray')

In [None]:
# The double brackets keep the selection two-dimensional (a batch of one row).
result = model.predict(mnist_train.iloc[[i], 1:])
result.shape

In [None]:
# The autoencoder's reconstruction of the same row.
plt.imshow(result.reshape((28, 28)), cmap='gray')

In [None]:
# Output of the 10-unit 'classes' layer for every training row.
encoded = encoder.predict(X)
encoded.shape

In [None]:
# First two code dimensions, coloured by the true digit.
sns.relplot(x=encoded[:, 0], y=encoded[:, 1], palette='deep', hue=mnist_train.loc[:, 0])

In [None]:
# Reduce the codes to 4 principal components.
embedding = PCA(4).fit_transform(encoded)

In [None]:
# First two principal components of the codes, coloured by the true digit.
sns.relplot(x=embedding[:, 0], y=embedding[:, 1], palette='deep', hue=mnist_train.loc[:, 0])

## Задание
1. Кластеризовать свой датасет любой моделью
  - скорее всего понадобятся pipeline и scaler
2. Визуализировать результаты кластеризации:
  - спроецировать датасет на плоскость с помощью TSNE
  - раскрасить кластеры