In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
# Download MNIST (784 pixel columns per image) from OpenML as plain arrays.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Replace the stored targets with an unsigned 8-bit integer copy.
mnist.target = mnist.target.astype(np.uint8)
# X: feature matrix, y: the converted targets.
X, y = mnist["data"], mnist["target"]

In [2]:
# Two independent, initially empty accumulators:
# fitted models and their silhouette scores.
kmeans, kmeans_sil = [], []

In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit one K-Means model for every k in 8..12 and score each clustering with
# the silhouette coefficient computed on the full data set.
#
# Start from empty lists so that re-running this cell cannot append a second
# batch of models and shift the positions that later cells index into.
kmeans = []
kmeans_sil = []
for i in range(8, 13):
    # A fixed seed makes the centroid initialisation, and therefore the labels
    # and the scores below, repeatable across kernel restarts.
    kmeans_i = KMeans(n_clusters=i, random_state=42)
    kmeans_i.fit(X)
    sil = silhouette_score(X, kmeans_i.labels_)
    kmeans.append(kmeans_i)
    kmeans_sil.append(sil)

In [4]:
# Show the silhouette score recorded for each fitted model, in fitting order.
print(kmeans_sil)

[0.07338485845283446, 0.05681310734673186, 0.0587299713614337, 0.05835714796282286, 0.05816082126680993]


In [5]:
import pickle
# Persist the list of silhouette scores as a binary pickle.
with open('kmeans_sil.pkl', 'wb') as sil_file:
    pickle.dump(kmeans_sil, file=sil_file)

In [6]:
# Pick the 10-cluster model by its n_clusters setting rather than by list
# position, so the choice stays correct if the range of k values changes or
# the list of fitted models grows. Raises StopIteration if no such model
# has been fitted.
kmeans_10 = next(model for model in kmeans if model.n_clusters == 10)
# Assign every sample to its nearest centroid of that model.
y_pred = kmeans_10.predict(X)

In [7]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true digit labels against the predicted cluster ids.
conf = confusion_matrix(y_true=y, y_pred=y_pred)

In [8]:
# Print the confusion matrix built from y (true labels) and y_pred (cluster ids).
print(conf)

[[  74  173    7 1260 5041   43    2  290    4    9]
 [   9    7   10    7    0    6 4293    8 3527   10]
 [ 205  151   78  239   56  215  427  320  435 4864]
 [1034   33   46  506   22  186  458 4597   53  206]
 [  19  170 2192  256    9 3735  182    0  231   30]
 [1133   70  214 1847   59  426  166 2125  266    7]
 [  17 4441    4 1926   76   68  205   39   44   56]
 [  20    4 4405   12   21 2086  377    5  310   53]
 [4120   54  186  340   37  208  336 1177  315   52]
 [  89   16 2852   30   50 3456  267   86   92   20]]


In [9]:
import numpy as np
# For every row of the confusion matrix take the column holding the largest
# count, drop repeated column indices, and keep the survivors in ascending order.
arg_maxes = list(set(np.argmax(conf, axis=1)))
kmeans_argmax = sorted(arg_maxes)
kmeans_argmax

[0, 1, 2, 4, 5, 6, 7, 9]

In [10]:
# Persist the sorted list of distinct row-wise argmax columns as a binary pickle.
with open('kmeans_argmax.pkl', 'wb') as argmax_file:
    pickle.dump(kmeans_argmax, file=argmax_file)

In [11]:
# Euclidean distances from each of the first 300 samples to every sample in X.
#
# Each pass computes one sample-vs-whole-matrix norm in a single vectorised
# call, instead of one Python-level norm call (plus an element-wise equality
# test) for every individual pair.
#
# Zero distances are dropped: a zero norm means the two vectors are identical,
# which is exactly what the previous `(X[i] == x2).all()` filter excluded
# (the sample itself and any exact duplicates of it).
#
# NOTE(review): a pair whose two members are both among the first 300 rows is
# still measured twice, once from each side.
# NOTE(review): `distances` is now a 1-D ndarray rather than a Python list.
distances = []
for i in range(300):
    row_distances = np.linalg.norm(X - X[i], axis=1)
    distances.append(row_distances[row_distances != 0])
distances = np.concatenate(distances)
# Ten smallest distances, kept as a list of NumPy floats.
min_distances = list(np.sort(distances)[:10])

In [12]:
# Persist the ten smallest distances as a binary pickle.
with open('dist.pkl', 'wb') as dist_file:
    pickle.dump(min_distances, file=dist_file)

In [13]:
# Average of the three smallest distances.
mean = np.asarray(min_distances[:3]).mean()
# Display the full list of smallest distances.
min_distances

[279.26152617215286,
 304.37641170103836,
 317.5893575043093,
 328.7658741414626,
 333.4546445920344,
 352.89800226127664,
 355.1774204534967,
 358.07401469528617,
 359.64287842247063,
 360.42474942767177]

In [14]:
from sklearn.cluster import DBSCAN
# Run DBSCAN for a few neighbourhood radii: start at `mean`, grow in steps of
# 4% of `mean`, and stop before 1.1 * mean. For each radius record how many
# distinct label values the fitted model assigned.
dbscan_len = []
for eps in np.arange(mean, 1.1 * mean, 0.04 * mean):
    dbscan = DBSCAN(eps=eps).fit(X)
    n_distinct_labels = len(set(dbscan.labels_))
    dbscan_len.append(n_distinct_labels)

In [None]:
# Persist the per-radius label counts as a binary pickle.
with open('dbscan_len.pkl', 'wb') as dbscan_file:
    pickle.dump(dbscan_len, file=dbscan_file)

In [17]:
# Display the number of distinct DBSCAN labels found for each eps value tried.
dbscan_len

[4, 7, 22]