In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

# np.random.seed(2)

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


# Global seaborn look for every plot below: white background,
# notebook-sized elements and the "deep" colour palette.
sns.set(context='notebook', style='white', palette='deep')

In [None]:
# Report the pandas release this notebook is running against.
print(' '.join(['pandas', pd.__version__]))

In [None]:
# Load the training set from Google Drive (this cell only runs on Colab).
from google.colab import drive
# The original line ended in a stray `\` line continuation; it only parsed
# because a blank line followed it. Removed so the statement stands alone.
drive.mount('/content/drive')

# The relative path is resolved against the current working directory, so it
# reaches the mount point above only when the cwd is /content.
train = pd.read_csv('./drive/My Drive/DACON/data/train.csv')
# Work on a copy so the frame read from disk is left untouched.
train_copy = train.copy()

In [None]:
# Split the table into inputs and target.
# The 'digit' column is the label; columns from position 3 onward are the
# features, each divided by 255.
y_train = train_copy['digit']
labels = y_train
x_train = train_copy.iloc[:, 3:] / 255.

# Number of distinct labels (the original note says this is 10).
n_digits = np.unique(y_train).size
n_samples, n_features = x_train.shape


In [None]:
# For each digit 0-9, gather the 'id' values of the rows labelled with it.
# x_digit_id[d] is a NumPy array of ids for digit d.
x_digit_id = [
    train_copy.loc[train_copy['digit'] == digit, 'id'].values
    for digit in range(10)
]


In [None]:
# Fixed seed so the projection and the clustering are reproducible from run
# to run (same value as the commented-out np.random.seed(2) in the import cell).
RANDOM_STATE = 2

# Project the features onto two principal components so they can be drawn in 2-D.
reduced_data = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(x_train)

# Cluster the 2-D projection into one cluster per distinct digit label.
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10,
                random_state=RANDOM_STATE)
kmeans.fit(reduced_data)

reduced_data

In [None]:
# Split the 2-D projection by true digit label: digit_data[d] holds the
# projected points of every sample labelled d.
#
# Rows are picked with a boolean mask on the 'digit' column rather than by
# using `id - 1` as a row position, so the result does not depend on ids being
# 1-based, contiguous and in row order. reduced_data was computed from
# train_copy's rows in their original order, so the mask lines up row for row.
digit_data = [
    reduced_data[(train_copy['digit'] == digit).to_numpy()]
    for digit in range(10)
]

In [None]:
# Build a dense grid over the PCA plane and ask the fitted k-means model which
# cluster each grid point belongs to; the result is used to colour the plane.
h = .02  # grid spacing; a smaller value gives a finer grid

# Bounding box of the projected data, padded by 1 on every side.
x_min, y_min = reduced_data.min(axis=0) - 1
x_max, y_max = reduced_data.max(axis=0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Cluster index for every grid point, as one flat array (reshaped before plotting).
Z = kmeans.predict(np.column_stack((xx.ravel(), yy.ravel())))

In [None]:
# Draw one figure per digit: the k-means cluster regions as a coloured
# background, the samples of that digit as black dots, and the cluster
# centroids as white crosses.
#
# Changes from the earlier version of this cell:
#   * no leading plt.clf(): with no figure open it creates an empty figure
#     that then shows up as a blank output;
#   * the centroids are fetched once instead of on every iteration;
#   * each figure is shown and closed before the next one is built, so ten
#     15x15 figures are not held open at the same time.
Z = Z.reshape(xx.shape)  # flat predictions -> grid shape expected by imshow
centroids = kmeans.cluster_centers_

for i in range(10):
    fig, ax = plt.subplots(figsize=(15, 15))

    # Cluster regions as the background image.
    ax.imshow(Z,
              interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=plt.cm.Paired,
              aspect='auto', origin='lower')

    # Samples whose true label is digit i.
    ax.plot(digit_data[i][:, 0], digit_data[i][:, 1], 'k.', markersize=10)

    # Plot the centroids as a white X
    ax.scatter(centroids[:, 0], centroids[:, 1],
               marker='x', s=169, linewidths=3,
               color='w', zorder=10)

    ax.set_title('K-means clustering (PCA-reduced data) : {}'.format(i), fontsize=20)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

    plt.show()
    plt.close(fig)  # release the figure before building the next one