In [1]:
import keras
from keras.datasets import cifar10 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd

In [2]:
# Load the CIFAR-10 train/test splits through the Keras dataset helper.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [3]:
# Report the array shape of each split.
for template, split in (("Training: {}", x_train), ("Test: {}", x_test)):
    print(template.format(split.shape))

Training: (50000, 32, 32, 3)
Test: (10000, 32, 32, 3)


In [4]:
# Distinct label values present in the training targets, and how many there are.
classes = np.unique(y_train)
nClasses = classes.size
print('Number of Outputs: ', nClasses)
print('Number of Output Classes: ', classes)

Number of Outputs:  10
Number of Output Classes:  [0 1 2 3 4 5 6 7 8 9]


In [5]:
# Flatten every image into one row: (num_samples, num_features), then apply
# the normalisation step (division by 255.0).
x = x_train  # alias taken before x_train is rebound below
n_features = 32 * 32 * 3
x_train = x_train.reshape((x_train.shape[0], n_features)) / 255.0
x_test = x_test.reshape((x_test.shape[0], n_features)) / 255.0

In [6]:
# One column per flattened pixel value, with the class label appended as the
# final 'Label' column.
n_pixels = x_train.shape[1]
feat_cols = ['pixel' + str(idx) for idx in range(n_pixels)]
df_cifar = pd.DataFrame(data=x_train, columns=feat_cols)
df_cifar['Label'] = y_train
frame_shape = df_cifar.shape
print('Size of Data Frame: {}'.format(frame_shape))

Size of Data Frame: (50000, 3073)


In [7]:
# Preview the first five rows of the pixel/label frame.
df_cifar.head(n=5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel3063,pixel3064,pixel3065,pixel3066,pixel3067,pixel3068,pixel3069,pixel3070,pixel3071,Label
0,0.231373,0.243137,0.247059,0.168627,0.180392,0.176471,0.196078,0.188235,0.168627,0.266667,...,0.847059,0.721569,0.54902,0.592157,0.462745,0.329412,0.482353,0.360784,0.282353,6
1,0.603922,0.694118,0.733333,0.494118,0.537255,0.533333,0.411765,0.407843,0.372549,0.4,...,0.560784,0.521569,0.545098,0.560784,0.52549,0.556863,0.560784,0.521569,0.564706,9
2,1.0,1.0,1.0,0.992157,0.992157,0.992157,0.992157,0.992157,0.992157,0.992157,...,0.305882,0.333333,0.32549,0.309804,0.333333,0.32549,0.313725,0.337255,0.329412,9
3,0.109804,0.098039,0.039216,0.145098,0.133333,0.07451,0.14902,0.137255,0.078431,0.164706,...,0.211765,0.184314,0.109804,0.247059,0.219608,0.145098,0.282353,0.254902,0.180392,4
4,0.666667,0.705882,0.776471,0.658824,0.698039,0.768627,0.694118,0.72549,0.796078,0.717647,...,0.294118,0.309804,0.321569,0.278431,0.294118,0.305882,0.286275,0.301961,0.313725,1


In [8]:
from sklearn.manifold import TSNE

In [None]:
# Embed the flattened images into 3 dimensions with t-SNE.
# Only the pixel columns are embedded: the 'Label' column holds the
# ground-truth class and would leak the answer into the embedding (and into
# the clustering built on top of it) if it were passed in as a feature.
# random_state pins the otherwise stochastic embedding so reruns are comparable.
tsne = TSNE(n_components=3, random_state=42)
tsn_cifar = tsne.fit_transform(df_cifar.drop(columns='Label'))

In [None]:
# K-Means with 10 clusters and k-means++ seeding. random_state fixes the
# seeding so that cluster ids stay the same across reruns; later cells pick a
# cluster by its id (e.g. cluster 4), which is only meaningful if ids are stable.
k_means = KMeans(init="k-means++", n_clusters=10, max_iter=200, random_state=42)

In [None]:
# Fit K-Means on the 3-component t-SNE embedding; the per-sample cluster
# assignments are read from k_means.labels_ in the following cells.
k_means.fit(tsn_cifar)

In [None]:
# Shape of the embedded data that was clustered.
np.shape(tsn_cifar)

In [None]:
# Cluster id assigned to each fitted sample, in sample order.
k_means_labels = k_means.labels_
distinct_labels = np.unique(k_means_labels)
print("The list of labels of the clusters are " + str(distinct_labels))

In [None]:
# Group sample indices by cluster: cluster_index[n] lists the positions in
# k_means_labels whose label equals n.
G = len(np.unique(k_means_labels))  # Number of labels
# One vectorised comparison per cluster replaces the nested Python loops;
# .tolist() keeps each entry a plain list of Python ints.
cluster_index = [np.flatnonzero(k_means_labels == n).tolist() for n in range(G)]

In [None]:
# Show a 10x10 grid of images drawn from one cluster.
plt.figure(figsize=(15, 15))
clust = 4    # enter label number to visualise
num = 100    # num of data to visualize from the cluster (grid holds at most 100)
first = 500  # position inside the cluster of the first image shown
# Slicing shows exactly `num` images (positions first .. first + num - 1) and
# simply yields fewer, instead of raising IndexError, when the cluster has
# fewer than first + num members.
members = cluster_index[clust][first:first + num]
for position, sample_idx in enumerate(members, start=1):
    plt.subplot(10, 10, position)  # (Number of rows, Number of columns, 1-based slot)
    # No cmap is passed: imshow ignores the colormap for RGB image data.
    plt.imshow(x_train[sample_idx].reshape(32, 32, 3))

plt.show()

In [None]:
# Y_clust[n] holds the y_train entries ("correct" categories) of the samples
# whose indices are listed in cluster_index[n].
Y_clust = [y_train[cluster_index[n]] for n in range(G)]
# dimension confirmation: one label entry per clustered sample
assert all(len(Y_clust[n]) == len(cluster_index[n]) for n in range(G))

In [None]:
#tallies how often each category occurs within one cluster
def counter(cluster):
    """Return a dict mapping each distinct value in `cluster` to its count.

    `cluster` is anything np.unique accepts; keys appear in sorted order
    because that is the order np.unique returns the distinct values in.
    """
    values, freqs = np.unique(cluster, return_counts=True)
    return {value: freq for value, freq in zip(values, freqs)}

In [None]:
# label_count[n] maps category -> number of samples of that category in cluster n.
label_count = [counter(Y_clust[n]) for n in range(G)]

label_count[1]  # Number of items of a certain category in cluster 1

In [None]:
# Category names listed in integer-label order. Note this rebinds `classes`.
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
# Dictionary of class names: integer label -> name (0 -> 'airplane', ..., 9 -> 'truck')
class_names = dict(enumerate(classes))

#Draws a bar chart of how many items of each category one cluster contains
def plotter(label_dict):
    """Plot one bar per entry of `label_dict` with the pyplot interface.

    label_dict maps a category label (a key of `class_names`) to a count.
    Bars follow the dict's iteration order, and each tick is labelled with
    the matching name from `class_names`.
    """
    positions = range(len(label_dict))
    plt.bar(positions, list(label_dict.values()), align='center')
    tick_names = [class_names[label] for label in label_dict]
    plt.xticks(positions, tick_names, rotation=45, rotation_mode='anchor')

In [None]:
#One bar graph per cluster showing how many items of each category it holds
plt.figure(figsize=(20, 20))
for cluster_id in range(10):
    plt.subplot(5, 2, cluster_id + 1)  # 5x2 grid; subplot slots are 1-based
    plotter(label_count[cluster_id])
    plt.title("Cluster" + str(cluster_id))

In [None]:
#cluster visualisation: 3D plot of the embedded points belonging to one cluster
my_members = (k_means_labels == 4) #Enter different Cluster number to view its 3D plot
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# Pull the three embedding coordinates of the selected rows.
xs, ys, zs = (tsn_cifar[my_members, axis] for axis in range(3))
ax.plot(xs, ys, zs, 'w', markerfacecolor="blue", marker='.', markersize=10)

In [None]:
import plotly as py
import plotly.graph_objs as go
import plotly.express as px

In [None]:
#3D Plotly Visualisation of Clusters using go

# Scatter3d traces are drawn inside a 3D scene, so their axis titles have to
# be set under `scene`; top-level xaxis/yaxis settings do not apply to them.
layout = go.Layout(
    title='<b>Cluster Visualisation</b>',
    scene=dict(
        xaxis=dict(
            title='<i>X</i>'
        ),
        yaxis=dict(
            title='<i>Y</i>'
        )
    )
)

colors = ['red','green' ,'blue','purple','magenta','yellow','cyan','maroon','teal','black']
# One trace per cluster, coloured by cluster id.
trace = []
for i in range(0,10):
    my_members = (k_means_labels == i)
    # Row positions of this cluster's samples; shown on hover so a point can
    # be looked up again by its index.
    index = np.flatnonzero(my_members)
    trace.append(
        go.Scatter3d(
            x=tsn_cifar[my_members, 0],
            y=tsn_cifar[my_members, 1],
            z=tsn_cifar[my_members, 2],
            mode='markers',
            marker = dict(size = 2,color = colors[i]),
            hovertext=index,
            name='Cluster'+str(i),
        )
    )

fig = go.Figure(data=trace, layout=layout)

py.offline.iplot(fig)

In [None]:
#Hovering over a point in the plot above shows an index value
plt.figure(figsize=(2, 2))
n = 34658 #Use that value here to visualise the selected data
# Restore the flattened row to a 32x32 RGB image before drawing it.
picked_image = x_train[n].reshape(32, 32, 3)
plt.imshow(picked_image, cmap=plt.cm.binary)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Score the clustering in the space it was fitted in: k_means_labels come from
# K-Means run on the t-SNE embedding tsn_cifar.
score = silhouette_score(tsn_cifar, k_means_labels)
print('Silhouetter Score: %.3f' % score)

In [None]:
# Davies-Bouldin index of the K-Means clustering, computed on the same
# t-SNE embedding (tsn_cifar) that the cluster labels were fitted on.
davies_score = davies_bouldin_score(tsn_cifar, k_means_labels)
print('davies Score: %.3f' % davies_score)