#### Reading data 
This cell reads the data from `ClusterSet1.txt`, using whitespace as the delimiter.

In [13]:
# allow inline plots
%matplotlib notebook

# importing necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans as kmeans
# Matplotlib marker codes, indexed by cluster label when plotting.
# Every entry is distinct so that no two clusters end up sharing a marker
# (the second 'o' that used to sit at index 20 is now 'P').
markers_list = ['o', 's', 'd', '^', 'x', '+', '*', 'h', 'v', '<', '>'
                , '1', '2', '3', '4', '8', 'H', 'D', 'p', '|', 'P', '_', ',']

# Read the whitespace-delimited data file (no header row).
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` flag.
data = pd.read_csv('ClusterSet1.txt', sep=r'\s+', header=None)

# Plain NumPy array of the samples, one row per data point.
X = data.to_numpy()


#### K-means algorithm
This function takes the number of clusters to create and the multi-dimensional data points, and returns a fitted k-means object whose labels assign each point to a cluster.

In [46]:

def create_kmeans(clusters, X, random_state=None):
    """Fit a k-means model on ``X`` and return the fitted estimator.

    Parameters
    ----------
    clusters : int
        Number of clusters to form.
    X : array-like
        Data points to cluster, one sample per row.
    random_state : int or None, optional
        Seed forwarded to the estimator so a run can be reproduced.
        ``None`` (the default) keeps the estimator's own default seeding.

    Returns
    -------
    The fitted k-means estimator; per-sample cluster labels are available
    on its ``labels_`` attribute.
    """
    # Use the `clusters` argument: the previous body referenced an undefined
    # global `CLUSTERS`, so the parameter was silently ignored.
    k = kmeans(n_clusters=clusters, random_state=random_state)
    k.fit(X)
    return k

In [47]:

def plot_graph(kmeans_cluster, X) :
    """Draw a 3-D scatter plot of ``X`` with one marker style per cluster.

    Parameters
    ----------
    kmeans_cluster : fitted clustering estimator
        Object exposing ``labels_``, the cluster label of every row in ``X``.
    X : numpy.ndarray
        Data points, one per row. Columns 0, 1 and 2 are plotted as
        X1, X2 and X3, so at least three columns are required.

    Returns
    -------
    None
        The plot is drawn on a newly created figure.
    """
    cluster_labels = kmeans_cluster.labels_
    n_clusters = max(cluster_labels) + 1

    # Create the 3-D axes through the figure: constructing `Axes3D(fig)`
    # directly no longer attaches the axes to the figure in current
    # matplotlib releases.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Hide tick labels on all three axes (the old `w_xaxis`-style aliases
    # have been removed from matplotlib; `xaxis`/`yaxis`/`zaxis` replace them).
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_ticklabels([])
    ax.set_xlabel('X1'); ax.set_ylabel('X2'); ax.set_zlabel('X3')

    for label in range(n_clusters):
        X_label = X[cluster_labels == label]
        # Wrap around the marker list so that having more clusters than
        # markers reuses markers instead of raising IndexError.
        ax.scatter(X_label[:,0], X_label[:,1], X_label[:, 2], label = str(label)
                   , marker = markers_list[label % len(markers_list)])
    ax.legend()

#### Choosing K
To choose K, we vary K from 1 to 12 and plot the distortion (the within-cluster sum of squared distances) of each k-means fit. The distortion stops decreasing noticeably beyond 9 clusters, so K = 9 is chosen as the best value for this dataset.

In [44]:
# Largest number of clusters tried when looking for the elbow
total_cluster_limit = 12

# Distortion (the estimator's `inertia_`) of one fit for each
# K = 1 .. total_cluster_limit. The loop previously hard-coded range(1, 15)
# and ignored the limit defined above.
losses = []
for n_clusters in range(1, total_cluster_limit + 1):
    losses.append(kmeans(n_clusters = n_clusters).fit(X).inertia_)

# Plot distortion against K; losses[i] belongs to the fit with K = i + 1
ax = plt.subplots()[1]
ax.plot(range(1, len(losses) + 1), losses)
ax.set_xlabel('Number of clusters (K)')
ax.set_ylabel('Distortion (inertia)');

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x23b8529e438>]

#### Visualizing the clusters
We fit k-means with K = 9. Because the data is 3-dimensional, we draw a 3-D scatter plot and give each cluster its own marker type so the clusters can be told apart.

In [45]:
# Number of clusters selected from the distortion curve
clusters = 9

# Fit the model for that K ...
k = create_kmeans(clusters, X)

# ... and show the resulting clusters in a 3-D scatter plot
plot_graph(k, X)

<IPython.core.display.Javascript object>