In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
class KMeans(object):
    """ Implement KMeans in tensorflow
        with Lloyd Algorithm
        1. init: randomly init K cluster centroids
        2. cluster assignment: X to its closest centroids
        3. move centroid step: compute means of each centroid and do new assignments
    """
    def __init__(self, num_clusters, max_steps=100, random_state=None):
        self.num_clusters = num_clusters
        self.max_steps = max_steps
        self.random_state = random_state
        
    def fit(self, X):
        """
        Args:
            X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster.
        """
        self._train(X)
    
    def get_cluster_centers(self):
        """ Get cluster centers, please call this after fit
        Returns:
            cluster_centers: an array of cluster centers
        """
        return self.cluster_centers_
    
    
    def predict(self, X):
        """ do predictions for samples
        
        Args:
            X: ndarray, samples object
        Returns:
            labels: array, Index of the cluster centroid each sample belongs to
        """
        graph = tf.Graph()
        m, n = X.shape
        with graph.as_default():
            x_tensor = tf.placeholder(tf.float32, shape=X.shape, name="X_test")
            centroids = tf.convert_to_tensor(self.cluster_centers_)
            predictions = self._find_closest_centroids(x_tensor, centroids)
            with tf.Session() as sess:
                predictions = sess.run(predictions, feed_dict={x_tensor: X})
        return predictions
    
    
    def _init_centroids(self, X, num_centroids):
        """init cluster centers randomly
        
        Args:
            X: Tensor object, input samples
            num_centroids: python integer, the number of centroids
        """
        with tf.name_scope("init_centroid"):
            k = num_centroids
            centroids = tf.random_shuffle(X, seed=self.random_state)[:k,]
            print("randomly init centroids number: {}".format(k))
        return centroids
    
    
    def _get_distance(self, X, vector):
        """ Calculate euclid distance between samples X and center vector
        
        Args:
            X: Tensor, shape is m * n
            vector: Tensor, shape is 1 * n
        
        Returns:
            a tensor object of distances, such as [[0.1], [0.2], [3.4]]
        """
        with tf.name_scope("get_distance"):
            m, n = X.shape
            # reduce_sum output rank is like [a, b, c], reshape it the get vector like [[a], [b], [c]]
            # reshape it for concat tensor convenience
            distance = tf.sqrt(tf.reduce_sum(tf.square(X - vector), axis=1)) # axis =1 means by column
            distance = tf.reshape(distance, [m, 1])
        return distance
    
    
    def _find_closest_centroids(self, X, centroids):
        """ assign each sample to the closest centroid
        Args:
            X: tensor object, the samples, shape is [m, n]
            centroids: tensor object, the centroids, shape is [k, n]
            
        Returns:
            idx: tensor object, Index of the cluster centroid of which each sample belongs to
                 return object is like [0, 1, 0, 1], shape is [1, m]
        """
        with tf.name_scope("find_closest_centroids"):
            m, n = X.shape
            k, n = centroids.shape
            distances = []
            for i in range(k):
                d = self._get_distance(X, centroids[i])
                distances.append(d)
            # assign centroids of each row    
            # concated by column, distances[:, i] is the distance between X and center i
            distances = tf.concat(distances, axis=1)
            # get the index of minumum distances for each sample
            idx = tf.cast(tf.argmin(distances, axis=1), tf.int32)
        return idx
    
    
    def _compute_new_centroids(self, X, idx, num_centroids):
        """ Compute new centroids by computing mean of each cluster group       
        Args:
            X: tensor object, the samples, shape is [m, n]
            idx: tensor object, the index of centroids for each sample, shape is [m]
            num_centroids: python integer, the number of centroids
        
        Returns:
            a tensor object of new centroids, format is like [[0.5, 0.5], [1, 2]]
        """
        with tf.name_scope("compute_new_centroids"):
            m, n = X.shape
            k = num_centroids
            centroids = []
            for i in range(k):
                centroid_no = tf.ones(m, dtype=tf.int32) * i
                mask = tf.equal(idx, centroid_no) # get bool mask for cluster i
                cluster_group = tf.boolean_mask(X, mask)
                centroid = tf.reduce_mean(cluster_group, axis=0) # shape is [n]
                centroid = tf.reshape(centroid, [1, n]) # reshape for concat convenience
                centroids.append(centroid)
            centroids = tf.concat(centroids, axis=0)
        return centroids
        
    def _train(self, X):
        """ train data
        
        Args:
            X: python ndarray
        """
        graph = tf.Graph()
        k = self.num_clusters
        max_steps = self.max_steps
        m, n = X.shape
        with graph.as_default():
            # Build graph
            x_tensor = tf.placeholder(tf.float32, shape=[m, n], name="X")
#             with tf.variable_scope("centroids", reuse=tf.AUTO_REUSE):
#                     centroids_var = tf.get_variable("centroids", shape=[k, n], initializer=tf.ones_initializer())
            centroids = self._init_centroids(x_tensor, k)
            for i in range(max_steps):
                with tf.name_scope("train_step_{}".format(i)):
                    idx = self._find_closest_centroids(x_tensor, centroids)
                    centroids = self._compute_new_centroids(x_tensor, idx, k)
            with tf.Session() as sess:
#                 sess.run(tf.global_variables_initializer())
                self.labels_, self.cluster_centers_ =  sess.run([idx, centroids], feed_dict={x_tensor: X})
                self.labels_tensor_ = tf.convert_to_tensor(self.labels_)
                self.cluster_centers_tensor_ = tf.convert_to_tensor(self.cluster_centers_)
#                 print(self.labels_)
#                 print(self.cluster_centers_)
                
    def _test(self, X):
        """ train data
        """
        graph = tf.Graph()
        X_data = np.array([[1., 0.], [0, 1], [2., 2.], [3., 3.]])
        [m, n] = X_data.shape
        k = 2
        with graph.as_default():
            x_tensor = tf.placeholder(tf.float32, shape=[m, n], name="X")
            centroids = tf.constant([[1., 1.], [2., 2.]])
            idx = tf.constant([0, 0, 1, 1])
            d = self._compute_new_centroids(x_tensor, idx, k)
        with tf.Session(graph=graph) as sess:
#             sess.run(tf.global_variables_initializer())
            res = sess.run(d, feed_dict={x_tensor: X_data})
            print(res)

In [4]:
kmeans = KMeans(num_clusters=2, max_steps=10, random_state=42)
X = np.array([
    [1, 2], [1, 4], [1, 0],
    [4, 2], [4, 4], [4, 0]
])
kmeans._train(X)
print kmeans.labels_
print kmeans.cluster_centers_

randomly init centroids number: 2
[0 0 0 1 1 1]
[[ 1.  2.]
 [ 4.  2.]]


In [5]:
X_test = np.array([[1, 1], [4, 3], [5, 6]])
kmeans.predict(X_test)

array([0, 1, 1], dtype=int32)

In [6]:
# test with iris data
iris = pd.read_csv("./datasets/iris/iris_training.csv")
iris.head(10)

Unnamed: 0,120,4,setosa,versicolor,virginica
0,6.4,2.8,5.6,2.2,2
1,5.0,2.3,3.3,1.0,1
2,4.9,2.5,4.5,1.7,2
3,4.9,3.1,1.5,0.1,0
4,5.7,3.8,1.7,0.3,0
5,4.4,3.2,1.3,0.2,0
6,5.4,3.4,1.5,0.4,0
7,6.9,3.1,5.1,2.3,2
8,6.7,3.1,4.4,1.4,1
9,5.1,3.7,1.5,0.4,0


In [7]:
iris_train = iris.as_matrix()[:, :4]
kmeans = KMeans(num_clusters=3, max_steps=1, random_state=42)
kmeans.fit(iris.as_matrix())
print kmeans.labels_
print kmeans.cluster_centers_

randomly init centroids number: 3
[1 0 0 2 2 2 2 1 0 2 0 1 2 2 1 0 1 1 1 2 1 1 2 1 1 2 0 1 0 1 0 2 0 1 1 1 1
 1 2 2 1 1 1 2 2 1 2 1 2 1 2 0 0 2 1 0 1 1 1 0 0 1 1 1 0 1 2 1 1 2 2 0 2 1
 1 2 0 0 0 1 2 0 0 0 1 2 1 0 1 2 1 0 2 2 1 2 2 1 0 2 2 0 2 0 2 2 2 2 0 2 1
 0 2 1 2 0 0 2 2 0]
[[ 5.80937433  2.73750019  4.22812462  1.32499993  1.0625    ]
 [ 6.67999935  3.00222206  5.5422225   1.99111164  1.88888884]
 [ 4.99767494  3.3744185   1.48837209  0.2697674   0.02325581]]


In [12]:
iris_test = iris.as_matrix()[:4, :]
kmeans.predict(iris_test)

array([1, 0, 0, 2], dtype=int32)