In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
def create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed):
    np.random.seed(seed)
    slices = []
    centroids = []
    # Create samples for each cluster
    for i in range(n_clusters):
        samples = tf.random_normal((n_samples_per_cluster, n_features),
                                  mean=0.0, stddev=5.0, dtype=tf.float32, seed=seed, name="cluster_{}".format(i))
        current_centroid = (np.random.random((1, n_features)) * embiggen_factor) - (embiggen_factor/2)
        centroids.append(current_centroid)
        samples += current_centroid
        slices.append(samples)
    # Create a big "samples" dataset
    samples = tf.concat(slices, 0, name='samples') # concatenates the n_clusters samples into one list across the 0 axis
    centroids = tf.concat(centroids, 0, name='centroids') # similarily for the centroids
    return centroids, samples
    
def choose_random_centroids(samples, n_clusters):
    # Step 0: Initialisation: Select `n_clusters` number of random points
    n_samples = tf.shape(samples)[0]
    random_indices = tf.random_shuffle(tf.range(0, n_samples))
    begin = [0,]
    size = [n_clusters,]
    size[0] = n_clusters
    centroid_indices = tf.slice(random_indices, begin, size)
    initial_centroids = tf.gather(samples, centroid_indices)
    return initial_centroids    

def assign_to_nearest(samples, centroids):
    expanded_vectors = tf.expand_dims(samples, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    distances = tf.reduce_sum( tf.square(
               tf.subtract(expanded_vectors, expanded_centroids)), 2)
    mins = tf.argmin(distances, 0)
    nearest_indices = mins
    return nearest_indices

def update_centroids(samples, nearest_indices, n_clusters):
    # Updates the centroid to be the mean of all samples associated with it.
    nearest_indices = tf.to_int32(nearest_indices)
    partitions = tf.dynamic_partition(samples, nearest_indices, n_clusters)
    new_centroids = tf.concat([tf.expand_dims(tf.reduce_mean(partition, 0), 0) for partition in partitions], 0)
    return new_centroids

In [10]:
n_clusters = 3
n_samples_per_cluster = 500
n_features = 2
embiggen_factor = 35
seed = 42
n_iterations = 10

ans, samples = create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed)

samples = tf.Variable(samples)
centroids = tf.Variable(choose_random_centroids(samples, n_clusters))

nearest_indices = assign_to_nearest(samples, centroids)
updated_centroids = update_centroids(samples, nearest_indices, n_clusters)
training_op = tf.assign(centroids, updated_centroids)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    print(ans.eval(), 'this is the actual answer')
    print(centroids.eval(), 'this is the initial random centroid')
    for i in range(n_iterations):
        sess.run(training_op)
    print(centroids.eval(), f'this is the final guess after {n_iterations} iterations')

# plot_clusters(samples, centroids, n_samples_per_cluster)

[[ -4.39109584  15.77500072]
 [  8.11978796   3.45304695]
 [-12.03934758 -12.04019179]] this is the actual answer
[[-9.302317  20.529745 ]
 [ 5.3026304  1.6563395]
 [ 7.0891495  3.8291836]] this is the initial random centroid
[[ -4.50219    15.896543 ]
 [-12.256792  -12.084414 ]
 [  8.084836    3.1792228]] this is the final guess after 10 iterations
