In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets.samples_generator import make_blobs
X, cluster_assignments = make_blobs(n_samples=200, centers=4, cluster_std=0.60, random_state=0)
print(type(X))
print(type(X[0]))
print(type(X[0, 1]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>




In [3]:
import kmeans as km

# Check if it works. 
cl = km.KMeans(k=3)
print(cl.n_clusters)

3


In [4]:
def get_labels_and_inertia(X, centers, distances): 
    n_samples = X.shape[0]
    n_clusters = centers.shape[0]
    labels = np.full(n_samples, -1, np.int32)
    inertia = 0.0

    # Calculate distance between each data point and each centroid. 
    for sample_idx in range(n_samples):
        min_dist = -1
        for center_idx in range(n_clusters):
            dist = 0.0
            
            # Get distance between the data point and the centroid. 
            # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
            dist += np.dot(X[sample_idx], centers[center_idx])
            dist *= -2
            dist += np.dot(X[sample_idx], X[sample_idx])
            dist += np.dot(centers[center_idx], centers[center_idx])

            # Get minimum distance. 
            if min_dist == -1 or dist < min_dist:
                min_dist = dist
                # Get the cluster assigned to this datapoint. 
                labels[sample_idx] = center_idx
                distances[sample_idx] = dist
                
        # Add to inertia. 
        inertia += min_dist

    return labels, inertia

In [5]:
k = 3
n_samples, n_features = X.shape

In [6]:
def init_centers(X): 
        shuffled_indices = np.random.permutation(len(X))
        center_indices = shuffled_indices[:k]
        centers = np.zeros(shape=(k, X.shape[1]), dtype=X.dtype)
        for i, idx in enumerate(center_indices): 
            print(i, idx)
            centers[i] = X[idx]
            
        return centers

In [7]:
from numpy.random import rand


# Get 2d array with random values. 
# centers = a = np.random.uniform(0, 1, (k, n_features))
centers = init_centers(X)
centers

0 83
1 181
2 11


array([[-0.14455399,  2.28187277],
       [ 2.03169783,  0.19680756],
       [ 2.43040639, -0.06357093]])

In [8]:
distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
print(X.dtype)
distances

float64


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
labels, inertia = get_labels_and_inertia(X, centers, distances)
print(type(labels))
print('labels:', labels)
print('inertia:', inertia)

<class 'numpy.ndarray'>
labels: [1 0 0 0 1 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0
 0 0 0 0 2 2 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 2 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 1 0 2 0 1
 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 2 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1]
inertia: 2115.683203086229


In [10]:
distances[:10]

array([ 2.28108523,  0.95595575,  4.67762807, 30.44146857,  0.22119079,
       10.83976029,  2.94365009,  1.04054021, 10.98180326,  6.83531017])

In [11]:
def move_to_mean(X, labels): 
    cluster_to_assigned_points = dict()
    for i, cluster in enumerate(labels): 
        cluster_to_assigned_points.setdefault(cluster, []).append(X[i])
        
    cluster_to_mean_point =  np.zeros(shape=(len(cluster_to_assigned_points), X.shape[1]), dtype=X.dtype)
    for k, v in cluster_to_assigned_points.items():
        cluster_to_mean_point[k] = pd.Series(v).mean()
        
    return cluster_to_mean_point

In [12]:
ret = move_to_mean(X, labels)
ret

array([[-0.67342087,  4.97845707],
       [ 1.94930666,  0.9867111 ],
       [ 2.88350488,  0.74655634]])

In [13]:
row = [1, 2, 3]

In [14]:
row * 3

[1, 2, 3, 1, 2, 3, 1, 2, 3]

In [15]:
r = np.array([1, 2, 3])

In [16]:
r * 3

array([3, 6, 9])

In [17]:
n = 10
k = 3 
row = row * int(n / k) 
row

[1, 2, 3, 1, 2, 3, 1, 2, 3]

In [18]:
row.extend([row[i] for i in range(n % k)])
row

[1, 2, 3, 1, 2, 3, 1, 2, 3, 1]

In [19]:
cost_matrix = np.zeros(shape=(10, 10), dtype=X.dtype)
for i in range(10): 
    cost_matrix[i] = np.array(row)

cost_matrix

array([[1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.],
       [1., 2., 3., 1., 2., 3., 1., 2., 3., 1.]])

In [20]:
cost_matrix[5][5]

3.0