In [2]:
import numpy as np
import time
from intercluster.utils import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Problem size: n points in d dimensions, grouped into k clusters.
n = 1000
k = 10
d = 15

# Seeded generator so the synthetic data is identical on every
# Restart Kernel -> Run All (the global np.random state is unseeded).
SEED = 0
rng = np.random.default_rng(SEED)

X = rng.uniform(size = (n, d))
centers = rng.uniform(size = (k, d))

# assignment1: every point belongs to cluster 1 only.
assignment1 = np.zeros((n, k))
assignment1[:, 1] = 1
# assignment2: every point belongs to all k clusters.
assignment2 = np.ones((n, k))

In [13]:
# Confirm the assignment matrix shape (rows = points, columns = clusters).
assignment1.shape

(1000, 10)

In [36]:
# Time the library kmeans_cost on the single-cluster assignment.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
kmeans_cost(X, centers, assignment1)
end = time.perf_counter()
print(end - start)

0.04607677459716797


In [37]:
# Time the library kmeans_cost on the all-clusters assignment.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
kmeans_cost(X, centers, assignment2)
end = time.perf_counter()
print(end - start)

0.07129764556884766


In [22]:
# Broadcasting check: the length-2 divisor is applied to every row,
# so column 0 is divided by 2 and column 1 by 3.
y = np.array([[1, 2], [2, 3]])
np.divide(y, np.array([2, 3]))

array([[0.5       , 0.66666667],
       [1.        , 1.        ]])

In [23]:
def kmeans_cost2(
    X : NDArray,
    centers : NDArray,
    assignment : NDArray,
    average : bool = False,
    normalize : bool = False
) -> float:
    """
    Computes the squared L2 norm cost of a clustering with an associated set of centers.

    Args:
        X (np.ndarray): (n x d) Dataset
        
        centers (np.ndarray): (k x d) Set of representative centers for each of the k clusters.
        
        assignment (np.ndarray: bool): n x k boolean (or binary) matrix with entry (i,j) 
            being True (1) if point i belongs to cluster j and False (0) otherwise. 
        
        average (bool, optional): Whether to average the per-point cost by the number of clusters
            that the point is assigned to. Points assigned to no cluster contribute 
            zero cost. Defaults to False.
        
        normalize (bool, optional): Whether to normalize the cost by the number of points
            covered in the clustering. Defaults to False.

    Returns:
        cost (float): Total cost of the clustering.
    """
    # Unpacking also enforces that X is 2-dimensional; only n is needed below.
    n, _ = X.shape

    # center_dists is defined outside this notebook; presumably it returns an
    # (n x k) array of squared distances from each point to each center -- TODO confirm.
    center_dist_arr = center_dists(X, centers, norm = 2, square = True)

    # Zero out distances to clusters a point is not assigned to, then total per point.
    per_point_cost = np.sum(center_dist_arr * assignment, axis = 1)
    n_assigns = np.sum(assignment, axis = 1)

    if average:
        # Masked division: unassigned points (n_assigns == 0) would otherwise give
        # 0 / 0 = NaN and turn the whole sum into NaN. `out` must be pre-filled
        # because entries skipped by `where` are left untouched.
        per_point_cost = np.divide(
            per_point_cost,
            n_assigns,
            out = np.zeros(per_point_cost.shape, dtype = float),
            where = n_assigns != 0
        )

    cost = np.sum(per_point_cost)
            
    if normalize:
        # coverage is defined outside this notebook; presumably the fraction of
        # points assigned to at least one cluster -- TODO confirm.
        covered = coverage(assignment) * n
        cost /= covered
        
    return cost

In [32]:
# Time the vectorized kmeans_cost2 on the single-cluster assignment.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
kmeans_cost2(X, centers, assignment1)
end = time.perf_counter()
print(end - start)

0.0012793540954589844


In [33]:
# Time the vectorized kmeans_cost2 on the all-clusters assignment.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
kmeans_cost2(X, centers, assignment2)
end = time.perf_counter()
print(end - start)

0.0016965866088867188


In [38]:
a = np.array([-1.0, 0.0, 1.0, 2.0, 3.0])
b = np.array([0.0, 0.0, 0.0, 2.0, 2.0])

# `where` skips every slot with a zero denominator; skipped slots keep whatever
# `out` already held, so `out` is pre-filled with zeros rather than left uninitialized.
nonzero_denominator = b != 0
c = np.divide(a, b, where=nonzero_denominator, out=np.zeros_like(a))

In [39]:
# Inspect the result of the masked division.
c

array([0. , 0. , 0. , 1. , 1.5])

In [41]:
# Check how np.array_equal treats two empty arrays.
np.array_equal(np.array([]), np.array([]))

True

In [46]:
# Set union produces a new set; `x` itself is left unchanged.
x = {1, 2, 3}
x | {4}

{1, 2, 3, 4}

In [49]:
# New set with the element 2 removed; `x` is not modified.
x.difference({2})

{1, 3}