In [None]:
from coreset_streaming import Coreset_Streaming
import numpy as np
import random 
import matplotlib.pyplot as plt
import helper_functions as hf

In [None]:
def add_n_points_to_cs(cs, n, r):
    """Add ``n`` random 2-D integer points to a coreset.

    Each coordinate is drawn independently with ``random.randint(0, r)``,
    so both endpoints of the range are possible values.

    Args:
        cs: coreset; only its ``add_point`` method is used here.
        n: number of points to add.
        r: range from 0 to r (inclusive) to sample from.
    """
    for _ in range(n):
        # x is drawn before y, so a seeded `random` gives a stable sequence.
        cs.add_point([random.randint(0, r), random.randint(0, r)])
        
def generate_uniform_test_data(n_points=10000, low=0, high=100):
    """Generate a stream of random 2-D integer points.

    Every coordinate is drawn with ``np.random.randint(low, high)`` from
    NumPy's global random state, so ``low`` is inclusive and ``high`` is
    exclusive. Called with no arguments it produces 10000 points with
    coordinates in [0, 100).

    Args:
        n_points: number of points to generate.
        low: smallest coordinate value (inclusive).
        high: upper bound of the coordinate values (exclusive).

    Returns:
        A list of ``[x, y]`` lists, one per point.
    """
    return [
        # x is drawn before y for each point.
        [np.random.randint(low, high), np.random.randint(low, high)]
        for _ in range(n_points)
    ]

def get_arrs_for_plotting_stream(stream):
    """Split an iterable of 2-element points into parallel coordinate lists.

    Args:
        stream: iterable whose items each unpack into exactly two values.

    Returns:
        A tuple ``(xs, ys)`` of two lists, in the order the points were seen.
    """
    # Unpack every item up front so a malformed point still raises.
    pairs = [(first, second) for first, second in stream]
    xs = [pair[0] for pair in pairs]
    ys = [pair[1] for pair in pairs]
    return (xs, ys)

def get_arrs_for_plotting_coreset(coreset):
    """Split an iterable of ``((x, y), weight)`` entries into three lists.

    Args:
        coreset: iterable whose items unpack as ``((x, y), weight)``.

    Returns:
        A tuple ``(xs, ys, weights)`` of lists, in iteration order.
    """
    # Flatten each entry once; the nested unpacking validates its shape.
    flat = [(px, py, w) for (px, py), w in coreset]
    return (
        [entry[0] for entry in flat],
        [entry[1] for entry in flat],
        [entry[2] for entry in flat],
    )

def _union_into(target, other):
    """Merge coreset ``other`` into ``target``, aligning resolutions first."""
    while not target.can_union(other):
        # will the resolution of `other` ever be more than that of `target`?
        # Both directions are handled: double `other` when `target` reports
        # the larger resolution, otherwise double `target`.
        if target.resolution > other.resolution:
            other.double_resolution()
        else:
            target.double_resolution()
    target.union(other)


def run_stream(stream, max_cs_size=1000, chunk_size=1000):
    """Feed ``stream`` through two coresets chunk by chunk and merge them.

    The first ``chunk_size`` points go straight into the result coreset.
    Later points are collected in a scratch coreset; every time it has
    received ``chunk_size`` points it is merged into the result and replaced
    by a fresh one. Before each merge, resolutions are doubled for as long as
    ``can_union`` reports the two coresets as incompatible.

    The merge always happens once the coresets are compatible, and a trailing
    partial chunk is merged at the end, so no points are dropped. A stream
    shorter than ``chunk_size`` is accepted.

    Args:
        stream: indexable sequence of points (``len`` and ``[]`` are used).
        max_cs_size: passed to every ``Coreset_Streaming`` constructor.
        chunk_size: number of points collected between merges; must be >= 1.

    Returns:
        The ``Coreset_Streaming`` instance holding the merged result.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    a = Coreset_Streaming(max_cs_size)
    b = Coreset_Streaming(max_cs_size)

    # First chunk goes directly into `a`; clamp so short streams don't index
    # past the end.
    first_chunk_end = min(chunk_size, len(stream))
    for i in range(first_chunk_end):
        a.add_point(stream[i])

    pending = 0  # points added to `b` since the last merge
    for i in range(first_chunk_end, len(stream)):
        b.add_point(stream[i])
        pending += 1

        if pending == chunk_size:
            _union_into(a, b)
            b = Coreset_Streaming(max_cs_size)
            pending = 0

    # Flush whatever is left when the stream length is not a multiple of
    # chunk_size.
    if pending:
        _union_into(a, b)

    return a

def get_true_and_coreset_kmeans_centers(stream, coreset, k, rng):
    """Run ``hf.weighted_kmeans`` on the full stream and on the coreset.

    Args:
        stream: points passed unchanged to the first k-means call.
        coreset: object whose ``coreset`` attribute iterates over
            ``(point, weight)`` pairs.
        k: number of centres requested from both calls.
        rng: random generator forwarded to both calls (stream call first).

    Returns:
        A tuple ``(kmeans_centers, kmeans_coreset)``: the result for the
        stream followed by the result for the weighted coreset points.
    """
    # Walk the coreset once, then split it into points and weights.
    entries = [(point, weight) for (point, weight) in coreset.coreset]
    cs_points = [entry[0] for entry in entries]
    weights = [entry[1] for entry in entries]

    centers_on_stream = hf.weighted_kmeans(stream, k, rng)
    centers_on_coreset = hf.weighted_kmeans(np.array(cs_points), k, rng, w=weights)
    return centers_on_stream, centers_on_coreset

def plot_true_and_coreset_kmeans(kmeans_centers, kmeans_coreset, stream, save=False, save_fn="img.png", coreset=None):
    """Scatter the stream, the coreset and both sets of k-means centres.

    Args:
        kmeans_centers: iterable of (x, y) centres computed on the stream.
        kmeans_coreset: iterable of (x, y) centres computed on the coreset.
        stream: iterable of (x, y) points drawn as the background cloud.
        save: when true, write the figure to ``save_fn`` instead of showing it.
        save_fn: output path used when ``save`` is true.
        coreset: object whose ``coreset`` attribute iterates over
            ``((x, y), weight)`` entries. When omitted, the module-level
            variable named ``coreset`` is used, which is what this function
            always read before the parameter existed.

    Raises:
        NameError: if ``coreset`` is omitted and no module-level ``coreset``
            exists.
    """
    if coreset is None:
        # Legacy fallback: keep existing three-argument calls working.
        try:
            coreset = globals()["coreset"]
        except KeyError:
            raise NameError(
                "name 'coreset' is not defined; pass coreset=... explicitly"
            ) from None

    (og_xs, og_ys) = get_arrs_for_plotting_stream(stream)
    (cs_xs, cs_ys, _) = get_arrs_for_plotting_coreset(coreset.coreset)
    (kxs, kys) = get_arrs_for_plotting_stream(kmeans_centers)
    (ckxs, ckys) = get_arrs_for_plotting_stream(kmeans_coreset)

    fig = plt.figure(figsize=(8, 6), dpi=80)
    plt.scatter(og_xs, og_ys, s=0.1, label="Original points")
    plt.scatter(cs_xs, cs_ys, s=1, c="orange", label="Coreset")
    plt.scatter(kxs, kys, s=20, c="red", label="K-means on Original points")
    plt.scatter(ckxs, ckys, s=20, c="purple", label="Weighted K-means on Coreset points")
    plt.legend(loc="lower right", fontsize=9)
    if save:
        fig.savefig(save_fn)
        # Release the figure; otherwise repeated calls pile up open figures.
        plt.close(fig)
    else:
        plt.show()
    
def plot_coreset_sized_by_weight(stream, coreset, save=False, save_fn="img.png"):
    """Scatter the stream with the coreset on top, marker size = weight.

    Args:
        stream: iterable of (x, y) points drawn as the background cloud.
        coreset: object whose ``coreset`` attribute iterates over
            ``((x, y), weight)`` entries; each weight is used directly as
            the marker size of its point.
        save: when true, write the figure to ``save_fn`` instead of showing it.
        save_fn: output path used when ``save`` is true.
    """
    (og_xs, og_ys) = get_arrs_for_plotting_stream(stream)
    (cs_xs, cs_ys, weights) = get_arrs_for_plotting_coreset(coreset.coreset)

    fig = plt.figure(figsize=(8, 6), dpi=80)
    plt.scatter(og_xs, og_ys, s=0.1, label="Original points")
    plt.scatter(cs_xs, cs_ys, s=weights, c="red", label="Coreset")
    plt.legend(loc="lower right", fontsize=9)
    if save:
        fig.savefig(save_fn)
        # Release the figure; otherwise repeated calls pile up open figures.
        plt.close(fig)
    else:
        plt.show()

In [None]:
# Sanity check on uniformly distributed points: build the coreset and draw it
# over the raw stream, with each coreset marker sized by its weight.
# generate_uniform_test_data draws from NumPy's global random state, so seed
# it to make this cell reproducible on a fresh kernel.
np.random.seed(12345)
uniform_stream = generate_uniform_test_data()
coreset = run_stream(uniform_stream, max_cs_size=1000)
# Same figure the inline plotting code used to build by hand.
plot_coreset_sized_by_weight(uniform_stream, coreset)

In [None]:
# Simulate three 2-D Gaussian clusters (3000, 2000 and 5000 points), build a
# coreset of at most 1000 entries from them and plot it sized by weight.
rng = np.random.default_rng(12345)
n = 10000  # not referenced in this cell; equals the sum of the cluster sizes
k = 3
means1 = [[5, 5], [-5, -5], [0, 0]]
# Two identity covariances and one cluster stretched along x.
covs1 = [
    np.array(cov)
    for cov in ([[1, 0], [0, 1]], [[1, 0], [0, 1]], [[7, 0], [0, 1]])
]
gaussian_stream = hf.simulate_gaussian_clusters(
    rng, [3000, 2000, 5000], k, means1, covs1
)
coreset = run_stream(gaussian_stream, max_cs_size=1000)
plot_coreset_sized_by_weight(gaussian_stream, coreset)

In [None]:
# Compare k-means on the full Gaussian stream with weighted k-means on its
# coreset, then draw both sets of centres over the data.
kmeans_centers, kmeans_coreset = get_true_and_coreset_kmeans_centers(
    stream=gaussian_stream,
    coreset=coreset,
    k=k,
    rng=rng,
)
plot_true_and_coreset_kmeans(
    kmeans_centers=kmeans_centers,
    kmeans_coreset=kmeans_coreset,
    stream=gaussian_stream,
)

In [None]:
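# k-median code (disabled; kept for reference).
# NOTE: og_xs/og_ys and cs_xs/cs_ys are not defined in this cell; recompute them from
# gaussian_stream and coreset.coreset before re-enabling, or the plot will use stale data.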
# cs_points = []
# weights = []
# for (point, weight) in coreset.coreset:
#     cs_points.append(point)
#     weights.append(weight)
# k = 3
# kmedians_centers = hf.weighted_kmedians(gaussian_stream, k, rng)
# kmedians_coreset = hf.weighted_kmedians(np.array(cs_points), k, rng, w = weights)
# (kxs, kys) = get_arrs_for_plotting_stream(kmedians_centers)
# (ckxs, ckys) = get_arrs_for_plotting_stream(kmedians_coreset)
# fig = plt.figure(figsize=(8, 6), dpi=80)
# plt.scatter(og_xs, og_ys, s=0.1, label="Original points")
# plt.scatter(cs_xs, cs_ys, s=1, c="orange", label="Coreset")
# plt.scatter(kxs, kys, s=20, c="red", label="K-medians on Original points")
# plt.scatter(ckxs, ckys, s=20, c="purple", label="Weighted K-medians on Coreset points")
# plt.legend(loc="lower right", fontsize=9)

In [None]:
# Sweep the maximum coreset size. For each size, repeat the experiment
# num_repeats times and record |cost(true centres) - cost(coreset centres)|.
max_sizes = [5, 10, 30, 50, 100, 500, 1000, 2000, 3000, 5000, 7000, 10000, 12000]
num_repeats = 5 # number of times to run k-means to make sure the number is reliable
all_diffs = []
xs = []
for coreset_size in max_sizes:
    print(coreset_size)
    cost_diffs = []
    for i in range(num_repeats):
        try:
            coreset = run_stream(gaussian_stream, max_cs_size=coreset_size, chunk_size=1000)
            cs_points = [point for (point, weight) in coreset.coreset]

            kmeans_centers, kmeans_coreset = get_true_and_coreset_kmeans_centers(gaussian_stream,
                                                                                 coreset,
                                                                                 k, rng)
            # The file names depend only on coreset_size, so every repeat
            # overwrites the images written by the previous one.
            plot_true_and_coreset_kmeans(kmeans_centers, kmeans_coreset, gaussian_stream,
                                         save=True, save_fn="{}.png".format(coreset_size))

            plot_coreset_sized_by_weight(gaussian_stream, coreset, save=True, save_fn="weights_{}.png".format(coreset_size))

            # NOTE(review): both costs are evaluated on the unweighted coreset
            # points rather than on gaussian_stream; confirm this is the
            # intended comparison.
            true_cost = hf.cluster_cost(cs_points, kmeans_centers)
            cs_cost = hf.cluster_cost(cs_points, kmeans_coreset)
            cost_diffs.append(abs(true_cost - cs_cost))
        except Exception as exc:
            # Best effort: a failed repeat is skipped, but reported so that
            # missing measurements are visible. Only Exception is caught, so
            # interrupting the kernel still stops the sweep.
            print("coreset_size={} repeat={} failed: {!r}".format(coreset_size, i, exc))
        finally:
            # Up to 2 figures per repeat; close them so they don't accumulate.
            plt.close("all")
    xs.append(coreset_size)
    all_diffs.append(cost_diffs)

In [None]:
# Summarise each coreset size by the median of its recorded cost differences
# (the variable keeps its historical name, but the statistic is a median).
diff_means = [np.median(diff_vec) for diff_vec in all_diffs]

# Sizes are plotted as strings, i.e. as evenly spaced categories.
str_xs = list(map(str, xs))
plt.figure(figsize=(8, 6), dpi=80)
plt.scatter(str_xs, diff_means)
plt.xlabel("Max Coreset Size")
plt.ylabel("Cost difference")
plt.title("Cost difference between coreset kmeans and regular k-means")