In [None]:
from functools import reduce
from ipywidgets import interact
from matplotlib import pyplot as plt, cm
import numpy as np
import operator
import sklearn.cluster

In [None]:
# Horizontal extent of the synthetic data; also read by the bucketed-fit cell.
x_min = 0
x_max = 10


def generate(m1, m2, size, xt=6):
    """Sample noisy points from a continuous two-piece linear function.

    Parameters
    ----------
    m1 : slope used where ``x < xt``.
    m2 : slope used where ``x >= xt``.
    size : number of samples to draw.
    xt : x-position of the breakpoint between the two pieces.

    Returns
    -------
    (X, Y) : two arrays of length ``size``.  ``X`` is uniform on
        ``[x_min, x_max)``; ``Y`` is the piecewise line plus standard-normal
        noise.

    The first piece has intercept 0; the second intercept is chosen so that
    both pieces take the same value at ``x == xt``.  Both draws come from the
    global NumPy random state (uniform first, then normal).
    """
    X = np.random.uniform(x_min, x_max, size=size)
    b1 = 0
    # m1*xt + b1 == m2*xt + b2  =>  b2 = (m1 - m2)*xt when b1 == 0.
    b2 = (m1 - m2) * xt
    left_of_break = X < xt
    M = np.where(left_of_break, m1, m2)
    B = np.where(left_of_break, b1, b2)
    Y = np.random.normal(size=X.size) + M * X + B
    return X, Y


# Draw 101 samples: slope 2 before the breakpoint, slope -1 after it.
X, Y = generate(2, -1, 101)

# Markers only (no connecting line); small and translucent so overlaps stay visible.
plt.plot(X, Y, marker='o', linestyle='None', markersize=3, alpha=0.3)
# Disabled overlays of the two underlying lines and their extensions.
# They reference xt, b1, b2, m1 and m2, which exist only inside generate(),
# so they cannot be re-enabled as written.
#plt.plot([0, xt], [b1, m1*xt+b1], 'r')
#plt.plot([xt, x_max], [m1*xt+b1, m2*x_max+b2], 'm')
#plt.plot([xt, x_max], [m1*xt+b1, m1*x_max+b1], 'r--', alpha=0.5)
#plt.plot([0, xt], [b2, m2*xt+b2], 'm--', alpha=0.5)

In [None]:
# Hand-written toy data: rises by 3 per step up to 9, then falls by 1 per step.
X = np.arange(8)
Y = np.array([0, 3, 6, 9] + [8, 7, 6, 5])


def fit_plane(x, y):
    """Least-squares fit of the line ``y ~ m * x + b``.

    Parameters
    ----------
    x : 1-D array-like of sample positions.
    y : array-like of sample values with one entry per element of ``x``.

    Returns
    -------
    Array ``[m, b]`` (slope first, intercept second).

    The system is solved with ``np.linalg.lstsq`` instead of explicitly
    inverting ``A.T @ A``: the result is the same for well-conditioned data,
    but rank-deficient input (for example all ``x`` equal) now yields the
    minimum-norm solution instead of raising ``LinAlgError``.
    """
    x = np.asarray(x)
    # Design matrix with one row [x_i, 1] per sample.
    A = np.column_stack([x, np.ones(x.size)])
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(A, y, rcond=None)
    return solution


def partition(items, labels):
    """Group ``items`` by label.

    Returns a dict mapping every distinct value in ``labels`` to the entries
    of ``items`` selected by the boolean mask ``labels == value``.
    """
    groups = {}
    for value in set(labels):
        groups[value] = items[labels == value]
    return groups


def zip_values(*dicts):
    """Combine dictionaries on their shared keys.

    Returns a dict that maps every key present in *all* of ``dicts`` to the
    list of its values, one per input dictionary, in argument order.

    Keys appear in the order in which they occur in the first dictionary, so
    the result is deterministic.  Calling the function without arguments
    returns an empty dict.
    """
    if not dicts:
        return {}
    first, *others = dicts
    return {
        key: [d[key] for d in dicts]
        for key in first
        if all(key in other for other in others)
    }


def itemize(*dicts):
    """Turn the shared keys of ``dicts`` into rows.

    Each row is a list that starts with the key and continues with the values
    that ``zip_values`` collected for that key, one per input dictionary.
    """
    rows = []
    for key, values in zip_values(*dicts).items():
        rows.append([key, *values])
    return rows


@interact(
    split_index=(2, X.size-2)
)
def f(split_index):
    """Fit one line to each side of a split of the module-level ``X``/``Y`` data.

    Samples with index below ``split_index`` get label 0, the rest label 1.
    Each group is fitted with ``fit_plane``; the points and the two fitted
    lines are drawn in the group's colour, and the total sum of squared
    residuals of both fits is shown in the title.
    """
    # Partition into clusters.
    labels = (np.arange(X.size) >= split_index).astype(int)

    # Fit planes.  Keyed by label so lookups never depend on iteration order.
    planes = {}
    errors = {}
    for label in np.unique(labels):
        mask = labels == label
        x = X[mask]
        y = Y[mask]
        slope, intercept = fit_plane(x, y)
        residuals = y - (slope * x + intercept)
        planes[label] = (slope, intercept)
        # Sum of squared residuals of this group's fit.
        errors[label] = float(np.sum(residuals ** 2))

    fig, ax = plt.subplots()

    colors = ['b', 'g']
    ax.scatter(X, Y, c=[colors[label] for label in labels])

    for label, (slope, intercept) in planes.items():
        x = X[labels == label]
        # A straight line only needs its two end points.
        xx = np.array([x.min(), x.max()])
        yy = slope * xx + intercept
        ax.plot(xx, yy, c=colors[label])

    ax.set_title('total squared error = {:.3g}'.format(sum(errors.values())))


In [None]:
@interact(bucket_count=(1,50))
def f(bucket_count=10):
    """Fit an independent line inside each of ``bucket_count`` equal-width x-buckets.

    Reads the module-level ``X``, ``Y``, ``x_min`` and ``x_max``.  Panels:
    top-left shows the data coloured per bucket with each bucket's fitted
    line, top-right shows intercept against slope (one dot per fitted
    bucket), and the bottom row shows slope ``m`` and intercept ``b`` as bar
    charts positioned by bucket index.

    Buckets are half-open ``[x1, x2)`` except the last one, which also
    includes its right edge, so points lying exactly on a boundary are not
    dropped.  Buckets with fewer than two distinct x-values are plotted but
    not fitted, because a line is not determined there.

    NOTE(review): ``X`` and ``Y`` are whatever the module currently holds; the
    preceding cell rebinds them to a small hand-written data set, so a
    top-to-bottom run buckets that data rather than the output of
    ``generate`` -- confirm which one is intended.
    """
    fig, axs = plt.subplots(2, 2, figsize=(15, 8))
    ax0 = axs[0,0]
    ax1 = axs[0,1]
    ax2 = axs[1,0]
    ax3 = axs[1,1]
    fitted = []  # bucket index of every bucket that received a fit
    mz = []
    bz = []
    buckets = np.linspace(x_min, x_max, bucket_count+1)
    last_bucket = buckets.size - 2
    for i, (x1, x2) in enumerate(zip(buckets, buckets[1:])):
        if i == last_bucket:
            in_bucket = np.logical_and(X >= x1, X <= x2)
        else:
            in_bucket = np.logical_and(X >= x1, X < x2)
        xx = X[in_bucket]
        yy = Y[in_bucket]
        # tab20 has a fixed number of entries; wrap so later buckets do not
        # all collapse onto the final colour.
        color = cm.tab20(i % cm.tab20.N)
        ax0.plot(xx, yy, 'o', color=color, alpha=0.3)
        if np.unique(xx).size < 2:
            continue
        m, b = np.polyfit(xx, yy, 1)
        fitted.append(i)
        mz.append(m)
        bz.append(b)
        ax0.plot([x1, x2], [m*x1+b, m*x2+b], 'k')
        ax1.plot(m, b, 'o', color=color)

    ax2.bar(fitted, mz)
    ax2.set_title('m')
    ax3.bar(fitted, bz)
    ax3.set_title('b')

In [None]:
def get_points(segments, points_per_unit_dist, y_spread):
    """Scatter noisy sample points along a polyline.

    Parameters
    ----------
    segments : array of shape ``(2, k)``; row 0 holds the x-coordinates and
        row 1 the y-coordinates of the ``k`` polyline vertices.
    points_per_unit_dist : samples per unit of segment length; each segment
        receives ``int(length * points_per_unit_dist)`` points.
    y_spread : scale applied to the standard-normal noise added to ``y``.

    Returns
    -------
    (points, grads, normals) : three arrays of shape ``(n, 2)``.  ``points``
        are the sampled ``(x, y)`` positions, ``grads`` the unit direction
        ``(1, m) / |(1, m)|`` of the segment each point came from, and
        ``normals`` that direction rotated by 90 degrees, ``(-gy, gx)``.

    The normal is perpendicular to the gradient and of unit length for every
    finite slope, including horizontal segments.  A segment whose two
    vertices share the same x-coordinate divides by zero when its slope is
    computed and is not supported.  Draws come from the global NumPy random
    state (uniform x first, then normal noise, per segment).
    """
    segments_slope = np.diff(segments[1,:]) / np.diff(segments[0,:])
    # Seed each list with an empty (0, 2) array so stacking also works when
    # there are no segments at all.
    point_chunks = [np.empty((0, 2))]
    grad_chunks = [np.empty((0, 2))]
    normal_chunks = [np.empty((0, 2))]
    for i in range(segments.shape[1] - 1):
        p1 = segments[:,i]
        p2 = segments[:,i+1]
        n = int(np.linalg.norm(p2 - p1) * points_per_unit_dist)
        m = segments_slope[i]
        x = np.random.uniform(p1[0], p2[0], n)
        noise = np.random.normal(size=n) * y_spread
        y = noise + m * (x - p1[0]) + p1[1]
        direction = np.array([1.0, m])
        grad = direction / np.linalg.norm(direction)
        # Rotating a unit vector by 90 degrees keeps it unit length.
        normal = np.array([-grad[1], grad[0]])
        point_chunks.append(np.column_stack([x, y]))
        grad_chunks.append(np.tile(grad, (n, 1)))
        normal_chunks.append(np.tile(normal, (n, 1)))
    # Stack once at the end instead of growing the arrays inside the loop.
    points = np.vstack(point_chunks)
    grads = np.vstack(grad_chunks)
    normals = np.vstack(normal_chunks)
    return points, grads, normals


def partition(items, labels):
    """Split ``items`` into contiguous runs of equal label.

    A new run starts wherever two neighbouring entries of ``labels`` differ.
    Returns a ``zip`` iterator of ``(run_label, run_items)`` pairs in input
    order; the same label can appear more than once if it recurs later.
    """
    # Position of the first element of every run after the first one.
    run_starts = np.nonzero(np.diff(labels))[0] + 1
    label_runs = np.split(labels, run_starts)
    item_runs = np.split(items, run_starts)
    run_labels = [run[0] for run in label_runs]
    return zip(run_labels, item_runs)


def quantize_color(n):
    """Map a cluster label to a colour of the ``tab20c`` colormap.

    Labels 0..9 are looked up at ``n / 10``.  Larger labels wrap around
    modulo 10, so that labels of 10 and above do not all resolve to the same
    colour at the top end of the map.
    """
    return cm.tab20c((n % 10) / 10)


def _kmeans_step(items, centroids):
    """Run one assign/update k-means iteration.

    Assigns every row of ``items`` to its nearest row of ``centroids``
    (Euclidean distance), then moves each centroid to the mean of its members.
    ``centroids`` is updated in place; the label array is returned.
    """
    # Pairwise distances, shape (n_items, n_centroids).
    distances = np.linalg.norm(
        items[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
    )
    labels = np.argmin(distances, axis=1)
    for label in range(centroids.shape[0]):
        members = items[labels == label]
        # An empty cluster keeps its previous centroid: the mean of zero rows
        # is NaN, and a NaN centroid would win every later argmin.
        if members.shape[0] > 0:
            centroids[label] = members.mean(axis=0)
    return labels


@interact(
    seed=(0,10),
    segment_count=(1,20),
    cluster_count=(1,40),
    points_per_unit_dist=(1,100),
    x_scale=(0.1,3,0.1),
    y_spread=(0.01,0.2,0.01),
    weight=(0.,3.,0.05),
    step_count=(1,50),
)
def f(
    seed=2,
    segment_count=10,
    cluster_count=10,
    points_per_unit_dist=50,
    x_scale=1,
    y_spread=0.05,
    show_labels=True,
    show_lines=True,
    show_normals=False,
    weight=1.0,
    step_count=1,
):
    """Cluster noisy samples of a random polyline and draw the result.

    Parameters
    ----------
    seed : seed for the global NumPy random state.
    segment_count : number of polyline segments (``segment_count + 1`` vertices).
    cluster_count : requested number of clusters; capped at the number of
        sampled points.
    points_per_unit_dist, y_spread : forwarded to ``get_points``.
    x_scale : upper bound of the uniform x-step between consecutive vertices.
    show_labels : when true the initial labels come from
        ``sklearn.cluster.KMeans``, otherwise they are all zero.
    show_lines : draw a fitted line and a centroid marker per cluster.
    show_normals : draw every point with a short tick along its ``grads`` row
        instead of drawing contiguous label runs.
    weight : factor applied to the normal columns of the feature matrix.
    step_count : number of hand-rolled k-means iterations to run.

    NOTE(review): every iteration replaces ``labels``, and the slider's
    minimum ``step_count`` is 1, so the labels chosen by ``show_labels`` are
    only visible when the function is called directly with ``step_count=0``
    -- confirm whether the sklearn fit is still wanted.
    """
    np.random.seed(seed)
    segments = np.vstack([
        np.cumsum(np.random.uniform(0, x_scale, segment_count + 1)),
        np.random.uniform(size=segment_count + 1),
    ])
    points, grads, normals = get_points(segments, points_per_unit_dist, y_spread)

    # Feature rows are (x, y, weight * normal_x, weight * normal_y).
    items = np.hstack([points, weight*normals])
    point_count = points.shape[0]
    # More clusters than points cannot be initialised without duplicates.
    k = min(cluster_count, point_count)

    fig, ax = plt.subplots(figsize=(15,4))

    # Short segments with a low sampling density can yield no points at all;
    # in that case only the polyline itself is drawn.
    if k > 0:
        if show_labels:
            learner = sklearn.cluster.KMeans(n_clusters=k).fit(items)
            labels = learner.predict(items)
        else:
            labels = np.zeros(point_count, dtype=int)

        # Distinct starting rows: sampling with replacement can duplicate a
        # centroid and leave one of the copies without members.  Indexing with
        # an index array copies, so updating centroids never touches items.
        indexes = np.random.choice(point_count, k, replace=False)
        centroids = items[indexes]

        for _ in range(step_count):
            labels = _kmeans_step(items, centroids)

        if show_normals:
            # NOTE(review): the ticks follow `grads` (the segment direction),
            # not `normals`, despite the option's name -- confirm intent.
            for p, g, label in zip(points, grads * 0.05, labels):
                color = quantize_color(label)
                ax.plot(p[0], p[1], 'o', ms=4, color=color, alpha=0.5)
                ax.plot([p[0]-g[0], p[0]+g[0]], [p[1]-g[1], p[1]+g[1]], color=color, alpha=0.5)
        else:
            for label, points_subset in partition(points, labels):
                color = quantize_color(label)
                ax.plot(points_subset[:,0], points_subset[:,1], 'o', color=color, ms=4, alpha=0.4)

        if show_lines:
            for label in np.unique(labels):
                color = quantize_color(label)
                points_subset = points[labels == label]
                xs = points_subset[:,0]
                ys = points_subset[:,1]
                x0 = xs.min()
                x1 = xs.max()
                # A line needs at least two distinct x-values to be determined.
                if x1 > x0:
                    m, b = np.polyfit(xs, ys, 1)
                    ax.plot([x0, x1], [m*x0+b, m*x1+b], color=color, lw=3, alpha=0.5)

                # Centroid marker: a black cross underneath a coloured one.
                ax.plot(centroids[label,0], centroids[label,1], 'kx', ms=20, mew=7, alpha=0.7)
                ax.plot(centroids[label,0], centroids[label,1], 'x', ms=20, mew=5, color=color)

    ax.plot(segments[0,:], segments[1,:], 'o-', color=quantize_color(0), alpha=0.4)
    ax.set_aspect('equal')
    ax.set_ylim(-0.2, 1.2)