In [None]:
#default_exp clustering

# Machine Learning Clustering

> Simple clustering techniques implemented with PyTorch, intended as building blocks for more elaborate projects.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#exports
import torch
import pandas
import random
from fastcore.all import *

# K-Means

## Data Processing

In [None]:
#exports
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.sampledata.iris import flowers

In [None]:
#hide
# Preview the first rows of the iris sample frame loaded from bokeh.sampledata.
flowers.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_idx
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [None]:
#export
class Categorize(Transform):
    """Two-way lookup between the distinct values of a series and their integer positions.

    `idx2val` holds the distinct values and `val2idx` the reverse lookup built
    from it. NOTE(review): `encodes` maps index -> value and `decodes` maps
    value -> index; the notebook relies on `decodes` to turn species names into
    integer labels, so the direction must not be swapped without updating callers.
    """
    as_item_force=False
    def __init__(self, data: pandas.core.series.Series):
        """Collect the distinct values of `data` and build the reverse lookup."""
        # NOTE(review): Transform.__init__ is not called here - confirm that is intentional.
        # `L.unique()` presumably keeps first-seen order; verify against fastcore.
        data = L(list(data)).unique()
        self.idx2val = data
        # Presumably a dict-like value -> position mapping (fastcore `L.val2idx`).
        self.val2idx = data.val2idx()

    # Integer position -> stored value.
    def encodes(self, idx: int): return self.idx2val[idx]
    # Stored value (a string such as a species name) -> integer position.
    def decodes(self, cat: str): return self.val2idx[cat]

In [None]:
#exports
# Build the species lookup, then add an integer label column by mapping each
# species name through `cat.decodes` (value -> index).
# Note: this writes the new column into the imported `flowers` frame itself.
cat = Categorize(flowers["species"])
flowers["species_idx"] = flowers.species.map(cat.decodes)

In [None]:
# Check that the `species_idx` column was added.
flowers.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_idx
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


## Plotting

In [None]:
#export
def plot_iris(data: pandas.core.frame.DataFrame):
    """Scatter petal length against petal width, coloured by `species_idx`.

    `data` must provide the columns `petal_length`, `petal_width` and
    `species_idx`; the label values 0, 1 and 2 are drawn in red, green and
    blue respectively. The figure is shown inline in the notebook and also
    returned to the caller.
    """
    palette = {0: 'red', 1: 'green', 2: 'blue'}
    point_colors = [palette[label] for label in data['species_idx']]

    fig = figure(title="Iris Morphology")
    fig.xaxis.axis_label = 'Petal Length'
    fig.yaxis.axis_label = 'Petal Width'

    fig.circle(
        data["petal_length"],
        data["petal_width"],
        color=point_colors,
        fill_alpha=0.2,
        size=10,
    )

    # Route Bokeh output to the notebook before rendering.
    output_notebook()
    show(fig)

    return fig

In [None]:
# Plot the labelled iris data (ground-truth species colours).
plot_iris(flowers)

## K-Means Clustering

In [None]:
#exports
# K-means configuration: iteration budget and number of clusters.
it = 100
k = 3
# The first four columns of `flowers` are used as the feature matrix.
data = torch.tensor(flowers.iloc[:, :4].values)
# Initialise the centers with k distinct, randomly chosen data rows.
centers = data[random.sample(range(data.shape[0]), k)]

In [None]:
#export
def dist(point:torch.Tensor, cluster:torch.Tensor):
    """Squared Euclidean distance between `point` and `cluster`.

    The difference is squared and summed over the last dimension, so two 1-D
    vectors give a scalar tensor and broadcastable batches give one distance
    per row.

    The previous implementation compared `point[0]` with `cluster[1]` and used
    the builtin `sum`, which raised on 1-D inputs and never measured the
    distance between the two arguments.
    """
    return ((point - cluster) ** 2).sum(dim=-1)

In [None]:
#export
def get_distances(data:torch.Tensor, centers:torch.Tensor):
    """Squared Euclidean distance from every row of `data` to every center.

    `data` is expected as (n_points, n_features) and `centers` as
    (n_centers, n_features); the result has shape (n_points, n_centers).

    Broadcasting replaces the former `torch.cat([data_, data_, data_])`, which
    only worked for exactly three centers.
    """
    # (n, 1, d) - (k, d) broadcasts to (n, k, d).
    diff = data.unsqueeze(1) - centers
    return torch.sum(diff**2, 2)

In [None]:
#export
def calc_centers(data:torch.tensor, groups:torch.tensor, k:int):
    """Return the per-cluster mean of `data` as a (k, n_features) tensor.

    `groups` holds one cluster label per row of `data`; row `i` of the result
    is the mean of all rows labelled `i`. A label with no members produces a
    NaN row, because the mean is taken over an empty selection.
    """
    means = []
    for label in range(k):
        members = data[groups == label]
        means.append(members.mean(dim=0))
    return torch.stack(means, dim=0)

In [None]:
#exports
# Lloyd iterations: assign every point to its nearest center, then move each
# center to the mean of the points assigned to it.
for _ in range(it):
    # The distance matrix is consumed inline: a module-level variable called
    # `distances` would shadow the `distances()` function defined further down
    # whenever this cell is re-run.
    groups = torch.argmin(get_distances(data, centers), 1)
    # Use the configured cluster count rather than a hard-coded 3.
    centers = calc_centers(data, groups, k)

## Show Results

In [None]:
#exports
# Append the cluster label as a fifth column next to the four features.
# The array is assembled with torch because `np` is not imported explicitly in
# this notebook; the labels are cast to the feature dtype, which is the same
# promotion the previous numpy concatenation applied.
np_results = torch.cat((data, groups.unsqueeze(1).to(data.dtype)), dim=1).numpy()
# Column labels: every `flowers` column except the textual `species`.
results = pandas.DataFrame(np_results, columns=flowers.columns[flowers.columns!="species"])

In [None]:
# Plot the same scatter coloured by the k-means cluster assignment.
plot_iris(results)

# C-Means

## Prep

In [None]:
# JavaScript body of the slider callback used by `c_plot_iris`: each fill-alpha
# column is rewritten as the matching membership column scaled by the slider
# value. `source` and `slider` are supplied through the CustomJS `args`.
# All locals are declared with `var`: the earlier implicit globals
# (`red_fill = ...`, `for (i = 0; ...`) raise a ReferenceError when the
# callback body is evaluated in strict mode.
# NOTE(review): this cell carries no export flag although the exported
# `c_plot_iris` reads `update_js` - confirm whether it should be exported too.
update_js = """
    var data = source.data;
    var f = slider.value;
    var red_fill = data['red_fill'];
    var green_fill = data['green_fill'];
    var blue_fill = data['blue_fill'];
    for (var i = 0; i < data['setosa'].length; i++) {
        red_fill[i] = data['setosa'][i]*f;
        green_fill[i] = data['versicolor'][i]*f;
        blue_fill[i] = data['virginica'][i]*f;
    }

    source.change.emit();
"""

In [None]:
# JavaScript body of the checkbox callback used by `c_plot_iris`: a renderer is
# visible only while its index (0 = red, 1 = green, 2 = blue) is present in
# `select.active`. `red`, `green`, `blue` and `select` are supplied through the
# CustomJS `args`.
# NOTE(review): this cell carries no export flag although the exported
# `c_plot_iris` reads `hide_js` - confirm whether it should be exported too.
hide_js = """
    var indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
    red.visible = indexOf.call(select.active,0)>=0;
    green.visible = indexOf.call(select.active,1)>=0;
    blue.visible = indexOf.call(select.active,2)>=0;
    red.change.emit();
    green.change.emit();
    blue.change.emit();
"""

In [None]:
# HTML hover template passed to `figure(tooltips=...)` in `c_plot_iris`; the
# `@name` placeholders refer to columns of the plotted data source.
# NOTE(review): this cell carries no export flag although the exported
# `c_plot_iris` reads `tooltips` - confirm whether it should be exported too.
tooltips = """
        <div>
            <h3>petal_length:</h3> @petal_length; <h3>petal_width:</h3> @petal_width; <h3>categories:</h3> @setosa, @versicolor, @virginica
        </div>
    """

In [None]:
#exports
# One-hot membership columns named after the species themselves.
one_hot = pandas.get_dummies(flowers['species'], dtype=float)
# The same one-hot encoding again, with columns named after the fill colour
# each species is drawn in.
one_hot_show = pandas.get_dummies(
    flowers['species'].replace({'setosa': 'red_fill', 'versicolor': 'green_fill', 'virginica': 'blue_fill'}),
    dtype=float,
)
# `join` returns new frames, so `flowers` itself is left untouched.
c_flowers = flowers.join(one_hot).join(one_hot_show)
c_flowers

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_idx,setosa,versicolor,virginica,blue_fill,green_fill,red_fill
0,5.1,3.5,1.4,0.2,setosa,0,1.0,0.0,0.0,0.0,0.0,1.0
1,4.9,3.0,1.4,0.2,setosa,0,1.0,0.0,0.0,0.0,0.0,1.0
2,4.7,3.2,1.3,0.2,setosa,0,1.0,0.0,0.0,0.0,0.0,1.0
3,4.6,3.1,1.5,0.2,setosa,0,1.0,0.0,0.0,0.0,0.0,1.0
4,5.0,3.6,1.4,0.2,setosa,0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2,0.0,0.0,1.0,1.0,0.0,0.0
146,6.3,2.5,5.0,1.9,virginica,2,0.0,0.0,1.0,1.0,0.0,0.0
147,6.5,3.0,5.2,2.0,virginica,2,0.0,0.0,1.0,1.0,0.0,0.0
148,6.2,3.4,5.4,2.3,virginica,2,0.0,0.0,1.0,1.0,0.0,0.0


## Plotting

In [None]:
#exports
from bokeh.models import CheckboxGroup, HoverTool, ColumnDataSource, CustomJS, Slider
from bokeh.layouts import column, layout

In [None]:
#export
def c_plot_iris(data: pandas.core.frame.DataFrame):
    """Show an interactive petal scatter with per-cluster transparency and return the figure.

    Every row is drawn three times (red, green and blue circles) and the fill
    alpha of each copy is read from the `red_fill`, `green_fill` and
    `blue_fill` columns. A checkbox group toggles the three renderers and a
    slider rescales the fill columns from the `setosa`, `versicolor` and
    `virginica` columns.

    `data` must provide `petal_length`, `petal_width`, the three `*_fill`
    columns and the three membership columns named above. The function reads
    the module-level strings `tooltips`, `update_js` and `hide_js`.

    Returns the Bokeh figure, matching `plot_iris`; it previously returned
    nothing even though callers assign the result.
    """
    source = ColumnDataSource(data)

    p = figure(title = 'Iris Morphology', tooltips=tooltips)
    p.xaxis.axis_label = 'Petal Length'
    p.yaxis.axis_label = 'Petal Width'

    # Three overlapping renderers share one data source; only the fill-alpha column differs.
    red_circles = p.circle("petal_length", "petal_width", color="red", fill_alpha="red_fill", size=10, line_alpha=0, source=source)
    green_circles = p.circle("petal_length", "petal_width", color="green", fill_alpha="green_fill", size=10, line_alpha=0, source=source)
    blue_circles = p.circle("petal_length", "petal_width", color="blue", fill_alpha="blue_fill", size=10, line_alpha=0, source=source)

    select = CheckboxGroup(labels=["red","green","blue"], active=[0,1,2], width=100)
    slider = Slider(start=0.1, end=1, value=1, step=.01, title="Transparency")

    # Browser-side callbacks: the slider rescales the fill columns, the checkboxes toggle visibility.
    param_update = CustomJS(args=dict(source=source, slider=slider, select=select), code=update_js)
    hide_update = CustomJS(code=hide_js, args=dict(red=red_circles, green=green_circles, blue=blue_circles, select=select))
    slider.js_on_change('value', param_update)
    select.js_on_change('active', hide_update)
    tweak = column(select, slider)
    output_notebook()
    show(layout([[p, tweak]]))
    return p

In [None]:
# Show the interactive plot for the one-hot (ground-truth) memberships.
p = c_plot_iris(c_flowers)

## C-Means Clustering

In [None]:
#export
def c_calc_centers(U:torch.Tensor, points:torch.Tensor, m:float=2.0):
    """Fuzzy c-means center update: membership-weighted mean of `points`.

    `U` is the (n_points, n_clusters) membership matrix and `points` the
    (n_points, n_features) data; the result is (n_clusters, n_features).

    Each point is weighted by `U ** m`, where `m` is the fuzzifier. The default
    `m=2` matches the membership rule in `update_u`, which uses the plain ratio
    of squared distances. Weighting by `U` alone (the earlier behaviour,
    available as `m=1`) is inconsistent with that rule.
    """
    weights = U ** m
    # (d, n) @ (n, k) -> one weighted coordinate sum per cluster.
    weighted_sum = points.t() @ weights
    weighted_mean = weighted_sum / weights.sum(dim=0)
    return weighted_mean.t()

In [None]:
#export
def update_u(centers:torch.Tensor, point:torch.Tensor, m:float=2.0):
    """Fuzzy c-means membership update.

    For point i and cluster l this returns
    1 / sum_j (d[i, l] / d[i, j]) ** (1 / (m - 1)),
    where d holds squared distances from `distances(centers, point)`. With the
    default fuzzifier `m=2` the exponent is 1, which is the original behaviour.

    Raises ValueError when `m <= 1`, where the exponent is undefined.
    """
    if m <= 1:
        raise ValueError("fuzzifier m must be greater than 1")
    d_ij = distances(centers, point)
    if not d_ij.is_floating_point():
        d_ij = d_ij.to(torch.get_default_dtype())
    # A point lying exactly on a center would give 0/0 = NaN and poison every
    # later iteration; clamping to the smallest positive float assigns that
    # point membership ~1 for the coinciding center and ~0 for the others.
    d_ij = d_ij.clamp_min(torch.finfo(d_ij.dtype).tiny)
    # (n, k) / (k, n, 1) broadcasts to (k, n, k): entry [j, i, l] = d[i, l] / d[i, j].
    dist_proportions = (d_ij/d_ij.t().unsqueeze(2)) ** (1.0 / (m - 1.0))
    return 1/dist_proportions.sum(dim=0)

In [None]:
#export
def distances(centers:torch.Tensor, point:torch.Tensor):
    """Squared Euclidean distance from each row of `point` to each row of `centers`.

    The result is indexed as [point, center].
    """
    # Insert a center axis so the subtraction broadcasts over all centers.
    offsets = point[:, None] - centers
    return offsets.pow(2).sum(dim=2)

In [None]:
#exports
# Feature matrix as float32 (the first four columns of `c_flowers`).
c_data = torch.tensor(c_flowers[c_flowers.columns[:4]].values, dtype=torch.float32)
# Random hard assignment as the initial membership matrix: exactly one 1 per
# row. The row count is taken from the data instead of a hard-coded 150.
n_clusters = 3
U = torch.zeros(len(c_data), n_clusters).scatter(1, torch.randint(n_clusters, (len(c_data), 1)), 1.)

# Iteration budget and convergence threshold on the squared change of U.
it = 100
eps = 10e-8  # note: 10e-8 equals 1e-7

In [None]:
#exports
# Alternate center and membership updates until the memberships stop moving
# or the iteration budget is used up.
for step in range(it):
    centers = c_calc_centers(U, c_data)
    U_new = update_u(centers, c_data)
    # Stop once the summed squared change in memberships drops below eps;
    # U keeps the previous iterate in that case.
    if (U - U_new).pow(2).sum() < eps:
        break
    U = U_new

## Show Results

In [None]:
# Features, then the memberships twice: once under the species names (read by
# the tooltips and the slider callback) and once as fill-alpha columns.
# The array is assembled with torch because `np` is not imported explicitly in
# this notebook.
c_np_results = torch.cat((c_data, U, U), dim=1).numpy()
# The fill columns are named explicitly in red/green/blue order so that cluster
# 0 feeds both `setosa` and `red_fill`, matching what the slider callback
# writes. Reusing the column order of `c_flowers` (blue, green, red) made the
# colours swap on the first slider move.
c_results = pandas.DataFrame(
    c_np_results,
    columns=list(c_flowers.columns[:4])
    + ["setosa", "versicolor", "virginica"]
    + ["red_fill", "green_fill", "blue_fill"],
)

In [None]:
# Show the interactive plot for the fuzzy c-means memberships.
p = c_plot_iris(c_results)