In [None]:
#default_exp core

# Machine Learning Clustering

> Simple clustering techniques implemented with PyTorch, intended as building blocks for more elaborate projects.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import torch
import pandas
import random
from fastcore.all import *

# K-Means
## Data Processing

In [None]:
#export
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.sampledata.iris import flowers

In [None]:
# Peek at the first rows of the iris sample data that ships with bokeh.
flowers.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
#exports
class Categorize(Transform):
    "Two-way lookup between the unique values of a Series and their integer positions."
    as_item_force=False
    def __init__(self, data: pandas.core.series.Series):
        # The de-duplicated values are the vocabulary; `val2idx` is its inverse mapping.
        vocab = L(list(data)).unique()
        self.idx2val, self.val2idx = vocab, vocab.val2idx()

    # NOTE(review): `encodes` maps index -> value and `decodes` maps value -> index.
    # That looks inverted relative to the method names, but callers in this
    # notebook use `decodes` to obtain indices, so the direction is kept as is.
    def encodes(self, idx: int):
        return self.idx2val[idx]

    def decodes(self, cat: str):
        return self.val2idx[cat]

In [None]:
#exports
# Build the species lookup, then store each row's integer species index
# in a new `species_idx` column.
cat = Categorize(flowers["species"])
flowers["species_idx"] = flowers["species"].map(cat.decodes)

In [None]:
# Check that the new `species_idx` column was added.
flowers.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_idx
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


## Plotting

In [None]:
#exports
def plot_iris(data: pandas.core.frame.DataFrame):
    """Scatter petal length against petal width, colored by `species_idx`.

    `data` must provide the columns `petal_length`, `petal_width` and
    `species_idx` (values 0, 1 or 2). The figure is shown in the notebook
    and also returned.
    """
    palette = {0: 'red', 1: 'green', 2: 'blue'}
    # One color name per row; an index outside the palette raises KeyError.
    point_colors = [palette[idx] for idx in data['species_idx']]

    fig = figure(title="Iris Morphology")
    fig.xaxis.axis_label, fig.yaxis.axis_label = 'Petal Length', 'Petal Width'

    fig.circle(
        data["petal_length"],
        data["petal_width"],
        color=point_colors,
        fill_alpha=0.2,
        size=10,
    )

    # Route bokeh output to the notebook before rendering.
    output_notebook()
    show(fig)
    return fig

In [None]:
# Reference plot: points are colored by their true species index.
plot_iris(flowers)

## K-Means Clustering

In [None]:
#exports
k = 3       # number of clusters
it = 100    # number of k-means iterations
SEED = 42   # fixes the initial centers so a fresh run reproduces the same clustering

# The first four columns are the numeric measurements; convert them to float32 explicitly.
data = torch.tensor(flowers[flowers.columns[:4]].values, dtype=torch.float32)
# Start from k distinct rows of the data, drawn with a dedicated, seeded RNG
# so the global `random` state is left untouched.
centers = data[random.Random(SEED).sample(range(len(data)), k)]

In [None]:
def dist(point:torch.Tensor, cluster:torch.Tensor):
    """Squared Euclidean distance between `point` and `cluster`.

    The squared differences are summed over the last dimension, so two 1-D
    vectors give a 0-d tensor; higher-rank inputs are broadcast against each
    other first.
    """
    # The previous version indexed `point[0]` and `cluster[1]`, which fails for
    # plain vectors and otherwise compares unrelated rows.
    return torch.sum((point-cluster)**2, dim=-1)

In [None]:
def get_distances(data:torch.Tensor, centers:torch.Tensor):
    """Squared Euclidean distance from every row of `data` to every row of `centers`.

    For 2-D inputs of shape (n, d) and (k, d) the result has shape (n, k).
    Works for any number of centers, not just three.
    """
    # (n, 1, d) - (1, k, d) broadcasts to (n, k, d); no need to tile `data` by hand.
    diff = data.unsqueeze(1) - centers.unsqueeze(0)
    return torch.sum(diff**2, 2)

In [None]:
def calc_centers(data:torch.Tensor, groups:torch.Tensor, k:int):
    """Mean of the rows of `data` assigned to each group `0..k-1`, one row per group.

    A group with no members yields a row of NaN (mean over zero rows).
    """
    means = []
    for label in range(k):
        members = data[groups==label]
        means.append(members.mean(dim=0))
    return torch.stack(means, dim=0)

In [None]:
# k-means iterations: assign every sample to its nearest center, then move
# each center to the mean of the samples assigned to it.
for step in range(it):
    distances = get_distances(data, centers)
    groups = torch.argmin(distances, 1)
    # Use the configured cluster count rather than a literal 3.
    centers = calc_centers(data, groups, k)

## Show Results

In [None]:
# Append the cluster assignment as an extra column after the measurements.
# Done in torch so the cell does not rely on `np`, which this notebook never
# imports explicitly; the integer labels are cast to the data dtype to concatenate.
np_results = torch.cat((data, groups.unsqueeze(1).to(data.dtype)), dim=1).numpy()

In [None]:
# Name the columns explicitly: the four measurement columns plus the cluster
# label. Filtering `flowers.columns` instead would break as soon as any extra
# column is added to `flowers`.
results = pandas.DataFrame(np_results, columns=[*flowers.columns[:4], "species_idx"])

In [None]:
# Same plot, now colored by the k-means assignment stored in `species_idx`.
# Cluster numbers are arbitrary, so colors need not match the reference plot.
plot_iris(results)

# C-Means
## PreProcessing

## Plotting

In [None]:
from bokeh.models import CheckboxGroup, HoverTool, ColumnDataSource

In [None]:
# Work on a copy: plain assignment would only alias `flowers`, so columns added
# to `c_flowers` below would silently appear on `flowers` as well.
c_flowers = flowers.copy()

In [None]:
colormap = {0: 'red', 1: 'green', 2: 'blue'}
# A dict cannot be indexed with a whole Series (it is unhashable and raises
# TypeError); look the color up element-wise instead.
c_flowers["colors"] = flowers["species_idx"].map(colormap)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [None]:
# One color name per row, looked up from the row's species index.
c_flowers["colors"] = [colormap[species_idx] for species_idx in flowers["species_idx"]]

In [None]:
# Inspect the frame, including the newly added `colors` column.
c_flowers

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_idx,colors
0,5.1,3.5,1.4,0.2,setosa,0,red
1,4.9,3.0,1.4,0.2,setosa,0,red
2,4.7,3.2,1.3,0.2,setosa,0,red
3,4.6,3.1,1.5,0.2,setosa,0,red
4,5.0,3.6,1.4,0.2,setosa,0,red
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2,blue
146,6.3,2.5,5.0,1.9,virginica,2,blue
147,6.5,3.0,5.2,2.0,virginica,2,blue
148,6.2,3.4,5.4,2.3,virginica,2,blue


In [None]:
#exports
def c_plot_iris(data: pandas.core.frame.DataFrame):
    """Scatter petal length against petal width with hover tooltips, colored by `species_idx`.

    `data` must provide the columns `petal_length`, `petal_width`, `species`
    and `species_idx` (values 0, 1 or 2). The figure is shown in the notebook
    and returned, matching `plot_iris`.
    """
    colormap = {0: 'red', 1: 'green', 2: 'blue'}
    colors = [colormap[x] for x in data['species_idx']]

    # Put the colors into the data source so the glyph can reference them by
    # column name; `assign` returns a new frame, so the caller's frame is untouched.
    source = ColumnDataSource(data.assign(colors=colors))
    TOOLTIPS = [
        ("petal_length", "@petal_length"),
        ("petal_width", "@petal_width"),
        ("Correct", "@species")
    ]

    p = figure(title = "Iris Morphology", tooltips=TOOLTIPS)
    p.xaxis.axis_label = 'Petal Length'
    p.yaxis.axis_label = 'Petal Width'

    # Previously the colors were computed but never passed to the glyph.
    p.circle("petal_length", "petal_width", color="colors", fill_alpha=0.2, size=10, source=source)

    output_notebook()

    show(p)
    # Return the figure so that `p = c_plot_iris(...)` yields something usable.
    return p

In [None]:
# Checkbox widget with one entry per cluster index (0-2), all ticked initially,
# shown above the tooltip-enabled scatter plot.
select = CheckboxGroup(
    labels=[str(i) for i in range(3)],
    active=list(range(3)),
)
show(select)
p = c_plot_iris(c_flowers)

In [None]:
from bokeh.models.widgets import MultiSelect

# Stand-alone MultiSelect widget demo with placeholder options; each option is
# a (value, label) pair and `value` lists the entries selected initially.
multi_select = MultiSelect(title="Option:", value=["foo", "quux"],
                           options=[("foo", "Foo"), ("bar", "BAR"), ("baz", "bAz"), ("quux", "quux")])

show(multi_select)

## C-means