<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Clustering chapter. Example of unsupervised learning.

# Run this cell only once per VM; re-running it clones the repository again and leaves you in nested directories.
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb
import import_ipynb

Cloning into 'data-science-from-scratch-Python'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 480 (delta 60), reused 0 (delta 0), pack-reused 387[K
Receiving objects: 100% (480/480), 1.18 MiB | 14.01 MiB/s, done.
Resolving deltas: 100% (300/300), done.
/content/data-science-from-scratch-Python
Collecting import-ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=b785eb1d06d2762e12a3bd03480f6768a2ba359dc1cef515e7321fa2ffe7a8ee
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb

In [3]:
from Chapter_04 import Vector

def num_differences(v1: Vector, v2: Vector) -> int:
  """Count the positions at which two equal-length vectors hold different values.

  Raises AssertionError when the two vectors do not have the same length.
  """
  assert len(v1) == len(v2)
  mismatches = 0
  for left, right in zip(v1, v2):
    if left != right:
      mismatches += 1
  return mismatches

assert num_differences([1,2,3],[2,1,3]) == 2
assert num_differences([1,2],[1,2]) == 0

importing Jupyter notebook from Chapter_04.ipynb


In [4]:
from typing import List
from Chapter_04 import vector_mean

def cluster_means(k: int,
                  inputs: List[Vector],
                  assignments: List[int]) -> List[Vector]:
  """Compute one representative vector for each of the k clusters.

  Args:
    k: number of clusters.
    inputs: the vectors being clustered.
    assignments: assignments[i] is the cluster index given to inputs[i];
      it is used directly as an index into the k clusters.

  Returns:
    A list of k vectors. Entry i is vector_mean of the inputs assigned to
    cluster i, or a randomly chosen input when cluster i has no members.
  """
  # clusters[i] contains the inputs whose assignment is i
  clusters = [[] for _ in range(k)]
  for point, assignment in zip(inputs, assignments):
    clusters[assignment].append(point)

  # if a cluster is empty, just use a random point
  return [vector_mean(cluster) if cluster else random.choice(inputs)
          for cluster in clusters]

In [5]:
import itertools
import random
import tqdm
from Chapter_04 import squared_distance

class KMeans:
  """k-means clusterer: partitions vectors into k groups around iteratively refined means."""

  def __init__(self, k: int) -> None:
    self.k = k         # number of clusters
    self.means = None  # filled in by train()

  def classify(self, input: Vector) -> int:
    """Return the index of the cluster whose mean is closest to the input."""
    def distance_to_mean(index: int):
      return squared_distance(input, self.means[index])

    return min(range(self.k), key=distance_to_mean)

  def train(self, inputs: List[Vector]) -> None:
    """Fit self.means to the inputs.

    Starts from random cluster assignments, then alternates between
    recomputing the cluster means and reassigning every input to its
    closest mean. Stops as soon as a pass changes no assignment. A tqdm
    bar reports how many assignments changed in the latest pass.
    """
    # Begin with a random cluster index for every input
    assignments = [random.randrange(self.k) for _ in inputs]

    with tqdm.tqdm(itertools.count()) as progress:
      for _ in progress:
        # Means for the current assignments, then the assignments they imply
        self.means = cluster_means(self.k, inputs, assignments)
        reassigned = [self.classify(point) for point in inputs]

        # Converged once a full pass leaves every assignment untouched
        num_changed = num_differences(assignments, reassigned)
        if num_changed == 0:
          return

        # Not converged: adopt the new assignments and refresh the means
        assignments = reassigned
        self.means = cluster_means(self.k, inputs, assignments)
        progress.set_description(f"changed: {num_changed} / {len(inputs)}")