<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Only do this once per VM; otherwise you'll get multiple clones and nested directories.
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb

In [0]:
# k-Nearest Neighbors
from typing import List
from collections import Counter

def raw_majority_vote(labels: List[str]) -> str:
  """Return the label that occurs most often in `labels`.

  Ties get no special treatment: the result is simply the first entry
  reported by `Counter.most_common(1)`. An empty `labels` raises IndexError.
  """
  # most_common(1) yields a one-element list of (label, count) pairs;
  # only the label of that single pair is needed.
  return Counter(labels).most_common(1)[0][0]

assert raw_majority_vote(['a','b','c','b']) == 'b'

In [0]:
# The previous function doesn't handle ties. Let's reduce k (drop the farthest label) until we find a unique winner.
def majority_vote(labels: List[str]) -> str:
  """Return the winning label, resolving ties by dropping the farthest votes.

  Assumes that labels are ordered from nearest to farthest. If several
  labels share the highest count, the farthest remaining label is discarded
  and the vote is repeated until exactly one label has the highest count.
  A single remaining label always wins, so the loop terminates.

  Implemented iteratively: the vote counts are adjusted in place instead of
  recursing on a sliced copy, so long runs of ties cannot exhaust the
  recursion limit and the list is never re-counted from scratch.

  Args:
    labels: Non-empty sequence of labels, nearest first.

  Returns:
    The unique most common label among the nearest labels considered.

  Raises:
    IndexError: If `labels` is empty.
  """
  remaining = len(labels)
  if remaining == 0:
    raise IndexError("majority_vote requires at least one label")

  vote_counts = Counter(labels)
  while True:
    winner, winner_count = vote_counts.most_common(1)[0]
    # Labels whose count has dropped to 0 can never equal winner_count
    # (which is at least 1), so stale entries do not affect the tally.
    num_winners = sum(1 for count in vote_counts.values()
                      if count == winner_count)
    if num_winners == 1:
      return winner
    # Tie: withdraw the vote of the farthest label still being considered.
    remaining -= 1
    vote_counts[labels[remaining]] -= 1

# 'a' and 'b' tie with two votes each, so drop the farthest label and vote among the first 4
assert majority_vote(['a','b','c','b','a']) == 'b'

In [0]:
from typing import NamedTuple
import import_ipynb
from Chapter_4 import Vector, distance

class LabeledPoint(NamedTuple):
  """A data point paired with its class label, as consumed by knn_classify."""
  # Coordinates of the observation. Vector is imported from Chapter_4
  # (presumably a list of floats -- TODO confirm against that notebook).
  point: Vector
  # The label attached to this point.
  label: str

def knn_classify(k: int,
                 labeled_points: List[LabeledPoint],
                 new_point: Vector) -> str:
    """Classify `new_point` by a vote among its `k` nearest labeled points.

    The labeled points are ordered by `distance` to `new_point`, the labels
    of the `k` closest are collected nearest-first, and `majority_vote`
    picks the result. If fewer than `k` points are available, all of them
    take part in the vote.

    Args:
        k: Number of neighbors to consult; must be at least 1.
        labeled_points: Candidate neighbors with known labels.
        new_point: The point to classify.

    Returns:
        The label chosen by `majority_vote`.

    Raises:
        ValueError: If `k` is less than 1.
        IndexError: Propagated from `majority_vote` when `labeled_points`
            is empty, since there is then nothing to vote on.
    """
    # A negative k would make the slice below count from the end of the list
    # (silently voting over all but the farthest points), and k == 0 would
    # leave nothing to vote on, so reject both up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Order the labeled points from nearest to farthest
    by_distance = sorted(labeled_points,
                         key=lambda lp: distance(lp.point, new_point))

    # Find the labels for the k closest
    k_nearest_labels = [lp.label for lp in by_distance[:k]]

    # And let them vote
    return majority_vote(k_nearest_labels)

