# Intro

- [Choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html) (scikit-learn)



# Data

In [6]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# A few samples, organized as: [classification, consistency, temperature, [ingredients]].
samples = [
  ['yes', 'crunchy', 'normal', ['anise', 'baking powder', 'egg', 'flour', 'oil', 'sugar']], # Biscotti
  ['yes', 'chewy', 'cool', ['cookies', 'milk', 'skim milk', 'sugar', 'vanilla']], # Oreo ice cream
  ['chewy', '', []],
  ['yes', 'mashed', 'warm', ['butter', 'cheese', 'coriander', 'black pepper', 'garlic', 'potato', 'salt']], # Mash potatoes
  ['no', 'mashed', 'warm', ['bay leaf', 'chickpea', 'chili', 'coriander', 'cumin', 'garlic', 'ginger', 'oil', 'onion', 'salt', 'tomato', 'tumeric']], # Chole chickpea curry
  ['yes', 'solid', 'cool', ['banana', 'butter', 'egg', 'flour', 'milk', 'salt', 'strawberry']], # Crepes
  ['no', 'solid', '', []],
  ['yes', 'thick', 'cool', ['banana', 'mango', 'pineapple']], # Smoothie
  ['yes', 'thick', 'cool', ['banana', 'milk', 'strawberry', 'vanilla']], # Banana & strawberry milkshake
  ['no', 'thick', 'cool', ['chocolate', 'milk', 'vanilla']], # Chocolate milkshake
  ['yes', 'liquid', '', []],
  ['no', 'liquid', '', []],
]

encoder = OneHotEncoder(handle_unknown='ignore')
training_set = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
training_labels = np.array([1, 2, 6, 1, 2, 6])

validation_set = np.array([[-3, -3], [4, 8]])
validation_labels = np.array([9, 32])

# [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html)

Scikit-learn supports three nearest neighbours algorithms:
- [Brute-force](https://scikit-learn.org/stable/modules/neighbors.html#brute-force).
- [K-D Tree](https://scikit-learn.org/stable/modules/neighbors.html#k-d-tree): best for low dimensional data (D < 20).
- [Ball Tree](https://scikit-learn.org/stable/modules/neighbors.html#ball-tree): better for high dimensional data.


In [2]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(training_set)
distances, indices = nbrs.kneighbors(training_set)

print('Distances:')
distances

print('Indices:')
indices

Distances:
Indices:


array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])

# [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)

# [Stochastic Gradient Descent](https://scikit-learn.org/stable/modules/sgd.html)

Requires a lot of data (>100k)

# [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html)

In [4]:
from sklearn import svm

In [8]:
# SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
svc = svm.SVC()
svc.fit(training_set, training_labels)

print('SVC support vectors:', svc.support_vectors_)

svc.predict(validation_set)

SVC support vectors: [[-1. -1.]
 [ 1.  1.]
 [-2. -1.]
 [ 2.  1.]
 [-3. -2.]
 [ 3.  2.]]


array([6, 6])