# Intro

- [Choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html) (scikit-learn)



# Data

In [79]:
# Data samples, organized as: [classification, consistency, temperature, [ingredients]].
foods = [
  [1, 'crunchy', 'normal', ['anise', 'baking powder', 'egg', 'flour', 'oil', 'sugar']], # Biscotti
  [1, 'crunchy', 'cool', ['almond', 'barley malt', 'milk', 'rice', 'salt', 'sugar', 'wheat bran', 'whole grain wheat']], # Vanilla almond Special K
  [1, 'crunchy', 'normal', ['almond', 'apple', 'arugula', 'cranberry', 'oil', 'salt']], # Salad
  [1, 'chewy', 'cool', ['cookies', 'milk', 'skim milk', 'sugar', 'vanilla']], # Oreo ice cream
  [1, 'mashed', 'warm', ['butter', 'cheese', 'coriander', 'black pepper', 'garlic', 'potato', 'salt']], # Mash potatoes
  [0, 'mashed', 'warm', ['bay leaf', 'chickpea', 'chili', 'coriander', 'cumin', 'garlic', 'ginger', 'oil', 'onion', 'salt', 'tomato', 'tumeric']], # Chole chickpea curry
  [1, 'solid', 'cool', ['banana', 'butter', 'egg', 'flour', 'milk', 'salt', 'strawberry']], # Crepes
  [1, 'thick', 'cool', ['banana', 'mango', 'pineapple']], # Banana/mango moothie
  [0, 'thick', 'cool', ['blueberry', 'pineapple', 'strawberry']], # Berry smoothie
  [1, 'thick', 'cool', ['banana', 'milk', 'strawberry', 'vanilla']], # Banana & strawberry milkshake
  [0, 'thick', 'cool', ['chocolate', 'milk', 'vanilla']], # Chocolate milkshake
  [0, 'solid', 'warm', ['bell pepper', 'cheese', 'garlic', 'oil', 'olive', 'onion', 'salt', 'sugar', 'tomato']], # Olive pizza
  [0, 'solid', 'warm', ['anchovy', 'bell pepper', 'cheese', 'garlic', 'mayonnaise', 'oil', 'olive', 'onion', 'salt', 'spinach', 'sugar']], # Anchovies pizza
  [1, 'solid', 'warm', ['arugula', 'bell pepper', 'cheese', 'coriander', 'garlic', 'oil', 'onion', 'salt', 'spinach', 'sugar', 'tomato']], # Arugula pizza
  [1, 'solid', 'warm', ['black pepper', 'cheese', 'egg', 'garlic', 'mushroom', 'oil', 'onion', 'red pepper', 'salt', 'spinach']], # Mushroom spinach omelet
]

print('Size of data set:', len(foods))

Size of data set: 14


In [80]:
# Utils.
def get_descriptions(samples):
  """Retrieves food descriptions from a data set."""

  return list(map(lambda sample: sample[1:3], foods))

def get_ingredients(samples):
  """Retrieves ingredients from a data set."""

  return list(map(lambda sample: sample[3], foods))

def encode_samples(samples):
  """Encodes food samples to use as inputs to a model."""
  sample_descriptions = list(map(lambda sample: sample[1:3], samples))
  sample_ingredients = list(map(lambda sample: sample[3], samples))
  encoded_data_set = []
  encoded_descriptions = food_descriptions_encoder\
    .transform(sample_descriptions)\
    .toarray()
  encoded_ingredients = ingredients_encoder.transform(sample_ingredients)

  for i in range(len(samples)):
    encoded_data_set.append(np.concatenate([encoded_descriptions[i], encoded_ingredients[i]]))
  
  return encoded_data_set

In [81]:
# Encoders.
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder

food_descriptions_encoder = OneHotEncoder()
food_descriptions_encoder.fit(get_descriptions(foods))
ingredients_encoder = MultiLabelBinarizer()
ingredients_encoder.fit(get_ingredients(foods))

print('Food descriptions:', food_descriptions_encoder.categories_)
print('Ingredients:', ingredients_encoder.classes_)

Food descriptions: [array(['chewy', 'crunchy', 'mashed', 'solid', 'thick'], dtype=object), array(['cool', 'normal', 'warm'], dtype=object)]
Ingredients: ['almond' 'anchovy' 'anise' 'apple' 'arugula' 'baking powder' 'banana'
 'barley malt' 'bay leaf' 'bell pepper' 'black pepper' 'blueberry'
 'butter' 'cheese' 'chickpea' 'chili' 'chocolate' 'cookies' 'coriander'
 'cranberry' 'cumin' 'egg' 'flour' 'garlic' 'ginger' 'mango' 'mayonnaise'
 'milk' 'oil' 'olive' 'onion' 'pineapple' 'potato' 'rice' 'salt'
 'skim milk' 'spinach' 'strawberry' 'sugar' 'tomato' 'tumeric' 'vanilla'
 'wheat bran' 'whole grain wheat']


In [82]:
# Encode, randomize and split data set.
from sklearn.model_selection import train_test_split

X = encode_samples(foods)
y = list(map(lambda sample: sample[0], foods))
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test)

print('Training set size:', len(y_train))
print('Testing set size:', len(y_test))
print('Validation set size:', len(y_validate))

Training set size: 10
Testing set size: 3
Validation set size: 1


# [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html)

Scikit-learn supports three nearest neighbours algorithms:
- [Brute-force](https://scikit-learn.org/stable/modules/neighbors.html#brute-force).
- [K-D Tree](https://scikit-learn.org/stable/modules/neighbors.html#k-d-tree): best for low dimensional data (D < 20).
- [Ball Tree](https://scikit-learn.org/stable/modules/neighbors.html#ball-tree): better for high dimensional data.


In [None]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(training_set)
distances, indices = nbrs.kneighbors(training_set)

print('Distances:')
distances

print('Indices:')
indices

Distances:
Indices:


array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])

# [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)

# [Stochastic Gradient Descent](https://scikit-learn.org/stable/modules/sgd.html)

Requires a lot of data (>100k)

# [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html)

In [83]:
# SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn import svm

svc_linear = svm.SVC(kernel='linear')
svc_linear.fit(X_train, y_train)
print('Linear SVC score:', svc_linear.score(X_test, y_test))

svc_poly_3 = svm.SVC(kernel='poly', degree=3)
svc_poly_3.fit(X_train, y_train)
print('Polynomial (3) SVC score:', svc_poly_3.score(X_test, y_test))

svc_poly_10 = svm.SVC(kernel='poly', degree=10)
svc_poly_10.fit(X_train, y_train)
print('Polynomial (10) SVC score:', svc_poly_10.score(X_test, y_test))

# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html#rbf-svm-parameters
svc_rbf = svm.SVC(kernel='rbf')
svc_rbf.fit(X_train, y_train)
print('Radial basis function SVC score:', svc_rbf.score(X_test, y_test))

svc_sigmoid = svm.SVC(kernel='sigmoid')
svc_sigmoid.fit(X_train, y_train)
print('Sigmoid SVC score:', svc_sigmoid.score(X_test, y_test))


Linear SVC score: 0.6666666666666666
Polynomial (3) SVC score: 0.6666666666666666
Polynomial (10) SVC score: 0.6666666666666666
Radial basis function SVC score: 0.6666666666666666
Sigmoid SVC score: 0.6666666666666666
