# Intro

- [Choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html) (scikit-learn)



# Data

In [127]:
# Data samples, organized as: [classification, consistency, temperature, [ingredients]].
foods = [
  [1, 'crunchy', 'normal', ['anise', 'baking powder', 'egg', 'flour', 'oil', 'sugar']], # Biscotti
  [1, 'crunchy', 'cool', ['almond', 'barley malt', 'milk', 'rice', 'salt', 'sugar', 'wheat bran', 'whole grain wheat']], # Vanilla almond Special K
  [1, 'crunchy', 'normal', ['almond', 'apple', 'arugula', 'cranberry', 'oil', 'salt']], # Arugula salad
  [1, 'chewy', 'cool', ['cookies', 'milk', 'skim milk', 'sugar', 'vanilla']], # Oreo ice cream
  [1, 'mashed', 'warm', ['butter', 'cheese', 'coriander', 'black pepper', 'garlic', 'potato', 'salt']], # Mash potatoes
  [0, 'mashed', 'warm', ['bay leaf', 'chickpea', 'chili', 'coriander', 'cumin', 'garlic', 'ginger', 'oil', 'onion', 'salt', 'tomato', 'tumeric']], # Chole chickpea curry
  [1, 'mashed', 'warm', ['bean', 'black pepper', 'carrot', 'garlic', 'onion', 'potato', 'salt', 'tomato', 'tumeric']], # Chili

  # Smoothies & milkshakes
  [1, 'thick', 'cool', ['banana', 'mango', 'pineapple']],
  [1, 'thick', 'cool', ['blueberry', 'kiwi', 'mango', 'pineapple']],
  [0, 'thick', 'cool', ['blueberry', 'pineapple', 'strawberry']],
  [0, 'thick', 'cool', ['blueberry', 'kiwi', 'strawberry']],
  [1, 'thick', 'cool', ['banana', 'milk', 'strawberry', 'vanilla']],
  [1, 'thick', 'cool', ['banana', 'milk', 'kiwi', 'vanilla']],
  [0, 'thick', 'cool', ['chocolate', 'milk', 'vanilla']],
  [0, 'thick', 'cool', ['banana', 'chocolate', 'milk', 'vanilla']],
  [0, 'thick', 'cool', ['milk', 'strawberry', 'vanilla']],

  # Pizzas
  [0, 'solid', 'warm', ['bell pepper', 'cheese', 'flour', 'garlic', 'oil', 'olive', 'onion', 'salt', 'sugar', 'tomato']], # Olives
  [0, 'solid', 'warm', ['anchovy', 'bell pepper', 'cheese', 'flour', 'garlic', 'mayonnaise', 'oil', 'olive', 'onion', 'salt', 'spinach', 'sugar']], # Anchovies
  [1, 'solid', 'warm', ['arugula', 'bell pepper', 'cheese', 'coriander', 'flour', 'garlic', 'oil', 'onion', 'salt', 'spinach', 'sugar', 'tomato']], # Arugula
  [1, 'solid', 'warm', ['bell pepper', 'cheese', 'flour', 'garlic', 'mushroom', 'oil', 'onion', 'salt', 'sugar']], # Mushroom
  [1, 'solid', 'warm', ['cheese', 'flour', 'garlic', 'oil', 'onion', 'salt', 'sugar', 'tomato']], # Cheese

  # Omelets
  [1, 'solid', 'warm', ['black pepper', 'cheese', 'egg', 'garlic', 'mushroom', 'oil', 'onion', 'red pepper', 'salt', 'spinach']], # Mushroom spinach omelet
  [1, 'solid', 'warm', ['bean', 'black pepper', 'oil', 'onion', 'rice', 'salt', 'tumeric']], # Rice and beans
  [1, 'solid', 'warm', ['black pepper', 'mushroom', 'oil', 'onion', 'pea', 'rice', 'salt']], # Riz djondjon
  [1, 'solid', 'cool', ['banana', 'butter', 'egg', 'flour', 'milk', 'salt', 'strawberry']], # Crepes
  [0, 'solid', 'warm', ['black pepper', 'oil', 'okra', 'onion', 'rice', 'salt', 'spinach']], # Kalalou

  # Soups
  [1, 'liquid', 'warm', ['black pepper', 'carrot', 'garlic', 'onion', 'potato', 'salt', 'squash']], # Butternut squash soup
  [0, 'liquid', 'warm', ['black pepper', 'butter', 'flour', 'garlic', 'onion', 'peanut', 'salt']], # Peanut soup
]

print('Size of data set:', len(foods))

Size of data set: 28


In [128]:
# Utils.
def get_descriptions(samples):
  """Retrieves food descriptions from a data set."""

  return list(map(lambda sample: sample[1:3], foods))

def get_ingredients(samples):
  """Retrieves ingredients from a data set."""

  return list(map(lambda sample: sample[3], foods))

def encode_samples(samples):
  """Encodes food samples to use as inputs to a model."""
  sample_descriptions = list(map(lambda sample: sample[1:3], samples))
  sample_ingredients = list(map(lambda sample: sample[3], samples))
  encoded_data_set = []
  encoded_descriptions = food_descriptions_encoder\
    .transform(sample_descriptions)\
    .toarray()
  encoded_ingredients = ingredients_encoder.transform(sample_ingredients)

  for i in range(len(samples)):
    encoded_data_set.append(np.concatenate([encoded_descriptions[i], encoded_ingredients[i]]))
  
  return encoded_data_set

In [129]:
# Encoders.
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder

food_descriptions_encoder = OneHotEncoder()
food_descriptions_encoder.fit(get_descriptions(foods))
ingredients_encoder = MultiLabelBinarizer()
ingredients_encoder.fit(get_ingredients(foods))

print(f'Food descriptions:', food_descriptions_encoder.categories_)
print(f'Ingredients ({len(ingredients_encoder.classes_)}):', ingredients_encoder.classes_)

Food descriptions: [array(['chewy', 'crunchy', 'liquid', 'mashed', 'solid', 'thick'],
      dtype=object), array(['cool', 'normal', 'warm'], dtype=object)]
Ingredients (53): ['almond' 'anchovy' 'anise' 'apple' 'arugula' 'baking powder' 'banana'
 'barley malt' 'bay leaf' 'bean' 'bell pepper' 'black pepper' 'blueberry'
 'butter' 'carrot' 'cheese' 'chickpea' 'chili' 'chocolate' 'cookies'
 'coriander' 'cranberry' 'cumin' 'egg' 'flour' 'garlic' 'ginger' 'kiwi'
 'mango' 'mayonnaise' 'milk' 'mushroom' 'oil' 'okra' 'olive' 'onion' 'pea'
 'peanut' 'pineapple' 'potato' 'red pepper' 'rice' 'salt' 'skim milk'
 'spinach' 'squash' 'strawberry' 'sugar' 'tomato' 'tumeric' 'vanilla'
 'wheat bran' 'whole grain wheat']


In [130]:
# Encode, randomize and split data set.
from sklearn.model_selection import train_test_split

X = encode_samples(foods)
y = list(map(lambda sample: sample[0], foods))
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test)

print('Training set size:', len(y_train))
print('Testing set size:', len(y_test))
print('Validation set size:', len(y_validate))

Training set size: 21
Testing set size: 5
Validation set size: 2


# [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html)

Scikit-learn supports three nearest neighbours algorithms:
- [Brute-force](https://scikit-learn.org/stable/modules/neighbors.html#brute-force).
- [K-D Tree](https://scikit-learn.org/stable/modules/neighbors.html#k-d-tree): best for low dimensional data (D < 20).
- [Ball Tree](https://scikit-learn.org/stable/modules/neighbors.html#ball-tree): better for high dimensional data.

## Summary

- Higher K is better.
- No difference between distance and uniform weights.

In [131]:
from sklearn.neighbors import KNeighborsClassifier as KNN

knn_distance_3 = KNN(n_neighbors=3, weights='distance').fit(X_train, y_train)
print('Distance KNN (K=3) score:', knn_distance_3.score(X_test, y_test))

knn_distance_15 = KNN(n_neighbors=15, weights='distance').fit(X_train, y_train)
print('Distance KNN (K=15) score:', knn_distance_15.score(X_test, y_test))

knn_uniform_3 = KNN(n_neighbors=3, weights='uniform').fit(X_train, y_train)
print('Uniform KNN (K=3) score:', knn_uniform_3.score(X_test, y_test))

knn_uniform_15 = KNN(n_neighbors=15, weights='uniform').fit(X_train, y_train)
print('Uniform KNN (K=15) score:', knn_uniform_15.score(X_test, y_test))

Distance KNN (K=3) score: 0.6
Distance KNN (K=15) score: 0.8
Uniform KNN (K=3) score: 0.6
Uniform KNN (K=15) score: 0.8


# [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)

# [Stochastic Gradient Descent](https://scikit-learn.org/stable/modules/sgd.html)

Requires a lot of data (>100k)

# [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html)

## Summary

- Higher polynomial is better.
- Lower C is better for RBF.
- Linear doesn't work very well.

In [132]:
# SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

svc_linear = SVC(kernel='linear').fit(X_train, y_train)
print('Linear SVC score:', svc_linear.score(X_test, y_test))

svc_poly_3 = SVC(kernel='poly', degree=3).fit(X_train, y_train)
print('Polynomial (3) SVC score:', svc_poly_3.score(X_test, y_test))

svc_poly_10 = SVC(kernel='poly', degree=10).fit(X_train, y_train)
print('Polynomial (10) SVC score:', svc_poly_10.score(X_test, y_test))

# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html#rbf-svm-parameters
svc_rbf_c1 = SVC(C=1.0, kernel='rbf').fit(X_train, y_train)
print('Radial basis function (C=1) SVC score:', svc_rbf_c1.score(X_test, y_test))

svc_rbf_c4 = SVC(C=4.0, kernel='rbf').fit(X_train, y_train)
print('Radial basis function (C=4) SVC score:', svc_rbf_c4.score(X_test, y_test))

svc_rbf_c10 = SVC(C=10.0, kernel='rbf').fit(X_train, y_train)
print('Radial basis function (C=10) SVC score:', svc_rbf_c10.score(X_test, y_test))

svc_sigmoid = SVC(kernel='sigmoid').fit(X_train, y_train)
print('Sigmoid SVC score:', svc_sigmoid.score(X_test, y_test))


Linear SVC score: 0.4
Polynomial (3) SVC score: 0.6
Polynomial (10) SVC score: 0.8
Radial basis function (C=1) SVC score: 0.8
Radial basis function (C=4) SVC score: 0.6
Radial basis function (C=10) SVC score: 0.6
Sigmoid SVC score: 0.8
