# Model exploration

- [Choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)
- [Tuning the hyper-parameters of an estimator using grid search](https://scikit-learn.org/stable/modules/grid_search.html)



# Data

In [1]:
# Data samples, organized as: [classification, consistency, temperature, [ingredients]].
foods = [
  # Pizzas
  [0, 'solid', 'warm', ['bell pepper', 'cheese', 'flour', 'garlic', 'oil', 'olive', 'onion', 'salt', 'sugar', 'tomato']], # Olives
  [0, 'solid', 'warm', ['anchovy', 'bell pepper', 'cheese', 'flour', 'garlic', 'mayonnaise', 'oil', 'olive', 'onion', 'salt', 'spinach', 'sugar']], # Anchovies
  [0, 'solid', 'warm', ['apple', 'coriander', 'cheese', 'flour', 'garlic', 'oil', 'onion', 'salt', 'sugar']], # Apple & goat cheese
  [1, 'solid', 'warm', ['arugula', 'bell pepper', 'cheese', 'coriander', 'flour', 'garlic', 'oil', 'onion', 'salt', 'spinach', 'sugar', 'tomato']], # Arugula
  [1, 'solid', 'warm', ['bell pepper', 'cheese', 'flour', 'garlic', 'mushroom', 'oil', 'onion', 'salt', 'sugar']], # Mushroom
  [1, 'solid', 'warm', ['cheese', 'flour', 'garlic', 'oil', 'onion', 'salt', 'sugar', 'tomato']], # Cheese

  # Products
  # [1, 'crunchy', 'normal', ['anise', 'baking powder', 'egg', 'flour', 'oil', 'sugar']], # Biscotti
  # [1, 'crunchy', 'cool', ['almond', 'barley malt', 'milk', 'rice', 'salt', 'sugar', 'wheat bran', 'whole grain wheat']], # Vanilla almond Special K

  # Smoothies & milkshakes
  [1, 'chewy', 'cool', ['cookies', 'milk', 'skim milk', 'sugar', 'vanilla']], # Oreo ice cream
  [0, 'thick', 'cool', ['blueberry', 'pineapple', 'strawberry']],
  [0, 'thick', 'cool', ['blueberry', 'kiwi', 'strawberry']],
  [0, 'thick', 'cool', ['chocolate', 'milk', 'vanilla']],
  [0, 'thick', 'cool', ['banana', 'chocolate', 'milk', 'vanilla']],
  [0, 'thick', 'cool', ['milk', 'strawberry', 'vanilla']],
  [1, 'thick', 'cool', ['banana', 'mango', 'pineapple']],
  [1, 'thick', 'cool', ['blueberry', 'kiwi', 'mango', 'pineapple']],
  [1, 'thick', 'cool', ['banana', 'milk', 'strawberry', 'vanilla']],
  [1, 'thick', 'cool', ['banana', 'milk', 'kiwi', 'vanilla']],

  # Soups & chilis
  [0, 'mashed', 'warm', ['bay leaf', 'chickpea', 'chili', 'coriander', 'cumin', 'garlic', 'ginger', 'oil', 'onion', 'salt', 'tomato', 'tumeric']], # Chole chickpea curry
  [0, 'mashed', 'warm', ['black pepper', 'chickpea', 'chili', 'coriander', 'cumin', 'garlic', 'ginger', 'oil', 'onion', 'potato', 'salt', 'spinach', 'tomato', 'tumeric']], # Sweet potato curry
  [1, 'mashed', 'warm', ['black pepper', 'butter', 'cheese', 'coriander', 'garlic', 'potato', 'salt']], # Mash potatoes
  [1, 'mashed', 'warm', ['bean', 'black pepper', 'carrot', 'chili', 'garlic', 'onion', 'potato', 'salt', 'tomato', 'tumeric']], # Chili
  [0, 'liquid', 'warm', ['black pepper', 'butter', 'flour', 'garlic', 'onion', 'peanut', 'salt']], # Peanut soup
  [1, 'thick', 'warm', ['black pepper', 'carrot', 'chili', 'garlic', 'onion', 'potato', 'salt', 'squash']], # Butternut squash soup

  # Miscellaneous (stuff made out of ingredients from above)
  [1, 'crunchy', 'normal', ['almond', 'apple', 'arugula', 'cranberry', 'oil', 'salt']], # Arugula salad
  [1, 'solid', 'warm', ['black pepper', 'cheese', 'egg', 'garlic', 'mushroom', 'oil', 'onion', 'red pepper', 'salt', 'spinach']], # Mushroom spinach omelet
  [1, 'solid', 'warm', ['bean', 'black pepper', 'oil', 'onion', 'rice', 'salt', 'tumeric']], # Rice and beans
  [1, 'solid', 'warm', ['black pepper', 'mushroom', 'oil', 'onion', 'pea', 'rice', 'salt']], # Riz djondjon
  [1, 'solid', 'cool', ['banana', 'butter', 'egg', 'flour', 'milk', 'salt', 'strawberry']], # Crepes
  [0, 'solid', 'warm', ['black pepper', 'oil', 'okra', 'onion', 'rice', 'salt', 'spinach']], # Kalalou
]

print('Size of data set:', len(foods))

Size of data set: 28


In [2]:
# General utility functions.
def get_descriptions(samples):
  """Retrieves food descriptions from a data set."""

  return list(map(lambda sample: sample[1:3], foods))

def get_ingredients(samples):
  """Retrieves ingredients from a data set."""

  return list(map(lambda sample: sample[3], foods))

def encode_samples(samples):
  """Encodes food samples to use as inputs to a model."""
  sample_descriptions = list(map(lambda sample: sample[1:3], samples))
  sample_ingredients = list(map(lambda sample: sample[3], samples))
  encoded_data_set = []
  encoded_descriptions = food_descriptions_encoder\
    .transform(sample_descriptions)\
    .toarray()
  encoded_ingredients = ingredients_encoder.transform(sample_ingredients)

  for i in range(len(samples)):
    encoded_data_set.append(np.concatenate([encoded_descriptions[i], encoded_ingredients[i]]))
  
  return encoded_data_set

In [3]:
# Encoders.
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder

food_descriptions_encoder = OneHotEncoder()
food_descriptions_encoder.fit(get_descriptions(foods))
ingredients_encoder = MultiLabelBinarizer()
ingredients_encoder.fit(get_ingredients(foods))

print(f'Food descriptions:', food_descriptions_encoder.categories_)
print(f'Ingredients ({len(ingredients_encoder.classes_)}):', ingredients_encoder.classes_)

Food descriptions: [array(['chewy', 'crunchy', 'liquid', 'mashed', 'solid', 'thick'],
      dtype=object), array(['cool', 'normal', 'warm'], dtype=object)]
Ingredients (48): ['almond' 'anchovy' 'apple' 'arugula' 'banana' 'bay leaf' 'bean'
 'bell pepper' 'black pepper' 'blueberry' 'butter' 'carrot' 'cheese'
 'chickpea' 'chili' 'chocolate' 'cookies' 'coriander' 'cranberry' 'cumin'
 'egg' 'flour' 'garlic' 'ginger' 'kiwi' 'mango' 'mayonnaise' 'milk'
 'mushroom' 'oil' 'okra' 'olive' 'onion' 'pea' 'peanut' 'pineapple'
 'potato' 'red pepper' 'rice' 'salt' 'skim milk' 'spinach' 'squash'
 'strawberry' 'sugar' 'tomato' 'tumeric' 'vanilla']


In [4]:
# Encode, randomize and split data set.
from sklearn.model_selection import train_test_split

X = encode_samples(foods)
y = list(map(lambda sample: sample[0], foods))
X_train, X_test, y_train, y_test = train_test_split(X, y)

print('Training set size:', len(y_train))
print('Testing set size:', len(y_test))

Training set size: 21
Testing set size: 7


In [5]:
# Data-related utility functions.
from sklearn.model_selection import cross_val_score

def train(model_class, highlight=False, **args):
  model = model_class(**args)
  cross_validation = cross_val_score(model, X_train, y_train, cv=4)
  cv_mean_score = round(cross_validation.mean(), 3)
  training_set_score = round(model.fit(X_train, y_train).score(X_test, y_test), 3)

  arg_str = ', '.join([f'{key}={args[key]}' for key in args])
  name = f'{model_class.__name__}({arg_str})'
  output = f'{name.ljust(40, ".")} {cv_mean_score}, {training_set_score}'

  if (highlight):
    output += ' **'

  print(output)

  return None

# [Nearest Neighbours](https://scikit-learn.org/stable/modules/neighbors.html)

Scikit-learn supports three nearest neighbours algorithms:
- [Brute-force](https://scikit-learn.org/stable/modules/neighbors.html#brute-force).
- [K-D Tree](https://scikit-learn.org/stable/modules/neighbors.html#k-d-tree): best for low dimensional data (D < 20).
- [Ball Tree](https://scikit-learn.org/stable/modules/neighbors.html#ball-tree): better for high dimensional data.

## Summary

- Higher K is better.
- Uniform weights might be slightly better than distance weights.
- Top performers: Uniform KNN (K=15)

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

train(KNN, n_neighbors=3, weights='distance')
train(KNN, n_neighbors=15, weights='distance')
train(KNN, n_neighbors=1, weights='uniform')
train(KNN, n_neighbors=3, weights='uniform')
train(KNN, n_neighbors=7, weights='uniform')
train(KNN, n_neighbors=15, weights='uniform', highlight=True)
# train(KNN, n_neighbors=20, weights='uniform')

KNeighborsClassifier(n_neighbors=3, weights=distance) 0.417, 0.286
KNeighborsClassifier(n_neighbors=15, weights=distance) 0.575, 0.286
KNeighborsClassifier(n_neighbors=1, weights=uniform) 0.567, 0.143
KNeighborsClassifier(n_neighbors=3, weights=uniform) 0.417, 0.286
KNeighborsClassifier(n_neighbors=7, weights=uniform) 0.342, 0.429
KNeighborsClassifier(n_neighbors=15, weights=uniform) 0.425, 0.571 **


# [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)

# [Stochastic Gradient Descent](https://scikit-learn.org/stable/modules/sgd.html)

Requires a lot of data (>100k)

# [Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html)

## Summary

- Higher polynomial is better.
- Lower C is better for RBF.
- Linear doesn't work very well.
- Top performers:
  1. SVC(C=10.0, gamma=0.63, kernel=rbf)
  2. SVC(C=4.0, gamma=1.0, kernel=rbf)
  3. SVC(C=10.0, gamma=1.22, kernel=rbf)
  4. SVC(degree=10, kernel=poly)
  5. SVC(kernel=sigmoid)

In [None]:
# SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

# Linear
train(SVC, kernel='linear')

# Polynomial
train(SVC, degree=3, kernel='poly')
train(SVC, degree=10, kernel='poly', highlight=True)
train(SVC, degree=20, kernel='poly')

# Radial basis function
train(SVC, C=0.01, kernel='rbf')
train(SVC, C=0.1, kernel='rbf')
train(SVC, C=0.1, gamma=0.1, kernel='rbf')
train(SVC, C=0.1, gamma=1.0, kernel='rbf')
train(SVC, C=0.1, gamma=10.0, kernel='rbf')
train(SVC, C=1.0, kernel='rbf')
train(SVC, C=4.0, kernel='rbf')
train(SVC, C=4.0, gamma=0.1, kernel='rbf')
train(SVC, C=4.0, gamma=0.11, kernel='rbf', highlight=True)
train(SVC, C=4.0, gamma=0.12, kernel='rbf')
train(SVC, C=4.0, gamma=0.15, kernel='rbf')
train(SVC, C=4.0, gamma=0.2, kernel='rbf')
train(SVC, C=4.0, gamma=0.5, kernel='rbf')
train(SVC, C=4.0, gamma=1.0, kernel='rbf', highlight=True)
train(SVC, C=4.0, gamma=2.0, kernel='rbf')
train(SVC, C=4.0, gamma=3.0, kernel='rbf')
train(SVC, C=4.0, gamma=4.0, kernel='rbf', highlight=True)
train(SVC, C=4.0, gamma=5.0, kernel='rbf')
train(SVC, C=4.0, gamma=10.0, kernel='rbf')
train(SVC, C=5.0, kernel='rbf')
train(SVC, C=8.0, kernel='rbf')
train(SVC, C=10.0, kernel='rbf')
train(SVC, C=10.0, gamma=0.1, kernel='rbf')
train(SVC, C=10.0, gamma=0.63, kernel='rbf', highlight=True)
train(SVC, C=10.0, gamma=1.0, kernel='rbf')
train(SVC, C=10.0, gamma=1.22, kernel='rbf', highlight=True)
train(SVC, C=10.0, gamma=4.0, kernel='rbf', highlight=True)
train(SVC, C=10.0, gamma=10.0, kernel='rbf')

# Sigmoid
train(SVC, kernel='sigmoid', highlight=True)


SVC(kernel=linear)...................... 0.617, 0.143
SVC(degree=3, kernel=poly).............. 0.483, 0.143
SVC(degree=10, kernel=poly)............. 0.433, 0.143 **
SVC(degree=20, kernel=poly)............. 0.475, 0.143
SVC(C=0.01, kernel=rbf)................. 0.525, 0.143
SVC(C=0.1, kernel=rbf).................. 0.525, 0.143
SVC(C=0.1, gamma=0.1, kernel=rbf)....... 0.525, 0.143
SVC(C=0.1, gamma=1.0, kernel=rbf)....... 0.525, 0.143
SVC(C=0.1, gamma=10.0, kernel=rbf)...... 0.475, 0.143
SVC(C=1.0, kernel=rbf).................. 0.475, 0.0
SVC(C=4.0, kernel=rbf).................. 0.617, 0.143
SVC(C=4.0, gamma=0.1, kernel=rbf)....... 0.617, 0.143
SVC(C=4.0, gamma=0.11, kernel=rbf)...... 0.617, 0.143 **
SVC(C=4.0, gamma=0.12, kernel=rbf)...... 0.617, 0.143
SVC(C=4.0, gamma=0.15, kernel=rbf)...... 0.617, 0.286
SVC(C=4.0, gamma=0.2, kernel=rbf)....... 0.617, 0.286
SVC(C=4.0, gamma=0.5, kernel=rbf)....... 0.625, 0.429
SVC(C=4.0, gamma=1.0, kernel=rbf)....... 0.575, 0.0 **
SVC(C=4.0, gamma=2.0, k