# Project 3 - What's cooking!

## Setup

### Imports

All the imports go here, to make it easier to find them later.

In [3]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, log_loss

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Reding in the training file

In [4]:
data = pd.read_json('train.json') 

print("An example of what the data looks like:")
print("")
print(data)

print("")
print("Here the number of recepies of each cuisine")
print("")
print(data['cuisine'].value_counts())

An example of what the data looks like:

            cuisine     id                                        ingredients
0             greek  10259  [romaine lettuce, black olives, grape tomatoes...
1       southern_us  25693  [plain flour, ground pepper, salt, tomatoes, g...
2          filipino  20130  [eggs, pepper, salt, mayonaise, cooking oil, g...
3            indian  22213                [water, vegetable oil, wheat, salt]
4            indian  13162  [black pepper, shallots, cornflour, cayenne pe...
5          jamaican   6602  [plain flour, sugar, butter, eggs, fresh ginge...
6           spanish  42779  [olive oil, salt, medium shrimp, pepper, garli...
7           italian   3735  [sugar, pistachio nuts, white almond bark, flo...
8           mexican  16903  [olive oil, purple onion, fresh pineapple, por...
9           italian  12734  [chopped tomatoes, fresh basil, garlic, extra-...
10          italian   5875  [pimentos, sweet pepper, dried oregano, olive ...
11          chinese  45

In [5]:
# The set of different cuisines
cuisines = data.cuisine.unique()

# To find the different ingredients, we need to clean them up a little. 
def clean(string) :
    s = string.replace('-',' ') # read low-fat the same as low fat
    s = string.replace('&', 'and') # read & and and as the same 
    s = re.sub('\((.*?)\)', '', s) # remove everythin g in brackets
    s = re.sub('\d{1,2}\%', '', s) # remove things of the form d% or dd%, where d is a digit
    s = ' '.join(s.split()) # remove extra white spaces
    
    return s

ing_list = data.ingredients.values.tolist()
raw_ingredients = [clean(x) for ing in ing_list for x in ing]

ingredients = sorted(set(raw_ingredients))

print("There are %d different ingredients." % len(ingredients))
print("")
print("")
print("Here is the very long list:")
print("")
print(ingredients)
    

There are 6698 different ingredients.


Here is the very long list:

['2 1/2 to 3 lb. chicken, cut into serving pieces', '7 Up', '8 ounc ziti pasta, cook and drain', 'A Taste of Thai Rice Noodles', 'Accent Seasoning', 'Adobo All Purpose Seasoning', 'Alaskan king crab legs', 'Alexia Waffle Fries', 'Alfredo sauce', 'Amarena cherries', 'Amaretti Cookies', 'American cheese', 'Anaheim chile', 'Angostura bitters', 'Argo Corn Starch', 'Asian chili sauce', 'Asian sweet chili sauce', 'Azteca Flour Tortillas', 'BACARDIÂ® Mixers Margarita Mix', 'BACARDIÂ® Superior', "BREAKSTONE'S Sour Cream", 'Baileys Irish Cream Liqueur', 'Balsamico Bianco', 'Barilla Linguine', 'Barilla Oven-Ready Lasagne', 'Barilla Plus Pasta', 'Bartlett Pear', 'Belgian endive', 'Bengali 5 Spice', 'Bertolli Garlic Alfredo Sauce', 'Bertolli Tomato and Basil Sauce', 'BertolliÂ® Alfredo Sauce', 'BertolliÂ® Arrabbiata Sauce', 'BertolliÂ® Classico Olive Oil', "Best Food's Mayonnaise with Lime Juice", 'Best FoodsÂ® Real Mayonnaise', 

### Recipies as vectors

Here we have functions that take a list of incredients and produces a vector to represent them. 


In [6]:
# build a dictionary that to each ingredient assigns its index
ingredient_index = {}
for i in range(0,len(ingredients)) :
    ingredient_index[ingredients[i]] = i

# the same for cuisines 
cuisine_index = {}
for i in range(0, len(cuisines)) : 
    cuisine_index[cuisines[i]] = i 
    
def ingredients_to_vector(ings) :
    vect = np.zeros(len(ingredients))
    for ing in ings :
        vect[ingredient_index[clean(ing)]] = 1
        
    return vect

def cuisine_to_vector(cus) : 
    vect = np.zeros(20)
    vect[cuisine_index[cus]] = 1
    return vect

vect_list = [ingredients_to_vector(ing) for ing in ing_list]
target_list = [cuisine_to_vector(cus) for cus in data.cuisine.values.tolist()]

print(len(vect_list))
print(len(target_list))

print(vect_list[30064])
print(target_list[30064])


39774
39774
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [7]:
for t in random.sample(target_list, 20) :
    print(t)
    
print(cuisine_index)

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0

## Split data

In [10]:
X = np.c_[vect_list]
Y = np.c_[target_list]

print(X.shape)
print(Y.shape)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2)

print('Shape of x_train: '+ str(x_train.shape))
print('Shape of y_train: '+ str(y_train.shape))
print()
print('Shape of x_test: '+ str(x_test.shape))
print('Shape of y_test: '+ str(y_test.shape))

(39774, 6698)
(39774, 20)
Shape of x_train: (25455, 6698)
Shape of y_train: (25455, 20)

Shape of x_test: (7955, 6698)
Shape of y_test: (7955, 20)


## Alter the target data to output numbers instead of arrays

In [19]:
Y_num = np.zeros((Y.shape[0]))
for i in range(Y.shape[0]):
    Y_num[i] = np.argmax(Y[i])

#print(Y_num)
    
x_train, x_test, y_train, y_test = train_test_split(X, Y_num, test_size = 0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2)

## Random Forests

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=500,
                             verbose=False)
clf.fit(x_train, y_train)

## Predictiong with the trained forest

In [24]:
clf.score(x_test, y_test)
#clf.predict_proba(x_test)[1].shape

0.6897548711502199