In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#### Things to keep in mind when tackling this data:
<br/>
<ul>
    <li><h5>Preprocessing:</h5></li>
    <ul>
        <li>Encode ingredient lists into features</li>
        <li>Possibly one-hot encode each ingredient</li>
    </ul>
</ul>

In [6]:
# Load the training data: one row per recipe, with a "cuisine" label and an
# "ingredients" list.
train = pd.read_json("train.json")
y_train = train["cuisine"]
X_train = train["ingredients"]

# Minimum width of the encoded matrix. The matrix grows beyond this if the
# vocabulary needs more columns, instead of failing with an IndexError.
MIN_FEATURES = 7000

# Pass 1: build the vocabulary. Every distinct ingredient phrase is treated as
# its own token and gets the next free column in first-seen order. Numbering
# starts at 1, so column 0 is never assigned to an ingredient.
word_index = {}
for ingredients in X_train:  # every recipe
    for ingredient in ingredients:  # every ingredient phrase in that recipe
        if ingredient not in word_index:
            word_index[ingredient] = len(word_index) + 1

# Next unused column index (kept under its original name for later cells).
ctr = len(word_index) + 1

# Pass 2: multi-hot encode. The width is known up front now, so the matrix is
# always wide enough for every index in `word_index`.
# NOTE: this is a dense float64 matrix; at 39774 x 7000 it is roughly 2.2 GB.
n_features = max(MIN_FEATURES, ctr)
encoded = np.zeros((len(X_train), n_features))
for row, ingredients in enumerate(X_train):
    for ingredient in ingredients:
        encoded[row, word_index[ingredient]] = 1

In [17]:
# Peek at the first five rows of the encoded ingredient matrix.
encoded[:5]

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Inspect the class balance before encoding. A bare expression in the middle
# of a cell is evaluated and thrown away, so it is displayed explicitly.
cuisine = train["cuisine"]
display(cuisine.value_counts())

# One-hot encode the target. Reading from `train` instead of rebinding
# `y_train` from itself keeps this cell safe to re-run. The dtype is pinned
# because the get_dummies default changed from uint8 to bool in pandas 2.0.
cuisine_onehot = pd.get_dummies(cuisine, dtype=np.uint8)
label = cuisine_onehot.columns  # cuisine name for each one-hot column
y_train = cuisine_onehot.values
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [9]:
# Hold out part of the data for evaluation.
# - random_state makes the split (and everything trained on it) reproducible.
# - stratify keeps the cuisine proportions the same in both parts; the one-hot
#   rows are collapsed back to a class index for that purpose.
# NOTE: this cell rebinds `y_train` to the training part only, so the label
# encoding cell must be re-run before running this cell a second time.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    y_train,
    random_state=SPLIT_SEED,
    stratify=y_train.argmax(axis=1),
)
encoded.shape

(39774, 7000)

In [14]:
# Peek at the first five rows of the training features after the split.
X_train[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Peek at the first five one-hot training labels.
y_train[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]],
      dtype=uint8)

In [15]:
# Peek at the first five rows of the full (unsplit) encoded matrix.
encoded[:5]

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
# Peek at the first five one-hot labels of the held-out part.
y_test[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]],
      dtype=uint8)

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

# Small feed-forward classifier: one hidden ReLU layer of 32 units, dropout of
# 0.4 for regularisation, and a 20-way softmax output (one unit per one-hot
# label column). No input size is declared in this cell.
model = Sequential([
    Dense(32, activation="relu"),
    Dropout(0.4),
    Dense(20, activation="softmax"),
])

# Adam optimizer with its default settings.
adam = Adam()

In [69]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is reported alongside the loss.
model.compile(
    optimizer=adam,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [70]:
# Train for 10 epochs, scoring the held-out part after every epoch.
# validation_data must be a tuple (x_val, y_val): a list is not reliably
# unpacked as (features, labels) across Keras versions.
# The returned History is kept so the per-epoch metrics can be inspected or
# plotted afterwards (history.history).
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Train on 29830 samples, validate on 9944 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11b527080>