In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

In [2]:
import json
# Read the recipe records from the training JSON file into a DataFrame.
recipeRaw = pd.read_json("./whats-cooking/train.json")

# Flatten each recipe's ingredient list into a single space-separated string
# so it can later be tokenised like ordinary text.
recipeRaw["ingredientsFlat"] = recipeRaw["ingredients"].apply(" ".join)

recipeRaw.head()

Unnamed: 0,cuisine,id,ingredients,ingredientsFlat
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,indian,22213,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...


In [3]:
from sklearn import preprocessing
# Fit an encoder that maps every cuisine name to an integer class id.
# fit() returns the encoder itself, so construction and fitting are chained.
le = preprocessing.LabelEncoder().fit(recipeRaw["cuisine"].values)

# Display the learned class names.
list(le.classes_)

['brazilian',
 'british',
 'cajun_creole',
 'chinese',
 'filipino',
 'french',
 'greek',
 'indian',
 'irish',
 'italian',
 'jamaican',
 'japanese',
 'korean',
 'mexican',
 'moroccan',
 'russian',
 'southern_us',
 'spanish',
 'thai',
 'vietnamese']

In [4]:
# Targets: cuisine name -> integer class id -> one-hot float32 row.
labels_enc = le.transform(recipeRaw["cuisine"].values)
labels = tf.keras.utils.to_categorical(labels_enc)

# Inputs: one flattened ingredient string per recipe.
docs = recipeRaw["ingredientsFlat"].values

labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [5]:
# Evaluate the bare function object so the notebook displays its signature
# and default arguments (padding, truncating, value).
tf.keras.preprocessing.sequence.pad_sequences

<function keras_preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)>

In [6]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

# Build the word-level vocabulary from every recipe string.
t = tf.keras.preprocessing.text.Tokenizer()
t.fit_on_texts(docs)

# One slot per known word plus one extra for index 0, the value that
# pad_sequences fills with.
vocab_size = len(t.word_index) + 1
print(vocab_size)

# Replace each recipe string by its sequence of integer word indices.
encoded_docs = t.texts_to_sequences(docs)

# Force every sequence to exactly 40 tokens: shorter ones are zero-padded at
# the end (padding='post'); longer ones are cut using the default
# truncating='pre', i.e. tokens are dropped from the front.
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

3065
39774


In [7]:
def bootstrap_sample_generator(batch_size, features=None, targets=None, rng=None):
    """Yield an endless stream of random mini-batches for Keras training.

    Row indices are drawn independently for every batch via ``choice``, so the
    same row may appear more than once within a batch or across batches.

    Args:
        batch_size: Number of rows drawn for each batch.
        features: Array whose rows are the model inputs. Defaults to the
            notebook-level ``padded_docs``.
        targets: Array whose rows are the matching labels. Defaults to the
            notebook-level ``labels``.
        rng: Optional object exposing ``choice(n, size)`` (for example
            ``np.random.default_rng(seed)``) so that sampling is reproducible.
            Defaults to NumPy's global random state.

    Yields:
        ``({'numeric_inputs': batch_x}, {'output': batch_y})`` where both
        arrays hold ``batch_size`` rows taken at the same random indices.

    Raises:
        ValueError: If ``features`` and ``targets`` have different row counts.
    """
    # Fall back to the notebook globals only when the caller supplied nothing,
    # which keeps the original one-argument call working unchanged.
    if features is None:
        features = padded_docs
    if targets is None:
        targets = labels

    n_rows = features.shape[0]
    if len(targets) != n_rows:
        raise ValueError(
            "features and targets must have the same number of rows")

    sampler = np.random if rng is None else rng
    while True:
        batch_idx = sampler.choice(n_rows, batch_size)
        # The same indices are applied to both arrays to keep rows aligned.
        yield ({'numeric_inputs': features[batch_idx]},
               {'output': targets[batch_idx]})

In [8]:
# Pull a single two-row batch from the generator as a smoke test.
batch = next(bootstrap_sample_generator(2))

# Shape of the input array inside that batch.
batch[0]['numeric_inputs'].shape

(2, 40)

In [9]:
# Dropout rate shared by every Dropout layer in the model below.
p = 0.1

In [10]:
def emb_sz_rule(n_cat):
    """Pick an embedding width for a categorical feature with ``n_cat`` levels.

    Applies the heuristic ``1.6 * n_cat ** 0.56``, rounded to the nearest
    integer and capped at 600.
    """
    upper_bound = 600
    suggested = round(1.6 * n_cat ** 0.56)
    if suggested > upper_bound:
        return upper_bound
    return suggested

In [38]:
# Integer token ids, one padded recipe per row. The width is taken from
# max_length so it cannot drift from the padding applied to padded_docs.
# The layer name matches the 'numeric_inputs' key yielded by
# bootstrap_sample_generator.
cat_inputs = tf.keras.layers.Input(shape=(max_length,), name='numeric_inputs')

In [39]:
# Learn one dense vector per vocabulary index. The vector width comes from
# emb_sz_rule and the sequence length from max_length, the same constant
# used when padding, instead of a repeated literal.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=emb_sz_rule(vocab_size),
    input_length=max_length)

# Token ids -> sequence of embedding vectors.
cat_x = embedding_layer(cat_inputs)

In [40]:
# Summarise the embedded sequence two ways: the mean and the maximum of each
# embedding channel taken over all token positions.
global_ave = tf.keras.layers.GlobalAveragePooling1D()(cat_x)
global_max = tf.keras.layers.GlobalMaxPool1D()(cat_x)

# Place both summaries side by side in a single vector per recipe.
x = tf.keras.layers.Concatenate()(
    [global_ave, global_max]
)

In [41]:
# bonus: let every token position see the sequence-wide pooled summary.
# Copy the pooled vector to each of the max_length positions (the same
# constant used for padding, rather than a repeated literal), then attach it
# to the per-token embeddings.
x = tf.keras.layers.RepeatVector(max_length)(x)
x = tf.keras.layers.Concatenate()([cat_x, x])

x = tf.keras.layers.Dropout(p)(x)
# Kernel size 1: each position is projected to 20 channels independently.
x = tf.keras.layers.Conv1D(20, 1)(x)
x = tf.keras.layers.Activation('relu')(x)

# Pool over positions again and concatenate mean and max into one vector.
global_ave = tf.keras.layers.GlobalAveragePooling1D()(x)
global_max = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.Concatenate()([global_ave, global_max])

In [42]:
# Three identical stages of Dropout -> Dense(relu) -> BatchNormalization,
# with widths 100, 20 and 10.
for units in (100, 20, 10):
    x = tf.keras.layers.Dropout(p)(x)
    x = tf.keras.layers.Dense(units, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)

# Final dropout, then a 20-way softmax. The layer is named 'output' to match
# the key used for the targets in bootstrap_sample_generator.
x = tf.keras.layers.Dropout(p)(x)
out = tf.keras.layers.Dense(20, activation='softmax', name='output')(x)

In [43]:
# Wire the token-id input to the softmax output as one trainable model.
model = tf.keras.Model(inputs=cat_inputs, outputs=out)

# Categorical cross-entropy pairs with the one-hot targets held in `labels`.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [44]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
numeric_inputs (InputLayer)     [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 40, 143)      438295      numeric_inputs[0][0]             
__________________________________________________________________________________________________
global_average_pooling1d_4 (Glo (None, 143)          0           embedding_4[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_4 (GlobalM (None, 143)          0           embedding_4[0][0]                
____________________________________________________________________________________________

In [45]:
batch_size = 16

# Model.fit consumes Python generators directly; fit_generator is deprecated
# in favour of it and takes the same arguments used here.
# The generator never terminates, so steps_per_epoch is what defines an
# "epoch": 10_000 // batch_size batches of freshly sampled rows.
model.fit(
    bootstrap_sample_generator(batch_size),
    steps_per_epoch=10_000 // batch_size,
    epochs=5,
    max_queue_size=10,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x139f90400>

In [None]:
# TODO (future work): time series.

In [None]:
# Open question: numeric inputs and dense layers - how many layers should be used?
# References:
# https://github.com/fastai/fastai/blob/master/fastai/tabular/models.py
# https://github.com/fastai/fastai/blob/master/fastai/layers.py
# https://medium.com/@hiromi_suenaga/deep-learning-2-part-1-lesson-4-2048a26d58aa
#
# https://www.heatonresearch.com/2017/06/01/hidden-layers.html