In [338]:
import numpy as np
np.set_printoptions(threshold=np.nan)
import keras
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPool1D, Dropout, Dense
from keras.utils import np_utils, to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from gensim.models.word2vec import Word2Vec

import pandas as pd
%matplotlib inline

In [352]:
# load data
data = pd.read_json('~/Desktop/datasets/recipes/train.json')
print(data.head())

unique_ingredients = {}
max_ingredients = 0

# join multi word ingredients, easier to encode later
for i, row in data.iterrows():
    data.at[i, 'ingredients'] = [x.replace(" ", "_") for x in row['ingredients']]
    data.at[i, 'total_ingredients'] = len(row['ingredients'])
    
    if len(row['ingredients']) > max_ingredients:
        max_ingredients = len(row['ingredients'])
    
    for x in row['ingredients']:
        unique_ingredients[x.replace(" ", "_")] = 1

       cuisine     id                                        ingredients
0        greek  10259  [romaine lettuce, black olives, grape tomatoes...
1  southern_us  25693  [plain flour, ground pepper, salt, tomatoes, g...
2     filipino  20130  [eggs, pepper, salt, mayonaise, cooking oil, g...
3       indian  22213                [water, vegetable oil, wheat, salt]
4       indian  13162  [black pepper, shallots, cornflour, cayenne pe...


In [341]:
def get_one_hot(data):
    encoder = LabelEncoder()
    int_encoded = encoder.fit_transform(list(data))
    int_encoded = int_encoded.reshape(len(int_encoded), 1)
    onehot_encoded = OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoded.fit_transform(int_encoded)
    return onehot_encoded

y_train = get_one_hot(data['cuisine'])

words = [[x] for x in list(unique_ingredients.keys())]
model = Word2Vec(words, min_count=1)
data['ingredients'] = data['ingredients'].astype(object)

data['ingredient_vec'] = pd.Series([np.zeros(64)], index=data.index)

# convert ingredients into a single 1d vector by adding word vectors for each ingredient
for i, row in data.iterrows():
    wordvec = np.zeros(100)
    for ing in row['ingredients']:
        wordvec += model.wv[ing]
    data.at[i, 'ingredient_vec'] = wordvec

In [342]:
data.head()

Unnamed: 0,cuisine,id,ingredients,total_ingredients,ingredient_vec,ingredient
0,greek,10259,"[romaine_lettuce, black_olives, grape_tomatoes...",9.0,"[-0.012676712649408728, -0.0064097283175215125...",
1,southern_us,25693,"[plain_flour, ground_pepper, salt, tomatoes, g...",11.0,"[-0.00018075754633173347, 0.008255297230789438...",
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking_oil, g...",12.0,"[-0.006607251561945304, 0.008808402111753821, ...",
3,indian,22213,"[water, vegetable_oil, wheat, salt]",4.0,"[-0.006275213891058229, 0.0011546076857484877,...",
4,indian,13162,"[black_pepper, shallots, cornflour, cayenne_pe...",20.0,"[-0.007589720888063312, -0.020675597013905644,...",


In [348]:
model = Sequential()
x_train = data['ingredient_vec']
input_dim = len(x_train[0])

# input
model.add(Dense(512, input_dim=input_dim, activation='relu'))
model.add(Dense(512, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))

# output
model.add(Dense(len(y_train[0]), activation='sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 512)               51712     
_________________________________________________________________
dense_40 (Dense)             (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 20)                10260     
Total params: 324,628
Trainable params: 324,628
Non-trainable params: 0
_________________________________________________________________


In [349]:
x_train.shape

(39774,)

In [353]:
model.fit(
    np.array(list(x_train)), 
    y_train, 
    validation_split=0.1,
    shuffle=True,
    epochs=10
)

Train on 35796 samples, validate on 3978 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x170e636d8>