In [8]:
import ast
import json

import numpy as np
import pandas as pd
import tiktoken
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the RecipeNLG CSV export into a DataFrame, report how many rows were
# loaded, and preview the first few rows.
df = pd.read_csv('RecipeNLG/RecipeNLG_dataset.csv')
print(f'df size before: {len(df)}')
df.head()

df size before: 2231142


Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
# Drop every recipe whose link points at cookbooks.com or allrecipes.com,
# printing the remaining row count after each filter.
#
# regex=False: the domain is matched as a literal substring; with the default
#   regex matching, each '.' would act as a wildcard for any character.
# na=False: a row with a missing link counts as "no match" and is kept,
#   instead of putting NaN into the mask, which `~` cannot invert.
df = df[~df['link'].str.contains('www.cookbooks.com', regex=False, na=False)]
print(f'df size without cookbooks.com: {len(df)}')
df = df[~df['link'].str.contains('www.allrecipes.com', regex=False, na=False)]
print(f'df size without allrecipes.com: {len(df)}')
df.head()

df size without cookbooks.com: 1334801
df size without allrecipes.com: 1273403


Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
957739,957739,Kale Carrot And Apple Calcium Booster Juice Fo...,"[""2 kale leaves"", ""2 celery ribs"", ""1/3 cup pa...","[""Wash and cut each vegetable and feed through...",www.food.com/recipe/kale-carrot-and-apple-calc...,Gathered,"[""kale leaves"", ""celery"", ""parsley"", ""carrots""..."
957740,957740,Chocolade Pudding - Chocolate Pudding,"[""1 cup milk"", ""1 ounce chocolate"", ""1/2 cup s...","[""Heat milk with chocolate in the top of a dou...",www.food.com/recipe/chocolade-pudding-chocolat...,Gathered,"[""milk"", ""chocolate"", ""sugar"", ""salt"", ""cornst..."
957741,957741,Haluski,"[""1 cabbage, green, core removed and leaves se...","[""Bring a large pot of water to a boil. Add th...",www.food.com/recipe/haluski-407129,Gathered,"[""cabbage"", ""butter"", ""onions"", ""salt"", ""bacon..."
957742,957742,Cajun Style White Bean Soup,"[""8 ounces great northern beans"", ""water, to c...","[""Put rinsed beans, in a 4 quart dutch oven an...",www.food.com/recipe/cajun-style-white-bean-sou...,Gathered,"[""beans"", ""water"", ""water"", ""tomatoes"", ""ham"",..."
957743,957743,Toddler Tamer Tea,"[""1 chamomile tea bag"", ""2 tablespoons honey"",...","[""Place the tea bag in a microwave safe cup."",...",www.food.com/recipe/toddler-tamer-tea-188781,Gathered,"[""honey"", ""water"", ""cold milk""]"


In [5]:
def stringify_recipe(recipe):
    """Serialize one recipe row into a single JSON string.

    Args:
        recipe: Mapping-like row (for example a DataFrame row) providing a
            'title' value plus 'ingredients', 'directions' and 'NER' values,
            each of which is the text of a list literal.

    Returns:
        str: A JSON object with the keys 'ner', 'title', 'ingredients' and
        'directions', emitted in that order.

    Raises:
        ValueError, SyntaxError: If one of the list columns does not contain
            a plain literal.
    """
    title = recipe['title']
    # literal_eval only accepts literals, so a malformed or malicious CSV
    # field raises instead of being executed as code (which eval() would do).
    ingredients = ast.literal_eval(recipe['ingredients'])
    directions = ast.literal_eval(recipe['directions'])
    ner = ast.literal_eval(recipe['NER'])

    return json.dumps({
        'ner': ner,
        'title': title,
        'ingredients': ingredients,
        'directions': directions,
    })

# Run stringify_recipe on every row (axis=1), yielding one JSON string per recipe.
jsonified_recipes = df.apply(stringify_recipe, axis=1)

In [6]:
# Spot-check the first two serialized recipes: show the Python type of each
# entry, then its JSON re-indented for readability, then a blank separator.
for serialized in jsonified_recipes.iloc[:2]:
    print(type(serialized))
    pretty = json.dumps(json.loads(serialized), indent=3)
    print(pretty)
    print()

<class 'str'>
{
   "ner": [
      "kale leaves",
      "celery",
      "parsley",
      "carrots",
      "apple"
   ],
   "title": "Kale Carrot And Apple Calcium Booster Juice For Juicer",
   "ingredients": [
      "2 kale leaves",
      "2 celery ribs",
      "1/3 cup parsley",
      "3 carrots",
      "1 apple, seeds and stem removed"
   ],
   "directions": [
      "Wash and cut each vegetable and feed through the mouth of a juicer one at a time."
   ]
}

<class 'str'>
{
   "ner": [
      "milk",
      "chocolate",
      "sugar",
      "salt",
      "cornstarch",
      "cocoa",
      "butter",
      "vanilla"
   ],
   "title": "Chocolade Pudding - Chocolate Pudding",
   "ingredients": [
      "1 cup milk",
      "1 ounce chocolate",
      "1/2 cup sugar",
      "1/8 teaspoon salt",
      "1 tablespoon cornstarch",
      "2 tablespoons cocoa",
      "1 tablespoon butter (no substitutions)",
      "1/4 teaspoon vanilla"
   ],
   "directions": [
      "Heat milk with chocolate in the to

In [26]:
# Tokenizer used for every recipe string.
enc = tiktoken.get_encoding("cl100k_base")

# Encode each JSON recipe and wrap its token ids in a NumPy array.
encoded_recipes = list(map(lambda text: np.array(enc.encode(text)), jsonified_recipes))

4021


In [27]:
# Token count of the longest encoded recipe; used as the padded row width.
max_recipe_length = max(map(len, encoded_recipes))
print(max_recipe_length)

4021


In [None]:
# Right-pad every token sequence with zeros to a common width so all recipes
# fit in one (n_recipes, max_recipe_length) matrix.
#
# dtype=np.int32: token ids are integers, and np.zeros would otherwise
# allocate float64, doubling the memory of an already very large array.
# NOTE(review): at ~1.27M recipes x 4021 columns this is still roughly 20 GB
#   per array -- consider truncating or bucketing sequences before padding.
# NOTE(review): the pad value 0 is presumably also a valid token id, so
#   padding cannot be told apart from that token -- TODO confirm / add a mask.
X = np.zeros((len(encoded_recipes), max_recipe_length), dtype=np.int32)
for i, recipe in enumerate(encoded_recipes):
    X[i, :len(recipe)] = recipe

# Target matrix with the same shape as X; it is only allocated here.
Y = np.zeros((len(encoded_recipes), max_recipe_length), dtype=np.int32)

In [12]:
# Round-trip check on the first two recipes: print the JSON text, its token
# ids, and the text decoded back from those ids, followed by a blank line.
for recipe in jsonified_recipes[:2]:
    print(recipe)
    encoded_recipe = enc.encode(recipe)
    print(encoded_recipe)
    print(enc.decode(encoded_recipe))
    print()

# for i in range(len(all_text) - seq_length):
#     seq_in = all_text[i:i + seq_length]
#     seq_out = all_text[i + seq_length]
#     dataX.append([char_to_int[char] for char in seq_in])
#     dataY.append(char_to_int[seq_out])
# n_patterns = len(dataX)
# X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(len(chars))
# y = to_categorical(dataY)

{"ner": ["kale leaves", "celery", "parsley", "carrots", "apple"], "title": "Kale Carrot And Apple Calcium Booster Juice For Juicer", "ingredients": ["2 kale leaves", "2 celery ribs", "1/3 cup parsley", "3 carrots", "1 apple, seeds and stem removed"], "directions": ["Wash and cut each vegetable and feed through the mouth of a juicer one at a time."]}
[5018, 1215, 794, 4482, 74, 1604, 11141, 498, 330, 3757, 727, 498, 330, 73480, 3258, 498, 330, 7063, 90329, 498, 330, 23182, 8073, 330, 2150, 794, 330, 42, 1604, 3341, 4744, 1628, 8325, 96104, 75292, 66851, 1789, 22410, 13296, 498, 330, 39220, 794, 4482, 17, 63577, 11141, 498, 330, 17, 70121, 56249, 498, 330, 16, 14, 18, 10747, 88621, 498, 330, 18, 62517, 498, 330, 16, 24149, 11, 19595, 323, 19646, 7108, 8073, 330, 19612, 82, 794, 4482, 54, 1003, 323, 4018, 1855, 36581, 323, 5510, 1555, 279, 11013, 315, 264, 10479, 13296, 832, 520, 264, 892, 1210, 14316]
{"ner": ["kale leaves", "celery", "parsley", "carrots", "apple"], "title": "Kale Carrot

In [None]:
# Build a Sequential model (an LSTM(256) layer feeding a softmax Dense layer),
# compile it with categorical cross-entropy and Adam, print its summary, and
# write it to 'neural_nourishment.keras'.
model = Sequential()
# NOTE(review): X is allocated as a 2-D array in the padding cell, so
# X.shape[2] would raise IndexError -- confirm the intended
# (samples, timesteps, features) layout.
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
# NOTE(review): lowercase `y` is not assigned in any cell visible here (only
# `Y`, plus a commented-out sketch) -- confirm which target array is meant.
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()
# NOTE(review): this save runs before any training cell, so the file holds
# untrained weights -- presumably it should follow fit(); confirm.
model.save('neural_nourishment.keras')

In [None]:
# Hold out 20% of the samples with a fixed seed so the split is repeatable.
# NOTE(review): lowercase `y` is not assigned in any cell visible here --
# confirm which target array is meant.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train for 15 epochs in batches of 128; the held-out split is also used as
# the validation set reported after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    epochs=15,
    batch_size=128,
    validation_data=(X_test, y_test),
)