# Really tho'... how many ingredients could there possibly be???
---
The purpose of this notebook is to act as a workspace to explore and clean datasets.  Consider this a playground which may output resources for use in modeling.

In [5]:
# imports.  you're prolly gonna use all these, right?
import pdb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import re
import string

In [6]:
# # in case you want to run down this rabbit hole...
# import spacy
# nlp = spacy.load("en_core_web_lg")

---
## Importing Data
Pulled from two sources (links in the Notion doc), these are over 60k recipes ingredients.  While only the first includes directions, the others include ingredients which will still serve for ingredient prediction modeling!

In [7]:
df = pd.read_csv('../data/clean_recipes.csv', sep=';')
kgl1_df = pd.read_json('../data/kgl_ingredient_train.json')
kgl2_df = pd.read_json('../data/kgl_ingredient_test.json')

In [8]:
df = df.rename(columns={'Ingredients':'ingredients'})
df.head(3)

Unnamed: 0,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,ingredients,Directions,RecipeID
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003


In [9]:
# df ingredients is a single string, splitting into a list of strings
df['ingredients'] = df['ingredients'].apply(lambda x: x.split(','))

In [10]:
l1 = df['ingredients']
l2 = kgl1_df['ingredients']
l3 = kgl2_df['ingredients']

ingr_series = l1.append([l2, l3], ignore_index=True)

In [11]:
ingr_series[:10]

0    [yeast, water, white sugar, salt, egg, butter,...
1    [flour, salt, baking powder, poppy, butter, ve...
2    [flour, egg, white sugar, vegetable oil, apple...
3    [flour, baking powder, baking soda, salt, cinn...
4    [oat, buttermilk, vegetable oil, egg, brown su...
5    [shortening, white sugar, salt, milk, egg, lem...
6    [shortening, water, brown sugar, yeast, water,...
7    [water, cottage cheese, vegetable oil, egg, fl...
8    [margarine, white sugar, egg, banana, coffee, ...
9    [cornmeal, milk, egg, bell pepper, onion, garl...
Name: ingredients, dtype: object

In [12]:
def scrub_ingredients(ingr_list):
    
    chtble = str.maketrans('', '', string.punctuation + '0123456789')
    
    # drop special characters and multiple spaces, then convert to lowercase
    clean_list = []
    for ingr_string in ingr_list:
        ingr_string = re.sub(r'\W\s+', ' ', ingr_string.strip().translate(chtble))
        clean_list.append(ingr_string.lower())

    # convert to lowercase & drop known measurement words
    dropwords = ['ounce', 'ounces', 'oz', 'lb', ' pound', 'the', 'a', 'an'
                 'package', 'packages', 'inches', 'and', 'for', 'as', 'is']

    outlist = []

    for i in clean_list:
        words = i.split()
        words = [word for word in words if word not in dropwords]
        i = ' '.join(words)
        outlist.append(i)
    
    return outlist

In [13]:
ingr_s = ingr_series.apply(scrub_ingredients)
ingr_s[:10]

0    [yeast, water, white sugar, salt, egg, butter,...
1    [flour, salt, baking powder, poppy, butter, ve...
2    [flour, egg, white sugar, vegetable oil, apple...
3    [flour, baking powder, baking soda, salt, cinn...
4    [oat, buttermilk, vegetable oil, egg, brown su...
5    [shortening, white sugar, salt, milk, egg, lem...
6    [shortening, water, brown sugar, yeast, water,...
7    [water, cottage cheese, vegetable oil, egg, fl...
8    [margarine, white sugar, egg, banana, coffee, ...
9    [cornmeal, milk, egg, bell pepper, onion, garl...
Name: ingredients, dtype: object

In [14]:
def ingredient_count(dfcolumn):
    '''
    Takes a pandas Series of lists (or list of lists) and returns a count of each unique string.
    '''

    # Create dictionary with word count
    d = dict()

    for l in dfcolumn:
        for i in l:
            if i in d:
                d[i] = d[i] + 1
            else:
                d[i] = 1

    return pd.DataFrame(d.items(), columns=['ingredient', 'count'])

In [15]:
# create df of unique ingredients and their number of occurances in recipe dfs
ingr_count_df = ingredient_count(ingr_s).sort_values('count', ascending=False).reset_index(drop=True)
ingr_count_df.head(15)

Unnamed: 0,ingredient,count
0,salt,27050
1,water,11899
2,garlic,11287
3,butter,10534
4,onions,10008
5,olive oil,9889
6,sugar,9319
7,garlic cloves,7772
8,pepper,7080
9,vegetable oil,6874


---
## The long way round...

You can see in the df above, there is overlap with some ingredients such as sugar & white sugar or flour & all purpose flour.  This problem is further extended to things like typos or added brand names.  If we are to train a model with the data we have found, we need to correct for this.  

After spending a week or so trying to be clever (i.e. NLP, TF-IDF, scraping other sites for ingredients, etc.), it became apparent the best approach might just be a woeful amount of legwork.   What follows is that legwork.

In [16]:
## Reset ##

ingr_count_categories = ingr_count_df.copy()
ingr_count_categories = ingr_count_categories[ingr_count_categories['count'] >= 2]
ingr_count_categories['categories'] = np.empty((len(ingr_count_categories), 0)).tolist()

root_ingr_list = []

### gameplan...
create a new column in our df called 'categories'

search through the unique indgredient names and assign a category label to each

create a dictionary from the newly categorized ingredients >> categories

replace all values in the original dataframe with their category label & export for modeling


### variables -
- list of "unique ingredients" still needing assignment
- df with ingredients & their assigned category/categories
- list of defined categories

### steps -
- take the root/essence of the first 5 words in df ingr_count_categories
  - red potatoes >> potato
  - sea salt >> salt
  - boiled cabbage >> cabbage
  - (etc...)
- print a snapshot of the way those words appear in the ingr_list
  - potato >> [potatoes, baked potato, etc....]
- manually identify any outliers that don't fit the root word
  - ...potato chips, ...
- list.append root word to df.category wherever that substring correctly appears in the ingredient list
  - 93 ---- zucchini ---- 1279 ---- []
  - 94 ---- potatoes ---- 1272 ---- \[potato]
  - 95 ---- bay leaf ---- 1264 ---- []
  - ...
  - 3163 ---- vanilla pudding   ---- 1279 ---- []
  - 3164 ---- potato chips      ---- 1272 ---- \[]
  - 3165 ---- halibut steak     ---- 1264 ---- []


In [17]:
# make a list of remaining ingredients to begin categorization process
uncategorized_ingr = ingr_count_categories.loc[(ingr_count_categories['categories'].str.len() == 0), :]
working_ingr_list = list(uncategorized_ingr['ingredient'])
print(len(working_ingr_list), 'items in working_ingr_list')
print('\nSnapshot of df remaining uncategorized')
ingr_count_categories.head()

5531 items in working_ingr_list

Snapshot of df remaining uncategorized


Unnamed: 0,ingredient,count,categories
0,salt,27050,[]
1,water,11899,[]
2,garlic,11287,[]
3,butter,10534,[]
4,onions,10008,[]


---
Defining a few functions that will help along the way

In [77]:
# for those times when you realize avocado doesn't have 3 'a's in it...
def edit_remove_catg(dropword):
    return  ingr_count_categories['categories'].remove(dropword)

def view_progress(i = 10):
    labels = sorted(list(set(ingr_count_categories.loc[(ingr_count_categories['categories'].str.len() > 0), :]['categories'])))
    unlabeled_df = ingr_count_categories.loc[(ingr_count_categories['categories'].str.len() == 0), :]
    unlabeled = list(unlabeled_df['ingredient'])

    total_ingr_count = sum(ingr_count_categories['count'])
    perc_missing_root = sum(unlabeled_df['count'])/total_ingr_count

    print(f'current number of ingredent labels: {len(labels)}')
    print(f'number of ingredients still missing label: {len(unlabeled)} ({perc_missing_root:.3}%)\n\nTop {i} unlabeled ingredients:')
    return (unlabeled[0:i])

In [78]:
view_progress()

current number of ingredent labels: 1
number of ingredients still missing label: 5519 (0.998%)

Top 10 unlabeled ingredients:


['salt',
 'water',
 'garlic',
 'butter',
 'onions',
 'olive oil',
 'sugar',
 'garlic cloves',
 'pepper',
 'vegetable oil']

In [79]:
def find_ingr_indxs(search_word):
    '''
    look for a string in the ingredient column, print index list of those strings with their ingredient, then return array of indexes
    '''
    trimmed_df = ingr_count_categories.loc[(ingr_count_categories['categories'].str.len() == 0), :]
    subject_ingr_df = trimmed_df[trimmed_df['ingredient'].str.contains(search_word)]
    for i in subject_ingr_df.index:
        print(f"{i}: {list(subject_ingr_df.loc[i, ['ingredient']])[0]}")

    search_ingr_indxs = list(subject_ingr_df.index)

    return(search_ingr_indxs)

def apply_label(ingr_label, label_indexes):
    '''
    applies label to indexes provided, first param allows for applying different label than what was searched, mostly used for typos in db
    '''
    ingr_count_categories.loc[label_indexes, ['categories']] = ingr_label
    print('---Label applied!--- \n')
    return ingr_count_categories[ingr_count_categories['categories'] == ingr_label].head(5)

In [81]:
new_search_word = 'paprika'

search_word_indxs = find_ingr_indxs(new_search_word)
drop_indexes = []

56: paprika
276: smoked paprika
566: sweet paprika
1203: hungarian sweet paprika
1442: spanish paprika
1666: hungarian paprika
1724: ground paprika
2418: hot smoked paprika
2624: smoked sweet spanish paprika
4045: hungarian hot paprika
4230: hot spanish paprika
4273: spanish smoked paprika


In [84]:
# identify any indexes that do not belong in that label
drop_indexes = []
label_indexes = list(set(search_word_indxs) - set(drop_indexes))

In [85]:
apply_label(new_search_word, label_indexes)

---Label applied!--- 



Unnamed: 0,ingredient,count,categories
56,paprika,1895,paprika
276,smoked paprika,431,paprika
566,sweet paprika,177,paprika
1203,hungarian sweet paprika,51,paprika
1442,spanish paprika,38,paprika


In [87]:
# # save categorized df, just in case a crash happens!!!
# ingr_count_categories.to_csv('categorized_ingredients.csv', index=False)

...rinse, repeat


-----

In [91]:
### after some time, you have a db that will sub in ingredients, loading that previous work here:
ingr_count_categories = pd.read_csv('./categorized_ingr.csv')

In [92]:
#  get a rough idea what the progress is ---
checking_df = ingr_count_categories.copy()
testing_df = ingr_s.apply(pd.Series)
testing_df = testing_df.rename(columns = lambda x: 'ingr_' + str(x))
testing_df.head(3)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,poppy,butter,vegetable oil,egg,milk,white sugar,vanilla,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,


In [93]:
# create a dictionary pairing original values and their label value
ingr_keys = list(ingr_count_categories['ingredient'])
ingr_vals = list(ingr_count_categories['categories'])

ingr_dict = {ingr_keys[i]: ingr_vals[i] for i in range(len(ingr_keys))} 

In [94]:
# replace values in modeling df with replace vals
#   *** mapping dict vals is only available with pd.Series, stack >> map >> unstack as workaround

testing_sub_df = testing_df.copy()
testing_sub_df = testing_sub_df.stack().map(ingr_dict).unstack()
testing_sub_df = testing_sub_df.stack().apply(lambda y: np.nan if len(y)==0 else y).unstack()
testing_sub_df.head()

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,poppyseed,butter,oil,egg,milk,sugar,vanilla,...,,,,,,,,,,
2,flour,egg,sugar,oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,sugar,oat,[],walnut,...,,,,,,,,,,
4,oat,milk,oil,egg,sugar,flour,baking powder,baking soda,salt,,...,,,,,,,,,,


In [95]:
testing_sub_df.isna().sum().head(25)

ingr_0       260
ingr_1       279
ingr_2       628
ingr_3      1810
ingr_4      4055
ingr_5      7583
ingr_6     12310
ingr_7     17934
ingr_8     23739
ingr_9     29784
ingr_10    35469
ingr_11    40702
ingr_12    45239
ingr_13    49074
ingr_14    52186
ingr_15    54638
ingr_16    56619
ingr_17    58165
ingr_18    59320
ingr_19    60136
ingr_20    60768
ingr_21    61164
ingr_22    61439
ingr_23    61629
ingr_24    61751
dtype: int64

In [97]:
ingr_0_list = testing_df.iloc[testing_sub_df[testing_sub_df['ingr_0'].isna()].index]['ingr_0']

d = dict()

for i in ingr_0_list:
    if i in d:
        d[i] = d[i] + 1
    else:
        d[i] = 1

ingr0_word_df = pd.DataFrame(d.items(), columns=['word', 'count'])
ingr0_word_df = ingr0_word_df.sort_values('count', ascending=False).reset_index(drop=True)
ingr0_wordlist = list(ingr0_word_df['word'])
ingr0_word_df.head(15)

Unnamed: 0,word,count
0,pounds calamari,1
1,absinthe,1
2,fine grind white cornmeal,1
3,reduced sodium condensed cream of chicken soup,1
4,refrigerated seamless crescent dough,1
5,smoked whitefish,1
6,quahog clams,1
7,kraft shredded lowmoisture partskim mozzarella...,1
8,pocket bread,1
9,chambord liqueur,1


In [208]:
(modeling_df.isnull().mean() * 100).head(25)

ingr_0      0.000000
ingr_1      0.069278
ingr_2      0.667000
ingr_3      2.571332
ingr_4      6.173774
ingr_5     11.951216
ingr_6     19.618489
ingr_7     28.648762
ingr_8     38.076979
ingr_9     47.830640
ingr_10    57.033302
ingr_11    65.470686
ingr_12    72.807682
ingr_13    78.997567
ingr_14    84.048398
ingr_15    87.987562
ingr_16    91.190449
ingr_17    93.689281
ingr_18    95.559780
ingr_19    96.872835
ingr_20    97.902334
ingr_21    98.538723
ingr_22    98.980167
ingr_23    99.284667
ingr_24    99.487667
dtype: float64

In [209]:
modeling_df = modeling_df.replace(np.nan, '', regex=True)
modeling_df.head()

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,poppy,butter,vegetable oil,egg,milk,white sugar,vanilla,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,brown sugar,oat,apple,walnut,...,,,,,,,,,,
4,oat,buttermilk,vegetable oil,egg,brown sugar,flour,baking powder,baking soda,salt,,...,,,,,,,,,,


In [210]:
ingr_int = {ingr:i for i, ingr in enumerate(ingr_count_df['ingredient'])}
int_ingr = {i:ingr for i, ingr in enumerate(ingr_count_df['ingredient'])}

In [211]:
# convert ingredients to numeric representations
encoded_df = modeling_df.applymap(ingr_int.get)
print(encoded_df.shape)
encoded_df.head()

(62069, 65)


Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,352,1,10,0,15,3,12,3,4232,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
1,12,0,27,1089,3,9,15,14,10,25,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
2,12,15,10,9,679,226,44,35,27,31,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
3,12,27,35,0,44,131,24,278,253,122,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
4,278,90,9,15,24,12,27,35,0,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232


In [212]:
# drop all recipes that have fewer than 5 ingredients
encoded_df = encoded_df[encoded_df['ingr_5'] < 4232]
encoded_df.reset_index(drop=True)
print(encoded_df.shape)
encoded_df.head()

(54227, 65)


Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,352,1,10,0,15,3,12,3,4232,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
1,12,0,27,1089,3,9,15,14,10,25,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
2,12,15,10,9,679,226,44,35,27,31,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
3,12,27,35,0,44,131,24,278,253,122,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
4,278,90,9,15,24,12,27,35,0,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232


In [213]:
derp_df = encoded_df.iloc[0:5]
derp_df

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,352,1,10,0,15,3,12,3,4232,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
1,12,0,27,1089,3,9,15,14,10,25,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
2,12,15,10,9,679,226,44,35,27,31,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
3,12,27,35,0,44,131,24,278,253,122,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232
4,278,90,9,15,24,12,27,35,0,4232,...,4232,4232,4232,4232,4232,4232,4232,4232,4232,4232


In [214]:
encoded_df = encoded_df[encoded_df['ingr_59'] == 4232]
print(encoded_df.shape)

(54226, 65)


In [215]:
def create_rec_sequences(recipe_row):

    # take length of row & create length 5 lists that can be added to sequences[]
    
    counter = 0

    r_list = []
    next_ingr = []
    
    for counter in range(0, len(recipe_row), 3):
        if recipe_row[counter + 5] == 4232:
            return r_list, next_ingr
        r_list.append(recipe_row[counter:counter+5])
        next_ingr.append(recipe_row[counter + 5])

In [216]:
#  create list of sequences for 5 item lists
#  i.e. sequences[0] = [352, 1, 10, 0, 15]
#       sequences[1] = [1, 10, 0, 15, 3]

# testing grounds!
sequences = []
next_ingr = []

for row in range(0, len(derp_df)):
    derpen = derp_df.iloc[row]

    seq_to_add, next_to_add = create_rec_sequences(derpen)
    
    for seq in seq_to_add:
        sequences.append(seq)
    
    for nxt in next_to_add:
        next_ingr.append(nxt)
        

print(sequences, next_ingr)

[ingr_0    352
ingr_1      1
ingr_2     10
ingr_3      0
ingr_4     15
Name: 0, dtype: int64, ingr_0      12
ingr_1       0
ingr_2      27
ingr_3    1089
ingr_4       3
Name: 1, dtype: int64, ingr_3    1089
ingr_4       3
ingr_5       9
ingr_6      15
ingr_7      14
Name: 1, dtype: int64, ingr_6      15
ingr_7      14
ingr_8      10
ingr_9      25
ingr_10    204
Name: 1, dtype: int64, ingr_9      25
ingr_10    204
ingr_11    146
ingr_12      3
ingr_13    204
Name: 1, dtype: int64, ingr_0     12
ingr_1     15
ingr_2     10
ingr_3      9
ingr_4    679
Name: 2, dtype: int64, ingr_3      9
ingr_4    679
ingr_5    226
ingr_6     44
ingr_7     35
Name: 2, dtype: int64, ingr_0    12
ingr_1    27
ingr_2    35
ingr_3     0
ingr_4    44
Name: 3, dtype: int64, ingr_3      0
ingr_4     44
ingr_5    131
ingr_6     24
ingr_7    278
Name: 3, dtype: int64, ingr_6      24
ingr_7     278
ingr_8     253
ingr_9     122
ingr_10    226
Name: 3, dtype: int64, ingr_0    278
ingr_1     90
ingr_2      9
ingr_3 

In [217]:
sequences = []
next_ingr = []

for row in range(0, len(encoded_df)):
    
    ingr_list = encoded_df.iloc[row]
    seq_to_add, next_to_add = create_rec_sequences(ingr_list)

    for seq in seq_to_add:
        sequences.append(seq)
    for nxt in next_to_add:
        next_ingr.append(nxt)

for i in range(0, 10):
    print(sequences[i], '\n', next_ingr[i])


ingr_0    352
ingr_1      1
ingr_2     10
ingr_3      0
ingr_4     15
Name: 0, dtype: int64 
 3
ingr_0      12
ingr_1       0
ingr_2      27
ingr_3    1089
ingr_4       3
Name: 1, dtype: int64 
 9
ingr_3    1089
ingr_4       3
ingr_5       9
ingr_6      15
ingr_7      14
Name: 1, dtype: int64 
 10
ingr_6      15
ingr_7      14
ingr_8      10
ingr_9      25
ingr_10    204
Name: 1, dtype: int64 
 146
ingr_9      25
ingr_10    204
ingr_11    146
ingr_12      3
ingr_13    204
Name: 1, dtype: int64 
 25
ingr_0     12
ingr_1     15
ingr_2     10
ingr_3      9
ingr_4    679
Name: 2, dtype: int64 
 226
ingr_3      9
ingr_4    679
ingr_5    226
ingr_6     44
ingr_7     35
Name: 2, dtype: int64 
 27
ingr_0    12
ingr_1    27
ingr_2    35
ingr_3     0
ingr_4    44
Name: 3, dtype: int64 
 131
ingr_3      0
ingr_4     44
ingr_5    131
ingr_6     24
ingr_7    278
Name: 3, dtype: int64 
 253
ingr_6      24
ingr_7     278
ingr_8     253
ingr_9     122
ingr_10    226
Name: 3, dtype: int64 
 15


In [218]:
print('\n sequences:', len(sequences), '\n', 'next_ingredients:', len(next_ingr))


 sequences: 129423 
 next_ingredients: 129423


In [219]:
max_pred_length = 10

In [220]:
x = np.zeros((len(sequences), max_pred_length, len(ingr_count_df)), dtype=np.bool)
y = np.zeros((len(sequences), len(ingr_count_df)), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for t, ingr in enumerate(sequence):
        x[i,t,ingr] = 1
    
    y[i, next_ingr[i]] = 1

In [221]:
print(x.shape)
print(y.shape)

(129423, 10, 7631)
(129423, 7631)


In [222]:
model = Sequential()
model.add(LSTM(128, input_shape=(max_pred_length, len(ingr_count_df))))
model.add(Dense(len(ingr_count_df), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='nadam')

In [223]:
model.fit(x, y,
          batch_size=128,
          epochs=5)

Train on 129423 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x192aa7b9888>

In [225]:
import json
with open('ingr_int.json', 'w') as fp:
    json.dump(ingr_int, fp)

with open('int_ingr.json', 'w') as fp:
    json.dump(int_ingr, fp)

In [31]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [33]:
known_ingr = ['peanut butter', 'flour', 'sugar', 'eggs', 'chocolate chips']

pred_next = []

for diversity in [0.2, 0.5, 1.0, 1.2]:

    start_ingr = [ingr_int[x] for x in known_ingr]
    
    for i in range(10):
        x_pred = np.zeros((1, max_pred_length, len(ingr_count_df)))
        for t, ingr in enumerate(start_ingr):
            x_pred[0, t, ingr] = 1

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_ingredient = int_ingr[next_index]

        pred_next.append(next_ingredient)


print(set(pred_next))

{'vanilla extract', 'sugar', 'active dry yeast', 'hot water', 'coconut milk', 'baking powder', 'yellow chives', 'white sugar', 'flour', 'cream cheese', 'water', 'salt', 'milk', 'steak sauce', 'carrot', 'nutmeg', 'unsalted butter', 'lemon juice', 'egg', 'baking soda'}
