In [11]:
#####
#INPUTS: 1M recipes json, flavor_DB.json
#OUTPUTS: dev_set.json, validation_set.json 

import json
import numpy as np
import random
from thefuzz import fuzz
from thefuzz import process
from collections import defaultdict
from Levenshtein import distance as lev
from collections import defaultdict
import time

# Load the recipe dataset. A single context-managed open replaces the two
# separate open() calls, neither of which was ever closed.
with open('1M_recipes_with_nutritional_info.json') as f:
    recipe_df = json.load(f)


In [12]:
#Sanity check: for the first few flavor entities, print the readable name,
#the alias basket and how many flavor molecules are listed.
# A single context-managed open replaces the two separate open() calls,
# neither of which was ever closed.
with open('flavor_DB.json') as f:
    flavor_df = json.load(f)

# Slicing (rather than indexing range(10)) tolerates a DB with fewer than 10 entries.
for entity in flavor_df[:10]:
    print("entity: {}".format(entity['entity_alias_readable']))
    print("entity basket: {}".format(entity['entity_alias_basket']))
    print("# of flavor molecules: {}".format(len(entity['molecules'])))

entity: Egg
entity basket: egg, egg-boiled, egg-cooked, egg-scrambled
# of flavor molecules: 55
entity: Bakery Products
entity basket: bakery-products
# of flavor molecules: 6
entity: Bread
entity basket: bread, bread-preferment
# of flavor molecules: 129
entity: Rye Bread
entity basket: bread-rye
# of flavor molecules: 30
entity: Wheaten Bread
entity basket: bread-wheaten
# of flavor molecules: 30
entity: White Bread
entity basket: bread-white
# of flavor molecules: 13
entity: Wholewheat Bread
entity basket: bread-wholewheat
# of flavor molecules: 58
entity: Wort
entity basket: wort
# of flavor molecules: 63
entity: Arrack
entity basket: arrack
# of flavor molecules: 5
entity: Beer
entity basket: beer
# of flavor molecules: 263


In [13]:
#Create the dev set, 1000 out of 50000 recipes
#Create the validation set, pick 100 recipes from the 1000 recipes from dev_set
def splitdataset(recipe_df, dev_set_length=1000, valid_set_length=100):
    """Randomly split ``recipe_df`` into development, validation and training lists.

    * The development set is ``dev_set_length`` entries drawn without replacement.
    * The validation set is ``valid_set_length`` entries drawn without
      replacement from the development picks, so it is a subset of the
      development set.
    * The training set is every entry that was not picked for development.

    Sampling goes through NumPy's global random state; seed it before calling
    to get a reproducible split.

    Returns:
        A ``(dev_set, valid_set, train_set)`` tuple of lists holding the
        selected entries of ``recipe_df``.
    """
    population = np.arange(len(recipe_df))

    # The two draws must happen in this order to keep seeded results stable.
    dev_picks = np.random.choice(population, dev_set_length, replace=False)
    valid_picks = np.random.choice(dev_picks, valid_set_length, replace=False)

    # Everything that is not in the development sample is training data.
    leftover = list(set(population).difference(set(dev_picks)))

    def gather(positions):
        # Look up the recipe stored at each sampled position.
        return [recipe_df[pos] for pos in positions]

    return gather(dev_picks), gather(valid_picks), gather(leftover)
# Pin NumPy's global seed so the random split below is reproducible.
np.random.seed(42)
dev_set, valid_set, train_set = splitdataset(recipe_df)

# Persist the development split.
with open('dev_set.json', 'w') as f:
    json.dump(dev_set, f)

# Report the size of the full dataset and of each split.
for template, entries in (
    ("# of entries in recipe_df: {}", recipe_df),
    ("# of entries in development set: {}", dev_set),
    ("# of entries in validation set: {}", valid_set),
    ("# of entries in training set: {}", train_set),
):
    print(template.format(len(entries)))

# of entries in recipe_df: 51235
# of entries in development set: 1000
# of entries in validation set: 100
# of entries in training set: 50235


In [16]:
#Use basic string matching to help construct the validation set

def shortenstring(text,i):
    """Return the first ``i`` comma-separated fields of ``text``, re-joined with commas."""
    fields = text.split(',')
    leading = fields[:i]
    return ','.join(leading)
def flavormatch(ingredient, flavorlist):
    """Fuzzy-match ``ingredient`` against ``flavorlist``.

    Returns whatever ``process.extract`` yields for the full ingredient string
    when limited to the 3 best candidates.
    """
    return process.extract(ingredient, flavorlist, limit=3)

def flavormatch2(ingredient, flavorlist):
    """Fuzzy-match a shortened form of ``ingredient`` against ``flavorlist``.

    Only the first two comma-separated fields of ``ingredient`` are used as the
    query; the result is what ``process.extract`` yields when limited to the
    6 best candidates.
    """
    query = shortenstring(ingredient, 2)
    return process.extract(query, flavorlist, limit=6)

def levmatch(ingredient,matches,num=2):
    """Rank candidate names by Levenshtein distance to ``ingredient``.

    Args:
        ingredient: The query string.
        matches: Sequence of candidate strings to compare against.
        num: Maximum number of closest candidates to return.

    Returns:
        A list of up to ``num`` distinct candidates, smallest edit distance
        first. Candidates with equal distance keep their first-seen order.
        An empty ``matches`` gives an empty list.
    """
    # Previously the ranking was computed and then discarded (the function
    # returned None) and ``num`` was ignored in favour of a hard-coded 3.
    distances = {}
    for candidate in matches:
        distances[candidate] = lev(ingredient, candidate)
    closest = sorted(distances, key=distances.get)[:num]
    return closest

# Readable name of every flavor entity, in the same order as flavor_df.
flavorlist = [entity['entity_alias_readable'] for entity in flavor_df]

# Annotate every validation recipe with the top fuzzy match for each of its
# ingredients, and measure the CPU time the pass takes.
start = time.process_time()
for recipe in valid_set:
    # Keep only the name of the best-ranked candidate for each ingredient.
    recipe['ingredients_flavormatch'] = [
        {"text": flavormatch2(item['text'], flavorlist)[0][0]}
        for item in recipe['ingredients']
    ]
duration = time.process_time()-start
#print(duration)

In [18]:
#Manual curation - override of incorrect matches
# Hand-written corrections for validation recipes 0-10.
# Each assignment replaces the automatic match stored at
# valid_set[recipe]['ingredients_flavormatch'][ingredient]['text'].
# Both indices are positional, so they only apply to this exact valid_set.
# Trailing comments name the original ingredient; commented-out lines appear
# to mark ingredients the author reviewed but did not override -- TODO confirm.
valid_set[0]['ingredients_flavormatch'][0]['text']='Flour'
#valid_set[0]['ingredients_flavormatch'][3]['text']  #sour cream
valid_set[1]['ingredients_flavormatch'][2]['text']='Bread'  #pretzels
#valid_set[1]['ingredients_flavormatch'][7]['text']  #olive oil

valid_set[2]['ingredients_flavormatch'][4]['text']='Flour'
#valid_set[1]['ingredients_flavormatch'][5]['text']  #Cornmeal
#valid_set[4]['ingredients_flavormatch'][1]['text'] #Worcestershire sauce
valid_set[4]['ingredients_flavormatch'][2]['text']='Capsicum'  #Paprika
valid_set[4]['ingredients_flavormatch'][8]['text']='Walnut'  #walnuts

valid_set[4]['ingredients_flavormatch'][9]['text']='Capsicum'  #red pepper

valid_set[6]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[8]['ingredients_flavormatch'][6]['text']='Soy milk'
valid_set[8]['ingredients_flavormatch'][7]['text']='Apple Sauce'
valid_set[8]['ingredients_flavormatch'][9]['text']='Peanut Butter'
valid_set[8]['ingredients_flavormatch'][11]['text']='Soy milk'
valid_set[8]['ingredients_flavormatch'][12]['text']='Soy milk'


valid_set[9]['ingredients_flavormatch'][5]['text']='Berry'   #Goji Berry
valid_set[10]['ingredients_flavormatch'][0]['text']='Pineapple'   #pineapple juice
# NOTE(review): the next two lines write the same slot ([10][2]), so only
# 'Ketchup' survives. One of the two indices is probably wrong -- confirm
# against the recipe's ingredient list.
valid_set[10]['ingredients_flavormatch'][2]['text']='Soybean Sauce'   #Soy sauce
valid_set[10]['ingredients_flavormatch'][2]['text']='Ketchup'   #Catsup
# valid_set[10]['ingredients_flavormatch'][5]['text']   #Cornstarch
valid_set[10]['ingredients_flavormatch'][6]['text']='Pineapple'   #pineapple juice

# Hand-written corrections for validation recipes 12-35.
# Each assignment replaces the automatic match stored at
# valid_set[recipe]['ingredients_flavormatch'][ingredient]['text'];
# both indices are positional. Trailing comments name the original ingredient.
valid_set[12]['ingredients_flavormatch'][2]['text']='Jalapeno'   #jalapeno, peppers

valid_set[13]['ingredients_flavormatch'][2]['text']='Yellow zucchini'   #summer squash
valid_set[13]['ingredients_flavormatch'][3]['text']='Soup'   #soup, chicken broth or bouillon

valid_set[14]['ingredients_flavormatch'][1]['text']='Leek'   #leeks, (bulb and lower leaf-portion)
valid_set[14]['ingredients_flavormatch'][5]['text']='Soybean Sauce'  # soy sauce made from soy (tamari)

valid_set[15]['ingredients_flavormatch'][0]['text']='Vinegar' #vinegar, red wine

valid_set[16]['ingredients_flavormatch'][0]['text']='Beetroot' #beets, raw

valid_set[17]['ingredients_flavormatch'][2]['text']='Bread' #croutons

valid_set[18]['ingredients_flavormatch'][2]['text']='Pineapple' #pineapple juice
# NOTE(review): the 'pineapple juice' comment on the 'Shortening' line below
# looks copy-pasted from its neighbours -- confirm the actual ingredient.
valid_set[18]['ingredients_flavormatch'][6]['text']='Shortening' #pineapple juice
valid_set[18]['ingredients_flavormatch'][7]['text']='Pineapple' #pineapple juice

valid_set[19]['ingredients_flavormatch'][1]['text']='Pasta' #noodle
valid_set[19]['ingredients_flavormatch'][10]['text']='Capsicum'  #cajun
valid_set[20]['ingredients_flavormatch'][0]['text']='Soy milk'  #soymilk, original and vanilla
valid_set[20]['ingredients_flavormatch'][5]['text']='Flour'
valid_set[21]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[22]['ingredients_flavormatch'][7]['text']='Shortening'  #shortening confectionery, coconut

valid_set[24]['ingredients_flavormatch'][5]['text']='Flour'
valid_set[26]['ingredients_flavormatch'][8]['text']='Shortening' #shortening confectionery, coconut
valid_set[27]['ingredients_flavormatch'][3]['text']='Ketchup' #Catsup
#valid_set[27]['ingredients_flavormatch'][7]['text']  #Worcestershire sauce

#valid_set[29]['ingredients_flavormatch'][0]['text']  #Pickles
valid_set[29]['ingredients_flavormatch'][1]['text']='Onion'  #spring or scallions (includes tops and bulb)
valid_set[29]['ingredients_flavormatch'][3]['text']='Flour'
valid_set[29]['ingredients_flavormatch'][4]['text']='Soup'
#valid_set[29]['ingredients_flavormatch'][6]['text']  #sour cream
valid_set[30]['ingredients_flavormatch'][1]['text']='Buttermilk'
#valid_set[31]['ingredients_flavormatch'][0]['text'] #Pumpkin seeds
valid_set[32]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[32]['ingredients_flavormatch'][2]['text']='Pineapple'
# NOTE(review): 'alcolohic' is spelled this way in the literal below; presumably
# it mirrors the flavor DB's own entity name -- verify before "fixing" it.
valid_set[32]['ingredients_flavormatch'][5]['text']='Beverage alcolohic other' #alcoholic beverage, distilled
valid_set[34]['ingredients_flavormatch'][1]['text']='Hot chocolate' #beverages, ovaltine
valid_set[34]['ingredients_flavormatch'][4]['text']='Flour'
valid_set[35]['ingredients_flavormatch'][1]['text']='Flour'

# Hand-written corrections for validation recipes 36-67.
# Each assignment replaces the automatic match stored at
# valid_set[recipe]['ingredients_flavormatch'][ingredient]['text'];
# both indices are positional. Trailing comments name the original ingredient.
valid_set[36]['ingredients_flavormatch'][0]['text']='Bread'  #pretzels
valid_set[36]['ingredients_flavormatch'][2]['text']='Soybean Sauce'

valid_set[39]['ingredients_flavormatch'][0]['text']='Soybean Sauce'   #fish sauce
#valid_set[39]['ingredients_flavormatch'][1]['text']  #sour cream
valid_set[40]['ingredients_flavormatch'][0]['text']='Flour'

valid_set[41]['ingredients_flavormatch'][4]['text']='Flour'
valid_set[41]['ingredients_flavormatch'][6]['text']='Flour'

# NOTE(review): 'Pecan' here vs 'Pecans' for recipe 64 below -- only one of the
# two spellings can match a flavor DB entity name; confirm which.
valid_set[42]['ingredients_flavormatch'][0]['text']='Pecan'
valid_set[43]['ingredients_flavormatch'][3]['text']='Ice cream'

valid_set[44]['ingredients_flavormatch'][3]['text']='Flour'

# NOTE(review): exact duplicate of the previous assignment; possibly a
# different ingredient index was intended -- confirm.
valid_set[44]['ingredients_flavormatch'][3]['text']='Flour'
valid_set[47]['ingredients_flavormatch'][0]['text']='Buttermilk'
valid_set[48]['ingredients_flavormatch'][1]['text']='Soybean Sauce'
valid_set[48]['ingredients_flavormatch'][3]['text']='Ginger'
valid_set[49]['ingredients_flavormatch'][1]['text']='Peanut Butter'
valid_set[50]['ingredients_flavormatch'][1]['text']='Flour'

valid_set[51]['ingredients_flavormatch'][1]['text']='Syrup'  #syrups, corn
valid_set[55]['ingredients_flavormatch'][4]['text']='Soy milk'
valid_set[56]['ingredients_flavormatch'][3]['text']='Pineapple'
valid_set[56]['ingredients_flavormatch'][7]['text']='Water'   #beverages, carbonated
# NOTE(review): the 'beverages, carbonated' comment on the 'Peanut Butter' line
# below looks copy-pasted from the line above -- confirm the actual ingredient.
valid_set[57]['ingredients_flavormatch'][3]['text']='Peanut Butter'   #beverages, carbonated
# NOTE(review): 'alcolohic' is spelled this way in the literal; presumably it
# mirrors the flavor DB's own entity name -- verify before "fixing" it.
valid_set[58]['ingredients_flavormatch'][3]['text']='Beverage alcolohic other' #alcoholic beverage, distilled
valid_set[59]['ingredients_flavormatch'][2]['text']='Soybean Sauce'
valid_set[59]['ingredients_flavormatch'][4]['text']='Onion'
valid_set[60]['ingredients_flavormatch'][0]['text']='Walnut'  #walnuts
valid_set[61]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[62]['ingredients_flavormatch'][1]['text']='Syrup'  #syrups, corn
valid_set[62]['ingredients_flavormatch'][2]['text']='Peanut Butter'

valid_set[63]['ingredients_flavormatch'][3]['text']='Beverage alcolohic other'
valid_set[63]['ingredients_flavormatch'][7]['text']='Popcorn'

valid_set[64]['ingredients_flavormatch'][1]['text']='Pecans'
valid_set[64]['ingredients_flavormatch'][2]['text']='Pistachio'
valid_set[65]['ingredients_flavormatch'][0]['text']='Beef'   #beef, grass-fed
valid_set[65]['ingredients_flavormatch'][5]['text']='Ketchup' #Catsup

valid_set[66]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[67]['ingredients_flavormatch'][0]['text']='Whey' #beverages, protein powder whey based
valid_set[67]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[67]['ingredients_flavormatch'][3]['text']='Butter'  #lard

# Hand-written corrections for validation recipes 69-98.
# Each assignment replaces the automatic match stored at
# valid_set[recipe]['ingredients_flavormatch'][ingredient]['text'];
# both indices are positional. Trailing comments name the original ingredient.
valid_set[69]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[69]['ingredients_flavormatch'][2]['text']='Flour'

valid_set[70]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[70]['ingredients_flavormatch'][9]['text']='Flour'
valid_set[70]['ingredients_flavormatch'][12]['text']='Capsicum'  #red pepper

valid_set[71]['ingredients_flavormatch'][4]['text']='Buttermilk'
valid_set[72]['ingredients_flavormatch'][1]['text']='Grapefruit'

valid_set[75]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[77]['ingredients_flavormatch'][1]['text']='Ginger'
valid_set[77]['ingredients_flavormatch'][3]['text']='Soybean Sauce'  #hoisin sauce
valid_set[77]['ingredients_flavormatch'][5]['text']='Ketchup'
valid_set[78]['ingredients_flavormatch'][1]['text']='Shortening' #shortening confectionery, coconut
valid_set[78]['ingredients_flavormatch'][7]['text']='Ginger'

#valid_set[79]['ingredients_flavormatch'][0]['text'] #sour cream
valid_set[79]['ingredients_flavormatch'][2]['text']='Buttermilk'
valid_set[79]['ingredients_flavormatch'][6]['text']='Flour'

valid_set[81]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[83]['ingredients_flavormatch'][3]['text']='Capsicum'  #Paprika
valid_set[85]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[86]['ingredients_flavormatch'][4]['text']='Yellow zucchini'   #summer squash
#valid_set[86]['ingredients_flavormatch'][8]['text']  #Worcestershire sauce
valid_set[90]['ingredients_flavormatch'][0]['text']='Flour'  #house of pasta, pizza dough
valid_set[91]['ingredients_flavormatch'][0]['text']='Flour'

valid_set[93]['ingredients_flavormatch'][0]['text']='Flour'
# NOTE(review): 'Pecans' here and for recipe 95, but 'Pecan' is used for
# recipe 42 earlier in this cell -- confirm which spelling the flavor DB uses.
valid_set[93]['ingredients_flavormatch'][9]['text']='Pecans'
valid_set[94]['ingredients_flavormatch'][6]['text']='Apple Sauce'
valid_set[95]['ingredients_flavormatch'][6]['text']='Pecans'
valid_set[96]['ingredients_flavormatch'][2]['text']='Cranberry'
valid_set[97]['ingredients_flavormatch'][6]['text']='Pineapple'
valid_set[97]['ingredients_flavormatch'][7]['text']='Breakfast cereal' #cereals ready-to-eat, granola

valid_set[98]['ingredients_flavormatch'][0]['text']='Flour'
valid_set[98]['ingredients_flavormatch'][1]['text']='Flour'
valid_set[98]['ingredients_flavormatch'][2]['text']='Soybean Sauce'
valid_set[98]['ingredients_flavormatch'][4]['text']='Hot chocolate' #beverages, ovaltine

# Persist the curated validation split. This previously dumped dev_set, which
# put the whole development set into validation_set.json instead of the
# recipes in valid_set.
with open('validation_set.json', 'w') as f:
    json.dump(valid_set, f)