In [1]:
import pandas as pd
import re
import json
import ast
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from Levenshtein import distance as lev
import time

In [2]:
food_path = "1M_recipes_with_nutritional_info.json"
flavor_path = "flavor_DB.json"

In [3]:
def flatten(path, cols, meta):
    # load data using Python JSON module
    with open(path,'r') as f:
        data = json.loads(f.read())
        
    # Flatten data
    df = pd.json_normalize(
        data, 
        record_path =cols, 
        meta=meta
    )
    return df


In [4]:
## flatten molecules from nested array
flavor_df = flatten(flavor_path, ["molecules"], ["entity_id", "category", "entity_alias_readable"])

In [5]:
## flatten ingredients from nested array
food_df = flatten(food_path, ["ingredients"], ["id", "title", "instructions", "nutr_values_per100g"])
food_df = food_df.iloc[0:200, :] # partition to a small subset for testing

In [6]:
## re-order the column header and rename ingredient
food_df = food_df[["id", "title", "instructions", "nutr_values_per100g", "text"]]
food_df = food_df.rename(columns = {"text": "ingredient"})
food_df.iloc[0:10, :]

Unnamed: 0,id,title,instructions,nutr_values_per100g,ingredient
0,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","yogurt, greek, plain, nonfat"
1,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","strawberries, raw"
2,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","cereals ready-to-eat, granola, homemade"
3,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","sugars, granulated"
4,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","oil, corn, peanut, and olive"
5,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","egg substitute, powder"
6,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","orange juice, raw"
7,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","orange juice, raw"
8,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","leavening agents, baking powder, double-acting..."
9,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","wheat flour, white, all-purpose, unenriched"


In [7]:
flavor_df = flavor_df[["category", "entity_alias_readable", "fooddb_id", "pubchem_id", "common_name", "iupac_name"]]
flavor_df.iloc[0:10, :]

Unnamed: 0,category,entity_alias_readable,fooddb_id,pubchem_id,common_name,iupac_name
0,animalproduct,Egg,FDB002257,6322,L-arginine,(2S)-2-amino-5-(diaminomethylideneamino)pentan...
1,animalproduct,Egg,FDB010993,6736,3-Methylindole,3-methyl-1H-indole
2,animalproduct,Egg,FDB013954,31252,"2,5-Dimethylpyrazine","2,5-dimethylpyrazine"
3,animalproduct,Egg,FDB008174,7909,4-Methyl-2-pentanone,4-methylpentan-2-one
4,animalproduct,Egg,FDB008127,7284,2-Methylbutyraldehyde,2-methylbutanal
5,animalproduct,Egg,FDB012552,7501,Styrene,styrene
6,animalproduct,Egg,FDB008326,9609,Diethyl sulfide,ethylsulfanylethane
7,animalproduct,Egg,FDB012084,12180,Methyl butyrate,methyl butanoate
8,animalproduct,Egg,FDB003349,18827,1-Octen-3-Ol,oct-1-en-3-ol
9,animalproduct,Egg,FDB008541,18635,3-(Methylthio)propionaldehyde,3-methylsulfanylpropanal


In [8]:
# ## fuzzywuzzy implementation
# ingredients = flavor_df["entity_alias_readable"].unique()
# start = time.process_time()
# food_df['key']=food_df['ingredient'].apply(lambda x : [process.extract(x, ingredients, limit=1)][0][0][0])
# duration = time.process_time() - start 
# print(duration)    

In [9]:
## levenshtein implementation
ingredients = flavor_df["entity_alias_readable"].unique()
start = time.process_time()
food_df['key']=food_df['ingredient'].apply(lambda x : min(ingredients, key=lambda i : lev(x, i))) 
duration = time.process_time() - start 
print(duration)  
food_df.iloc[0:10, :]

0.1914479999999994


Unnamed: 0,id,title,instructions,nutr_values_per100g,ingredient,key
0,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","yogurt, greek, plain, nonfat",Northern bluefin tuna
1,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","strawberries, raw",Strawberry Jam
2,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"{'energy': 81.12946131894766, 'fat': 2.1401392...","cereals ready-to-eat, granola, homemade",Wholewheat Bread
3,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","sugars, granulated",Pomegranate
4,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","oil, corn, peanut, and olive",European anchovy
5,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","egg substitute, powder",Sugar substitute
6,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","orange juice, raw",Orange Oil
7,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","orange juice, raw",Orange Oil
8,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","leavening agents, baking powder, double-acting...",Mandarin Orange Peel Oil
9,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",[{'text': 'Cream sugar and butter together til...,"{'energy': 477.09640393594606, 'fat': 23.41248...","wheat flour, white, all-purpose, unenriched",Breadnut tree seed


In [10]:
merged_df = flavor_df.merge(food_df,left_on='entity_alias_readable',right_on='key', how="inner")
merged_df.iloc[[0,3], :]

Unnamed: 0,category,entity_alias_readable,fooddb_id,pubchem_id,common_name,iupac_name,id,title,instructions,nutr_values_per100g,ingredient,key
0,bakery,Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan,0007c8edef,Easy Chocolate Frosting Recipe,[{'text': 'Heat chocolate on top of double boi...,"{'energy': 367.7230503667077, 'fat': 4.3421179...","cheese, parmesan, hard",Wheaten Bread
3,bakery,Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan,00180ff35c,Potato and Hamburger Baskets #SP5,"[{'text': 'Preheat oven to 425F.'}, {'text': '...","{'energy': 265.4093538084956, 'fat': 18.562064...","cheese, cheddar",Wheaten Bread


In [11]:
## reorder the cols
merged_df = merged_df[["id", "title", "instructions","ingredient", "entity_alias_readable", "fooddb_id", "pubchem_id", "common_name", "iupac_name"]]
merged_df = merged_df.rename(columns = {"title": "food", "entity_alias_readable": "ingredient_flavorDB"})
merged_df

Unnamed: 0,id,food,instructions,ingredient,ingredient_flavorDB,fooddb_id,pubchem_id,common_name,iupac_name
0,0007c8edef,Easy Chocolate Frosting Recipe,[{'text': 'Heat chocolate on top of double boi...,"cheese, parmesan, hard",Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan
1,000c364a4a,Aunt Julie's Pineapple Cookies,[{'text': 'Mix the butter and the cream cheese...,"cheese, parmesan, hard",Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan
2,000cfbeccf,Broccoli Chicken Casserole Recipe,[{'text': 'Place thawed broccoli in bottom of ...,"cheese, cheddar",Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan
3,00180ff35c,Potato and Hamburger Baskets #SP5,"[{'text': 'Preheat oven to 425F.'}, {'text': '...","cheese, cheddar",Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan
4,00196f106d,Tomato-Olive Bread,"[{'text': 'Preheat the oven to 350 degrees.'},...","wheat germ, crude",Wheaten Bread,FDB011267,263034,Difurfuryl ether,2-(furan-2-ylmethoxymethyl)furan
...,...,...,...,...,...,...,...,...,...
9731,00196f106d,Tomato-Olive Bread,"[{'text': 'Preheat the oven to 350 degrees.'},...","spices, thyme, dried",Chinese bayberry,FDB013255,5280445,luteolin,"2-(3,4-dihydroxyphenyl)-5,7-dihydroxychromen-4..."
9732,00196f106d,Tomato-Olive Bread,"[{'text': 'Preheat the oven to 350 degrees.'},...","spices, thyme, dried",Chinese bayberry,FDB012060,8768,"3,4-Dihydroxybenzaldehyde","3,4-dihydroxybenzaldehyde"
9733,00196f106d,Tomato-Olive Bread,"[{'text': 'Preheat the oven to 350 degrees.'},...","spices, thyme, dried",Chinese bayberry,FDB002594,637542,p-coumaric acid,(E)-3-(4-hydroxyphenyl)prop-2-enoic acid
9734,00196f106d,Tomato-Olive Bread,"[{'text': 'Preheat the oven to 350 degrees.'},...","spices, thyme, dried",Chinese bayberry,FDB003285,11552,3-Methylbutanal,3-methylbutanal


In [15]:
## flatten col "nutrition per 100g"
## then join
with open(food_path,'r') as f:
    data = json.loads(f.read())
nutr = pd.json_normalize(data)
flatten_nutr = nutr.filter(regex="nutr_values_per100g")
flatten_nutr.insert(0, "title", nutr["title"], True)
flatten_nutr.insert(0, "id", nutr["id"], True)

In [16]:
flatten_nutr

Unnamed: 0,id,title,nutr_values_per100g.energy,nutr_values_per100g.fat,nutr_values_per100g.protein,nutr_values_per100g.salt,nutr_values_per100g.saturates,nutr_values_per100g.sugars
0,000095fc1d,Yogurt Parfaits,81.129461,2.140139,6.914437,0.055978,0.365347,5.086341
1,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe",477.096404,23.412486,7.625492,0.548621,3.425054,14.298443
2,00059b093b,Honey Sriracha Chicken Wings,208.058983,14.297046,15.383456,1.063915,4.535687,3.048951
3,0005fc89f7,Shrimp and Caper Salad,194.752596,15.980767,11.946687,0.614843,2.366704,0.314583
4,0006ca31f4,Natural Peanut Butter Chocolate Bon Bons,303.435400,5.094847,5.067961,0.019791,2.048394,63.210605
...,...,...,...,...,...,...,...,...
51230,fffb3bbff2,Granola Supreme,451.751314,25.488530,1.147211,0.082548,13.190866,53.392914
51231,fffd4b124b,Almond Bark Candy,431.615316,23.820486,10.677064,0.033979,3.479518,7.628235
51232,fffd6d487a,Chocolate Crumble,461.730085,26.190802,5.283055,0.494183,16.115193,29.173830
51233,fffdbfd298,Cran-Apple White Chocolate Popcorn,210.211482,1.987518,0.679577,0.007365,0.871569,44.795721


In [19]:
final_df = merged_df.merge(flatten_nutr, on="id", how="left").sort_values(by=["id"])

In [68]:
final_df.iloc[0:3, :]

Unnamed: 0,id,food,instructions,ingredient,ingredient_flavorDB,fooddb_id,pubchem_id,common_name,iupac_name,title,nutr_values_per100g.energy,nutr_values_per100g.fat,nutr_values_per100g.protein,nutr_values_per100g.salt,nutr_values_per100g.saturates,nutr_values_per100g.sugars
2328,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"strawberries, raw",Strawberry Jam,FDB011707,22873,Hexyl Hexanoate,hexyl hexanoate,Yogurt Parfaits,81.129461,2.140139,6.914437,0.055978,0.365347,5.086341
204,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"cereals ready-to-eat, granola, homemade",Wholewheat Bread,,61209,"3-Methylcyclopentane-1,2-dione","3-methylcyclopentane-1,2-dione",Yogurt Parfaits,81.129461,2.140139,6.914437,0.055978,0.365347,5.086341
205,000095fc1d,Yogurt Parfaits,[{'text': 'Layer all ingredients in a serving ...,"cereals ready-to-eat, granola, homemade",Wholewheat Bread,FDB003296,11579,Levulinic Acid,4-oxopentanoic acid,Yogurt Parfaits,81.129461,2.140139,6.914437,0.055978,0.365347,5.086341


In [44]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9736 entries, 2328 to 4188
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             9736 non-null   object 
 1   food                           9736 non-null   object 
 2   instructions                   9736 non-null   object 
 3   ingredient                     9736 non-null   object 
 4   ingredient_flavorDB            9736 non-null   object 
 5   fooddb_id                      9736 non-null   object 
 6   pubchem_id                     9736 non-null   int64  
 7   common_name                    9736 non-null   object 
 8   iupac_name                     9736 non-null   object 
 9   title                          9736 non-null   object 
 10  nutr_values_per100g.energy     9736 non-null   float64
 11  nutr_values_per100g.fat        9736 non-null   float64
 12  nutr_values_per100g.protein    9736 non-null 

In [67]:
## flatten flavor molecules into list for each food
flatten_molecule = final_df.groupby(["id", "food"]).agg(flavor_molecules=("iupac_name", lambda x : x.tolist())).reset_index()
flatten_molecule["count"] = flatten_molecule.apply(lambda x : len(x['flavor_molecules']), axis=1)
flatten_molecule.iloc[0:3, :]

Unnamed: 0,id,food,flavor_molecules,count
0,000095fc1d,Yogurt Parfaits,"[hexyl hexanoate, 3-methylcyclopentane-1,2-dio...",88
1,00051d5b9d,"Salt Free, Low Cholesterol Sugar Cookies Recipe","[5-methyl-2-propan-2-ylcyclohexan-1-one, 2-(4-...",318
2,00059b093b,Honey Sriracha Chicken Wings,[2-[3-[(4-amino-2-methylpyrimidin-5-yl)methyl]...,608


In [104]:
## find all unique molecules
m_unique = set()
for i in range(len(flatten_molecule["flavor_molecules"])):
    for j in flatten_molecule["flavor_molecules"].iloc[i]:
        if len(j) > 0:
            m_unique.add(j)
len(m_unique)

623

In [21]:
import gensim