# Imports

In [26]:
import logging
import warnings

import gensim
import numpy as np
import pandas as pd
import ujson as json
from fuzzywuzzy import fuzz
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Prepare datasets

In [21]:
def _pubchem_ids(molecules):
    """Return the `pubchem_id` of every molecule record, as strings."""
    return [str(molecule['pubchem_id']) for molecule in molecules]


def _tagged_doc(record):
    """Wrap a row's molecule ids in a TaggedDocument tagged with the row's id."""
    return gensim.models.doc2vec.TaggedDocument(record['mols'], [record['id']])


# --- Recipes -----------------------------------------------------------------
with open('data/joined_dev_set.json') as f:
    data = json.load(f)
df = pd.json_normalize(data).astype({'id': 'string'})
# Molecule "words" of each recipe
df['mols'] = df['flavor_molecules'].apply(_pubchem_ids)
# One tagged document per recipe
df['mol_doc'] = df.apply(_tagged_doc, axis=1)
df['type'] = 'recipe'

# --- Ingredients -------------------------------------------------------------
with open('data/flavor_DB.json') as f:
    data = json.load(f)
flavor_df = pd.json_normalize(data).rename(
    columns={'entity_id': 'id', 'entity_alias_readable': 'food'}
)
# Prefix ingredient ids with "ing-"
flavor_df['id'] = flavor_df['id'].apply(lambda entity_id: f"ing-{entity_id}")
# Molecule "words" of each ingredient
flavor_df['mols'] = flavor_df['molecules'].apply(_pubchem_ids)
# One tagged document per ingredient
flavor_df['mol_doc'] = flavor_df.apply(_tagged_doc, axis=1)
# Annotate document lengths
flavor_df['molecules_count'] = flavor_df['mols'].apply(len)
flavor_df['type'] = 'ingredient'

# --- Combined corpus: recipes first, then ingredients, with a fresh 0..n-1 index
df = pd.concat([df, flavor_df]).reset_index(drop=True)
del flavor_df

In [23]:
# Last five rows of the combined frame (these are ingredient rows).
df.tail(5)

Unnamed: 0,id,food,unique_ingredient,ingredient_count,category_freq,instructions,flavor_molecules,molecules_count,nutr_values_per100g.energy,nutr_values_per100g.fat,...,type,category,category_readable,entity_alias_basket,natural_source_name,entity_alias,molecules,natural_source_url,entity_alias_url,entity_alias_synonyms
1929,ing-968,Green zucchini,,,,,,107,,,...,ingredient,vegetable,Vegetable,,Cucurbita_pepo,,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Cucurbita_pepo,https://en.wikipedia.org/wiki/Zucchini,Courgette
1930,ing-969,Yellow zucchini,,,,,,103,,,...,ingredient,vegetable,Vegetable,,Cucurbita_pepo,,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Cucurbita_pepo,https://en.wikipedia.org/wiki/Zucchini,Yellow zucchini
1931,ing-970,Saskatoon berry,,,,,,94,,,...,ingredient,fruit-berry,Berry,,Amelanchier,,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Amelanchier,https://en.wikipedia.org/wiki/Amelanchier_alni...,"Saskatoon, Pacific serviceberry, Western serv..."
1932,ing-971,Nanking cherry,,,,,,94,,,...,ingredient,fruit-berry,Berry,,Prunus cerasus,,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Prunus_cerasus,https://en.wikipedia.org/wiki/Prunus_tomentosa,"Korean cherry, Manchu cherry, Downy cherry, Sh..."
1933,ing-972,Japanese pumpkin,,,,,,100,,,...,ingredient,fruit,Fruit,,Winter squash,,"[{'bond_stereo_count': 0, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Winter_squash,https://en.wikipedia.org/wiki/Kabocha,"Japanese pumpkin, Kabocha"


# HyperParameters

In [7]:
# Value passed to Doc2Vec as `dm`; 0 selects the PV-DBOW training algorithm.
MODEL = 0 # PV-DBOW
# Dimensionality of the learned document vectors (`vector_size`).
VECTOR_SIZE = 300
# Context window, set to the largest `molecules_count` value found in `df`.
# NOTE(review): with dm=0 the window presumably only matters if word vectors
# are trained as well — confirm whether this setting has any effect here.
WINDOW_SIZE = df['molecules_count'].max() 
# Number of training epochs.
EPOCHS = 40
# Number of UMAP output components. The projection cell names exactly three
# columns ('x', 'y', 'z'), so changing this value requires updating that cell.
DIM_REDUCTION = 3

# Train Model

In [8]:
# Create an untrained Doc2Vec model from the hyperparameters, then scan the
# tagged molecule documents to build its vocabulary and document-tag index.
doc2vec_params = dict(
    dm=MODEL,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    epochs=EPOCHS,
)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)
model.build_vocab(df['mol_doc'])

2021-11-22 19:27:25,939 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d300,n5,mc5,s0.001,t3)', 'datetime': '2021-11-22T19:27:25.882815', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) \n[Clang 11.1.0 ]', 'platform': 'macOS-12.0.1-x86_64-i386-64bit', 'event': 'created'}
2021-11-22 19:27:25,948 : INFO : collecting all words and their counts
2021-11-22 19:27:25,950 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-11-22 19:27:26,088 : INFO : collected 1788 word types and 1934 unique tags from a corpus of 1934 examples and 371674 words
2021-11-22 19:27:26,089 : INFO : Creating a fresh vocabulary
2021-11-22 19:27:26,096 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1345 unique words (75.22371364653245%% of original 1788, drops 443)', 'datetime': '2021-11-22T19:27:26.096190', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15)

In [9]:
# Train on the same tagged documents that were used to build the vocabulary;
# the example count and epoch count are read back from the model itself.
model.train(df['mol_doc'], total_examples=model.corpus_count, epochs=model.epochs)

2021-11-22 19:27:35,247 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 1345 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=677', 'datetime': '2021-11-22T19:27:35.247947', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) \n[Clang 11.1.0 ]', 'platform': 'macOS-12.0.1-x86_64-i386-64bit', 'event': 'train'}
2021-11-22 19:27:35,512 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-11-22 19:27:35,524 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-11-22 19:27:35,528 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-22 19:27:35,529 : INFO : EPOCH - 1 : training on 371674 raw words (348388 effective words) took 0.3s, 1256171 effective words/s
2021-11-22 19:27:35,779 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-11-22 19:27:35,791 : INFO : worker thread finished; awaiting finish of 1 more threads


In [None]:
# Search for similar flavor vectors

In [12]:
def random_suggestion(topn=10, random_state=None):
    """Sample one row of `df` and list the documents most similar to it.

    The sampled row's molecule document is re-embedded with
    `model.infer_vector` and compared against the trained document vectors.

    Parameters
    ----------
    topn : int, default 10
        Number of nearest neighbours to request from the model.
    random_state : int or None, default None
        Forwarded to `DataFrame.sample` so the sampled row can be made
        repeatable. It does not seed `infer_vector`.

    Returns
    -------
    pandas.DataFrame
        The first row is the sampled item (`id`, `food`; `similarity` is NaN).
        The remaining rows are the neighbours with `id`, `similarity` and
        `food`, in the order returned by `most_similar`.
    """
    sampled = df.sample(n=1, random_state=random_state)
    flavor_vec = model.infer_vector(sampled.iloc[0]['mol_doc'].words)
    neighbours = model.dv.most_similar(positive=[flavor_vec], topn=topn)
    neighbours_df = pd.DataFrame(neighbours, columns=['id', 'similarity']).astype({'id': 'string'})
    # Only the food name is needed from `df`, so join against that column alone
    # instead of the full wide frame.
    neighbours_df = neighbours_df.merge(df[['id', 'food']], on='id', how='left')
    return pd.concat([sampled[['id', 'food']], neighbours_df[['id', 'similarity', 'food']]])

In [13]:
# Show one randomly sampled item followed by its nearest neighbours.
random_suggestion()

Unnamed: 0,id,food,similarity
170,2b62524940,Flavored Simple Syrups,
0,2b62524940,Flavored Simple Syrups,0.950752
1,6af95944a9,Apple Salsa,0.822376
2,1a8d32738f,Hot Cranberry Drink,0.816156
3,c92cb2e84f,Cheery Cherry Punch,0.760965
4,b099336710,Raspberry Revolution Smoothie (Nordstrom),0.709396
5,c641cf762d,Hurricane Smoothie,0.69994
6,4601a99c1e,Cranberry Relish,0.686615
7,352239c704,"Cholesterol Free, Low Fat Cookies W/ Icing",0.683366
8,33d1163df0,No Cook Cranberry Relish,0.667553


# Dimensional Reduction

In [14]:
import umap.umap_ as umap

In [15]:
# Reduce the trained document vectors to DIM_REDUCTION dimensions with UMAP.
# NOTE(review): no random_state is passed, so the projection presumably differs
# from run to run — confirm whether reproducible coordinates are needed.
reducer = umap.UMAP(n_components=DIM_REDUCTION)
projection = reducer.fit_transform(model.dv.vectors) # row i is assumed to belong to model.dv.index_to_key[i]
projection

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


array([[ 6.6821623,  4.8610272, -1.845947 ],
       [ 7.884981 ,  9.264379 ,  1.5613925],
       [ 8.941873 ,  7.9035196,  1.88472  ],
       ...,
       [ 7.9416146, 11.935551 ,  4.1431394],
       [ 7.9552813, 11.9433   ,  4.0863476],
       [ 8.044895 , 11.881279 ,  3.8303423]], dtype=float32)

# Visualize Flavor Space

In [170]:
import seaborn as sns
import matplotlib.pyplot as plt

from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
from bokeh.layouts import column, row
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ( 
    HoverTool, ColumnDataSource, Callback, DataTable, TableColumn,
    CustomJS, CategoricalColorMapper, CategoricalMarkerMapper, LinearColorMapper
)
from bokeh.transform import linear_cmap
from bokeh.palettes import Cividis256, Magma256, Category20
from bokeh.models.widgets import Slider, TextInput, Select, AutocompleteInput


# Seaborn theme and default figure size for static matplotlib plots.
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})
# Configure Bokeh to render its output inside the notebook.
output_notebook()

In [17]:
# Debug: check flavor space
# plt.scatter(projection[:,0], projection[:,1])

In [65]:
# Plotting table: one row per trained document vector, holding the UMAP
# coordinates plus the descriptive columns of the matching `df` row.
projection_df = pd.DataFrame(projection, columns=('x', 'y', 'z'))
projection_df['id'] = model.dv.index_to_key

# Attach the metadata by joining on `id` instead of relying on `df` having the
# same row order as the model's tag index. A positional assignment silently
# mislabels points as soon as `df` is reordered, filtered or re-indexed.
metadata = (
    df[['id', 'food', 'type', 'unique_ingredient', 'ingredient_count']]
    .drop_duplicates(subset='id')                       # keep the join many-to-one
    .astype({'id': object})                             # same key dtype as index_to_key
    .rename(columns={'unique_ingredient': 'ingredient'})
)
# A left join keeps every projected point, in projection order.
projection_df = projection_df.merge(metadata, on='id', how='left')
selected_indexes = []

In [66]:
# First five projected points with their metadata.
projection_df.head(5)

Unnamed: 0,x,y,z,id,food,type,ingredient,ingredient_count
0,6.682162,4.861027,-1.845947,005e671ac4,Pecan Pralines,recipe,"[cream, fluid, heavy whipping, sugars, brown, ...",5.0
1,7.884981,9.264379,1.561393,00703db954,Greatest Granola,recipe,"[oats, seeds, sesame seeds, whole, dried, seed...",8.0
2,8.941873,7.90352,1.88472,007e544c80,Cream of Fresh Cauliflower Soup,recipe,"[soup, chicken broth or bouillon, dry, onions,...",9.0
3,8.099725,8.495958,1.33226,00a0ea8691,Sausage and Rice Bake,recipe,"[rice, white, long-grain, regular, unenriched,...",9.0
4,8.1041,8.210351,2.008258,00deb3f9f4,Spicy Raspberry Balsamic Dressing,recipe,"[oil, olive, salad or cooking, vinegar, balsam...",9.0


In [149]:
## find nearest neighbors
def findSimilars(food="", type_="recipe", limit=0, topn=30, max_results=5):
    """Split `projection_df` into the nearest neighbours of `food` and the rest.

    The molecule document of the first `df` row named `food` is re-embedded
    with `model.infer_vector`, and the `topn` most similar trained documents
    are looked up. The queried item and those neighbours are then filtered by
    `type_` (and by `limit` for recipes) and cut to `max_results` rows.

    Parameters
    ----------
    food : str, default ""
        Exact food name to search for. An empty string means "no query".
    type_ : {"recipe", "ingredient", "all"}, default "recipe"
        Keep only results of this type; "all" keeps both types.
    limit : int, default 0
        For `type_ == "recipe"` only: keep recipes whose `ingredient_count`
        is strictly greater than this value.
    topn : int, default 30
        Number of candidate neighbours requested from the model.
    max_results : int, default 5
        Maximum number of rows in the returned selection.

    Returns
    -------
    (nonselected_df, selected_df) : tuple of pandas.DataFrame
        `selected_df` lists the queried item first (when it passes the
        filters), then the neighbours by decreasing similarity.
        `nonselected_df` is every other row of `projection_df`. With an empty
        or unknown `food`, the full `projection_df` and an empty frame with
        the same columns are returned.

    Raises
    ------
    ValueError
        If `food` is non-empty and `type_` is not one of the accepted values.
    """
    no_selection = pd.DataFrame(columns=projection_df.columns)
    if food == "":
        return projection_df, no_selection
    if type_ not in ("recipe", "ingredient", "all"):
        raise ValueError("Wrong food type, type must be one of 'recipe, ingredient, all' ")

    matches = df.loc[df['food'] == food]
    if matches.empty:
        # Unknown name (e.g. free text from the search box): highlight nothing
        # instead of failing with an IndexError.
        return projection_df, no_selection
    query = matches.iloc[0]

    flavor_vec = model.infer_vector(query['mol_doc'].words)
    similars = model.dv.most_similar(positive=[flavor_vec], topn=topn)

    # Ranking: queried item first, then neighbours in the model's order
    # (decreasing similarity).
    ranked_ids = [query['id']] + [key for key, _ in similars if key != query['id']]
    rank_of = {key: position for position, key in enumerate(ranked_ids)}

    selected_df = projection_df[projection_df['id'].isin(ranked_ids)]
    if type_ != 'all':
        # "all" means no type restriction. Comparing the column against the
        # literal "all" would discard every row.
        selected_df = selected_df[selected_df['type'] == type_]
    if type_ == 'recipe':
        selected_df = selected_df[selected_df['ingredient_count'] > limit]

    # Order by similarity rank before truncating, so the cut keeps the best
    # matches rather than whichever rows come first in `projection_df`.
    by_rank = np.argsort(selected_df['id'].map(rank_of).to_numpy(), kind='stable')
    selected_df = selected_df.iloc[by_rank].iloc[:max_results]

    nonselected_df = projection_df[~projection_df['id'].isin(selected_df['id'])]
    return nonselected_df, selected_df

In [145]:
# Spot check: `ingredient` value of the first ingredient-type result for "Cucumber".
findSimilars(food="Cucumber",type_="ingredient")[1]['ingredient'].iloc[0]

Unnamed: 0,id,food,unique_ingredient,ingredient_count,category_freq,instructions,flavor_molecules,molecules_count,nutr_values_per100g.energy,nutr_values_per100g.fat,...,type,category,category_readable,entity_alias_basket,natural_source_name,entity_alias,molecules,natural_source_url,entity_alias_url,entity_alias_synonyms
1366,ing-366,Cucumber,,,,,,134,,,...,ingredient,vegetable-gourd,Gourd,cucumber,Cucumis,cucumber,"[{'bond_stereo_count': 1, 'undefined_atom_ster...",https://en.wikipedia.org/wiki/Cucumis,https://en.wikipedia.org/wiki/Cucumber,Cucumber


nan

In [146]:
# Spot check: an empty food name returns an empty selection frame.
findSimilars(food="",type_="ingredient")[1]

Unnamed: 0,x,y,z,id,food,type,ingredient,ingredient_count


In [171]:
# Silence NumPy's VisibleDeprecationWarning for this cell's output. The
# standard-library `warnings` module is used directly: `np.warnings` was only
# an accidental alias of it and is missing from newer NumPy releases.
warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

output_notebook() ## render Bokeh output inline; the app itself is served by show(app) at the end of this cell

# Square canvas for the 2-D view of the flavor space.
# lasso_select feeds the selection callback that fills the table.
fig = figure(
    title='UMAP projection of the Flavor Space',
    plot_width=800,
    plot_height=800,
    tools=('pan, wheel_zoom, reset, lasso_select')
)

# Hover tooltip showing the id and food name of the point under the cursor.
fig.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <span style='font-size: 16px; color: #224499'>ID: @id</span>
        <span style='font-size: 18px'>@food</span>
    </div>
</div>
"""))

## prepare data
# Initial state: findSimilars() with its defaults returns the whole projection
# as "non-selected" and an empty selection.
nonselected_df, selected_df = findSimilars()
# Points drawn with the regular styling.
chart_data = ColumnDataSource(nonselected_df)
# Points drawn highlighted (the current search results).
highlighted_data = ColumnDataSource(selected_df)
# Columns backing the DataTable; starts as a copy of the highlighted rows.
table_data = ColumnDataSource(dict(id=highlighted_data.data["id"], 
                           food=highlighted_data.data["food"], 
                           type=highlighted_data.data["type"], 
                           ingredient = highlighted_data.data["ingredient"],
                           ingredient_count=highlighted_data.data["ingredient_count"]
                          ))
# Colour encodes the third UMAP coordinate, mapped onto a 4-colour palette
# between the minimum and maximum of `z`.
color_map = LinearColorMapper(palette=Category20[4], low=projection_df['z'].min(), high=projection_df['z'].max())
# Marker shape encodes the row type. The two markers are paired with the type
# values in their order of first appearance in projection_df.
marker_map = CategoricalMarkerMapper(factors=projection_df['type'].unique(), markers=['circle', 'triangle'])

# Base layer: every non-selected point, with marker chosen by `type` and
# colour chosen by `z`.
chart = fig.scatter(
    'x',
    'y',
    source=chart_data,
    marker=dict(field='type', transform=marker_map),
    color=dict(field='z', transform=color_map),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=6
)

## drawing scatter of selected data in red color
# No marker mapping is applied here, so all highlighted points share the
# default scatter marker regardless of type.
highlighted_chart = fig.scatter(
    'x',
    'y',
    source=highlighted_data,
    color="red",
    line_alpha=0.6,
    fill_alpha=0.6,
    size=6
)




## Browser-side callback: whenever the selected indices of chart_data change
## (e.g. via the lasso tool), rebuild table_data from the selected rows of
## chart_data and notify the table. Runs as JavaScript in the page.
chart_data.selected.js_on_change('indices', 
     CustomJS(args=dict(chart_data=chart_data, table_data=table_data), 
              code="""
                var selected_indexes = cb_obj.indices;
                console.log(selected_indexes)
                table_data.data['id'] = [];
                table_data.data['food'] = [];
                table_data.data['ingredient'] = [];
                table_data.data['ingredient_count'] = [];
                table_data.data['type'] = [];
                for (let i=0;i<selected_indexes.length;i++){
                    table_data.data['id'].push(chart_data.data['id'][selected_indexes[i]]);
                    table_data.data['food'].push(chart_data.data['food'][selected_indexes[i]]);
                    table_data.data['type'].push(chart_data.data['type'][selected_indexes[i]]);
                    table_data.data['ingredient'].push(chart_data.data['ingredient'][selected_indexes[i]]);
                    table_data.data['ingredient_count'].push(chart_data.data['ingredient_count'][selected_indexes[i]]);
                }                
                table_data.change.emit();
                """
             )
)

## display the selected data
# One table column per field of table_data.
table = DataTable(source=table_data, sizing_mode="stretch_width", columns=[
    TableColumn(field='id'),
    TableColumn(field='food'),
    TableColumn(field='type'),
    TableColumn(field='ingredient'),
    TableColumn(field='ingredient_count')
])

#### config user input widgets
def updateData(food, type_, limit):
    """Re-run the similarity search and push the result into the three data sources.

    `food`, `type_` and `limit` are forwarded to `findSimilars` (`limit` is
    coerced to int). The non-selected rows go to `chart_data`, the selected
    rows to `highlighted_data`, and the table mirrors the selected rows.
    """
    rest_df, picked_df = findSimilars(food=food, type_=type_, limit=int(limit))
    chart_data.data = rest_df
    highlighted_data.data = picked_df
    # Read the columns back from the source so the table shows exactly what
    # the highlighted glyph holds.
    table_fields = ("id", "food", "ingredient", "ingredient_count", "type")
    table_data.data = {field: highlighted_data.data[field] for field in table_fields}

    
## shared by all widget callbacks: redo the search with the current widget values
def _refresh_from_widgets():
    """Call `updateData` with the current food name, type filter and slider value."""
    updateData(
        food=text_input.value,
        type_=select_type_filter.value,
        limit=int(select_ingredient_filter.value),
    )


## text input for food input
def text_input_callback(attr, old, new):
    """Bokeh `on_change` handler for the food-name box."""
    _refresh_from_widgets()


# get all possible entries
recipe_list = df.loc[df['type'] == "recipe", "food"].tolist()
ingr_list = df.loc[df['type'] == "ingredient", "food"].tolist()

text_input = AutocompleteInput(
    title="Food Name",
    value='',
    completions=recipe_list,
    case_sensitive=False,
)
text_input.on_change("value", text_input_callback)


## filtration food with only 1 or 2 ingredients
def ingredient_filter_callback(attr, old, new):
    """Bokeh `on_change` handler for the ingredient-count slider."""
    _refresh_from_widgets()


# NOTE(review): the title says "at least N", but findSimilars compares
# ingredient_count with a strict `>` — confirm which one is intended.
select_ingredient_filter = Slider(
    start=0,
    end=5,
    value=0,
    step=1,
    title="Food with at least N ingredients",
)
select_ingredient_filter.on_change("value", ingredient_filter_callback)


## type filtration
def type_filter_callback(attr, old, new):
    """Bokeh `on_change` handler for the type selector.

    Swaps the autocomplete list to match the chosen type, clears the current
    food name, and refreshes the chart.
    """
    wants_recipes = select_type_filter.value == 'recipe'
    text_input.completions = recipe_list if wants_recipes else ingr_list
    text_input.value = ""
    _refresh_from_widgets()


select_type_filter = Select(title="Food type", value="recipe", options=["recipe", "ingredient"])
select_type_filter.on_change("value", type_filter_callback)


## Nutrition Optimizer
def nutrition_filter_callback(attr, old, new):
    """Bokeh `on_change` handler for the nutrition selector.

    Placeholder: it only prints the selected value and does not affect the chart.
    """
    ###############
    #### placeholder optimization algorithm here
    ##############
    print(select_nutrition_filter.value)    
select_nutrition_filter = Select(title="Nutrition Priority", value="Protein", options=["Protein", "Fat", "Calories"])
select_nutrition_filter.on_change("value", nutrition_filter_callback)


# Set up layouts and add to document
# layout = column(row(text_input, select_ingredient_filter, select_nutrition_filter),fig, table)
# Widgets in two columns on top, then the figure, then the table.
layout = column(row(column( select_type_filter, text_input), 
                    column(select_ingredient_filter,select_nutrition_filter)),
                fig, table)


def modify_doc(doc):
    """Add the module-level `layout` to `doc` and return the document.

    NOTE(review): the layout and all its models are created once at module
    level, not inside this function. Bokeh models presumably cannot be shared
    between documents, so verify that opening a second session works.
    """
    doc.add_root(row(layout, width=800))
    return doc


handler = FunctionHandler(modify_doc) # wraps the plain function above so Bokeh can use it to populate documents
app = Application(handler)
# Displays the application in the notebook.
show(app)

2021-11-22 22:45:38,709 : INFO : Starting Bokeh server version 2.4.1 (running on Tornado 6.1)
2021-11-22 22:45:38,711 : INFO : User authentication hooks NOT provided (default user enabled)


2021-11-22 22:45:38,853 : INFO : 200 GET /autoload.js?bokeh-autoload-element=11696&bokeh-absolute-url=http://localhost:64707&resources=none (::1) 31.09ms
INFO:tornado.access:200 GET /autoload.js?bokeh-autoload-element=11696&bokeh-absolute-url=http://localhost:64707&resources=none (::1) 31.09ms
2021-11-22 22:45:39,234 : INFO : 101 GET /ws (::1) 2.04ms
INFO:tornado.access:101 GET /ws (::1) 2.04ms
2021-11-22 22:45:39,236 : INFO : WebSocket connection opened
2021-11-22 22:45:39,239 : INFO : ServerConnection created


In [None]:
# Exploratory lookup: every row whose food name contains "Garlic" (case-sensitive).
df[df['food'].str.contains('Garlic')]

In [None]:
# Exploratory lookup: three rows picked by hard-coded index labels.
df.loc[[1259,1364,1339]]

In [None]:
# Add up three trained document vectors (hard-coded positions in
# model.dv.vectors) and list the documents closest to the combined vector.
combined_vec = model.dv.vectors[1259] + model.dv.vectors[1364] + model.dv.vectors[1339]
similars = model.dv.most_similar(combined_vec)
s_df = pd.DataFrame(similars, columns=['id','similarity']).astype({'id': 'string'})
# Bring in the food names for the returned ids.
s_df.merge(df, on='id', how='left')[['id','similarity','food']]

In [None]:
# Exploratory lookup: the `unique_ingredient` value(s) of one hard-coded id.
list(df[df['id'] == '3c55f46ddf']['unique_ingredient'])

In [None]:
# Exploratory lookup: column labels of the frame filtered to one hard-coded id.
df[df['id'] == 'd7c6d60f0a'].columns

In [None]:
# Number of ingredient-type rows in the combined frame.
len(df[df['type'] == 'ingredient'])