# Imports

In [None]:
import gensim
import ujson as json
import numpy as np
import pandas as pd
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Prepare datasets

In [None]:
def _add_molecule_documents(frame, molecules_col):
    """Add the `mols`, `mol_doc` and `molecules_count` columns to `frame`.

    Parameters
    ----------
    frame : pd.DataFrame
        Must have an `id` column and a `molecules_col` column whose cells are
        iterables of dicts exposing a `pubchem_id` key.
    molecules_col : str
        Name of the column that holds the molecule dicts.

    Returns
    -------
    pd.DataFrame
        The same `frame` object, with the three columns added.
    """
    # Each molecule becomes one "word": its PubChem id rendered as a string
    frame['mols'] = frame[molecules_col].apply(lambda el: [str(m['pubchem_id']) for m in el])
    # One TaggedDocument per row, tagged with that row's id
    frame['mol_doc'] = frame.apply(lambda r: gensim.models.doc2vec.TaggedDocument(r['mols'], [r['id']]), axis=1)
    # Annotate document lengths for every row, so the column holds no NaN
    # after the two frames are concatenated
    frame['molecules_count'] = frame['mols'].apply(len)
    return frame


# Recipes: read the file, close it, then transform
with open('data/joined_dev_set.json') as f:
    data = json.load(f)
df = pd.json_normalize(data)
df = df.astype({'id': 'string'})
df = _add_molecule_documents(df, 'flavor_molecules')
df['type'] = 'recipe'

# Ingredients: same pipeline, with ids prefixed by "ing-"
with open('data/flavor_DB.json') as f:
    data = json.load(f)
flavor_df = pd.json_normalize(data)
flavor_df = flavor_df.rename(columns={'entity_id': 'id', 'entity_alias_readable': 'food'})
flavor_df['id'] = flavor_df['id'].apply(lambda i: f"ing-{i}" )
flavor_df = _add_molecule_documents(flavor_df, 'molecules')
flavor_df['type'] = 'ingredient'

# Stack recipes and ingredients into one corpus with a fresh 0..n-1 index
df = pd.concat([df, flavor_df], ignore_index=True)
del flavor_df

# HyperParameters

In [None]:
MODEL = 0  # passed as Doc2Vec `dm`; 0 selects PV-DBOW
VECTOR_SIZE = 300
# Window wide enough to span the longest molecule document. Computed from
# `mols`, which every row has, rather than `molecules_count`, which is NaN for
# recipe rows and would make the maximum a float over ingredients only.
WINDOW_SIZE = int(df['mols'].apply(len).max())
EPOCHS = 40
DIM_REDUCTION = 3  # number of UMAP output components

# Train Model

In [None]:
# Configure Doc2Vec from the hyperparameter cell, then scan the tagged
# documents once to build the vocabulary.
model = gensim.models.doc2vec.Doc2Vec(
    dm=MODEL,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    epochs=EPOCHS,
)
model.build_vocab(df['mol_doc'])

In [None]:
# Train on the same corpus that was used to build the vocabulary, reusing the
# document count and epoch setting stored on the model.
model.train(
    df['mol_doc'],
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Search for similar flavor vectors

In [None]:
def random_suggestion(topn=10, random_state=None):
    """Pick one random row of `df` and list the documents most similar to it.

    A vector is inferred from the sampled row's molecule document and compared
    against the trained document vectors in `model.dv`.

    Parameters
    ----------
    topn : int, default 10
        Number of neighbours to request from `most_similar`.
    random_state : optional
        Forwarded to `DataFrame.sample` so the sampled row can be repeated.
        The default (None) keeps the draw unseeded.

    Returns
    -------
    pd.DataFrame
        First row: `id` and `food` of the sampled document. Following rows:
        `id`, `similarity` and `food` of each neighbour, with `food` joined in
        from `df` by id.
    """
    row = df.sample(n=1, random_state=random_state)
    flavor_vec = model.infer_vector(row.iloc[0]['mol_doc'].words)
    similars = model.dv.most_similar(flavor_vec, topn=topn)
    s_df = pd.DataFrame(similars, columns=['id','similarity']).astype({'id': 'string'})
    # Left merge keeps the neighbours in similarity order while attaching names
    neighbours = s_df.merge(df, on='id', how='left')[['id','similarity','food']]
    return pd.concat([row[['id','food']], neighbours])

In [None]:
# Show one randomly sampled document followed by its most similar documents
random_suggestion()

# Dimensional Reduction

In [None]:
import umap.umap_ as umap

In [None]:
# Reduce the trained document vectors to DIM_REDUCTION components with UMAP.
# The input is model.dv.vectors, so the rows of `projection` correspond to
# model.dv.index_to_key.
doc_vectors = model.dv.vectors
reducer = umap.UMAP(n_components=DIM_REDUCTION)
projection = reducer.fit_transform(doc_vectors)
projection

# Visualize Flavor Space

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from bokeh.layouts import column
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ( 
    HoverTool, ColumnDataSource, Callback, DataTable, TableColumn,
    CustomJS, CategoricalColorMapper, CategoricalMarkerMapper, LinearColorMapper
)
from bokeh.transform import linear_cmap
from bokeh.palettes import Cividis256, Magma256

# Seaborn defaults for any static matplotlib figures in this notebook
sns.set(
    style='white',
    context='notebook',
    rc={'figure.figsize': (14, 10)},
)
# Render Bokeh plots inline in the notebook
output_notebook()

In [None]:
# Debug: check flavor space
# plt.scatter(projection[:,0], projection[:,1])

In [None]:
# One row per trained document vector, in the model's own order
projection_df = pd.DataFrame(projection, columns=('x', 'y', 'z'))
projection_df['id'] = model.dv.index_to_key
# Attach labels by document id instead of by row position, so a point can never
# be mislabelled if the model's vector order or count differs from df's rows.
labels = df.drop_duplicates(subset='id').set_index('id')[['food', 'type']]
projection_df = projection_df.join(labels, on='id')
# Overwritten from the browser by the selection callback of the scatter plot
selected_indexes = []

In [None]:

# Interactive scatter of the UMAP projection: x/y give the position, z drives
# the colour and the document type drives the marker shape. Lasso-selected
# points are listed in a table underneath the plot.
# NOTE(review): plot_width/plot_height were removed in Bokeh 3.0 in favour of
# width/height; rename them if this notebook is run on a newer Bokeh.
p1 = figure(
    title='UMAP projection of the Flavor Space',
    plot_width=800,
    plot_height=800,
    tools=('pan, wheel_zoom, reset, lasso_select')
)

p1.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <span style='font-size: 16px; color: #224499'>ID: @id</span>
        <span style='font-size: 18px'>@food</span>
    </div>
</div>
"""))


s1 = ColumnDataSource(projection_df)
color_map = LinearColorMapper(palette=Magma256, low=projection_df['z'].min(), high=projection_df['z'].max())
# Pass the factors as a plain list of strings rather than a NumPy array
marker_map = CategoricalMarkerMapper(factors=list(projection_df['type'].unique()), markers=['circle', 'triangle'])

p1.scatter(
    'x',
    'y',
    source=s1,
    marker=dict(field='type', transform=marker_map),
    color=dict(field='z', transform=color_map),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=6
)

# Table source, filled by the selection callback below
s2 = ColumnDataSource(dict(id=[],food=[]))
p2 = DataTable(source=s2, columns=[
    TableColumn(field='id'),
    TableColumn(field='food')
])

s1.selected.js_on_change('indices',
     CustomJS(args=dict(s1=s1, s2=s2),
              code="""
                // Copy the selection into a plain array of row positions
                const picked = Array.from(cb_obj.indices);
                const ids = [];
                const foods = [];
                for (let i = 0; i < picked.length; i++) {
                    ids.push(s1.data['id'][picked[i]]);
                    foods.push(s1.data['food'][picked[i]]);
                }
                s2.data['id'] = ids;
                s2.data['food'] = foods;
                // Mirror the selection into the Python kernel as a list literal,
                // so an empty or single-point selection is still valid Python.
                // The IPython global only exists in the classic notebook UI.
                if (typeof IPython !== 'undefined' && IPython.notebook && IPython.notebook.kernel) {
                    IPython.notebook.kernel.execute('selected_indexes = [' + picked.join(', ') + ']');
                }
                s2.change.emit();
                """
             )
)


layout = column(p1, p2)
show(layout)

In [None]:
# Rows whose food name mentions garlic. na=False treats rows with a missing
# `food` as non-matching, so the mask is strictly boolean and indexing with it
# cannot raise on NaN.
df[df['food'].str.contains('Garlic', na=False)]

In [None]:
# Inspect three rows of df by index label
row_labels = [1259, 1364, 1339]
df.loc[row_labels]

In [None]:
# Sum the trained vectors of three documents and look for the documents nearest
# to that sum. The rows are resolved to ids first and the vectors are fetched
# by id, so the lookup does not depend on model.dv.vectors sharing df's order.
seed_ids = df.loc[[1259, 1364, 1339], 'id'].tolist()
combined_vec = np.sum(model.dv[seed_ids], axis=0)
similars = model.dv.most_similar(combined_vec)
s_df = pd.DataFrame(similars, columns=['id','similarity']).astype({'id': 'string'})
s_df.merge(df, on='id', how='left')[['id','similarity','food']]

In [None]:
# `unique_ingredient` values of every row whose id is '3c55f46ddf'
id_matches = df['id'] == '3c55f46ddf'
list(df.loc[id_matches, 'unique_ingredient'])

In [None]:
# Column labels of the rows whose id is 'd7c6d60f0a'
id_matches = df['id'] == 'd7c6d60f0a'
df.loc[id_matches].columns

In [None]:
# Number of rows tagged as ingredients
is_ingredient = df['type'] == 'ingredient'
len(df.loc[is_ingredient])