In [43]:
import os
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import h5py
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
import numpy as np
import collections
import pandas as pd
import itertools
import seaborn as sns
import time
import json
offline.init_notebook_mode()

def flatten(l):
    """Concatenate the items of every sub-sequence in `l` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def load_embeddings(path):
    """Load the embedding matrix stored under the 'nemb' key of an HDF5 file.

    Args:
        path: path to the HDF5 file.
    Returns:
        The full contents of the 'nemb' dataset, read into memory.
    """
    # The context manager closes the file even if the 'nemb' lookup raises.
    with h5py.File(path, 'r') as f:
        return f['nemb'][:]


def load_vocab(path):
    """Read a vocabulary file made of '<token> <count>' lines.

    Args:
        path: path to a text file with one whitespace-separated
            '<token> <count>' entry per line.
    Returns:
        List of (token, count) tuples in file order, with the first entry
        (UNK at position 0) dropped.
    """
    vocab = []
    # Text mode: lines must be str so they can be split on whitespace.
    with open(path, 'r') as f:
        for line in f:
            split = line.split()
            if not split:
                # tolerate blank lines instead of failing on split[0]
                continue
            vocab.append((split[0], int(split[1])))
    # ignore UNK at position 0
    return vocab[1:]


def run_tsne(nemb, n_iter=5000, perplexity=30, random_state=None):
    """Project embeddings to two dimensions with t-SNE (PCA initialisation).

    Args:
        nemb: array of embeddings, one row per item.
        n_iter: number of optimisation iterations handed to TSNE.
        perplexity: t-SNE perplexity handed to TSNE.
        random_state: seed handed to TSNE; pass an int to get a reproducible
            layout. The default (None) leaves the run unseeded, as before.
    Returns:
        The result of `TSNE.fit_transform(nemb)`.
    """
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca',
                n_iter=n_iter, verbose=1, random_state=random_state)
    return tsne.fit_transform(nemb)


def load_recipes(path='../dat/srep00196-s3.csv'):
    """Read recipes from a comma-separated text file.

    Lines starting with '#' are skipped, and so are blank lines (which would
    otherwise become a recipe whose only field is the empty string).

    Args:
        path: path to the recipe file.
    Returns:
        List of recipes; each recipe is the list of comma-separated fields
        of one line, with trailing whitespace removed.
    """
    recipes = []
    # Text mode: lines must be str for the '#' check and the ',' split.
    with open(path, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            recipes.append(line.split(','))
    return recipes
   
    
def build_food2cuisine(recipes, vocab, verbose=True):
    """Assign every vocabulary food to the cuisine that uses it most heavily.

    For each food, the number of times it occurs in the recipes of a cuisine
    is divided by the number of recipes of that cuisine; the cuisine with the
    largest ratio is assigned to the food.

    Args:
        recipes: list of recipes in the form [cuisine, food1, food2, ...]
        vocab: list of tuples whose first element is the food name
        verbose: if True (the default), print each food together with its
            two highest-scoring (cuisine, ratio) pairs.
    Returns:
        Dict mapping food to cuisine; empty when `recipes` is empty.
    """
    food_counters = {tup[0]: collections.Counter() for tup in vocab}
    cuisine_counter = collections.Counter()
    for line in recipes:
        cuisine = line[0]
        cuisine_counter[cuisine] += 1
        for food in line[1:]:
            # dict lookup instead of scanning the vocabulary list per ingredient
            if food in food_counters:
                food_counters[food][cuisine] += 1
    food2cuisine = {}
    if not cuisine_counter:
        # no recipes: there is no cuisine to assign
        return food2cuisine
    for food, food_counter in food_counters.items():
        # turn raw counts into per-cuisine usage ratios
        for cuisine, n_recipes in cuisine_counter.items():
            food_counter[cuisine] = float(food_counter[cuisine]) / float(n_recipes)
        sorted_food_counter = sorted(food_counter.items(), key=lambda a: a[1])[::-1]
        if verbose:
            # single-argument form prints the same text on Python 2 and 3
            print('%s %s' % (food, sorted_food_counter[0:2]))
        food2cuisine[food] = sorted_food_counter[0][0]
    return food2cuisine


# The "Tableau 20" palette as (R, G, B) triplets.
tableau20 = [
    (31, 119, 180), (174, 199, 232),
    (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138),
    (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213),
    (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210),
    (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141),
    (23, 190, 207), (158, 218, 229),
]
# The same palette as 'rgb(r, g, b)' strings.
tableau20_rgb = ['rgb({}, {}, {})'.format(*triplet) for triplet in tableau20]

def pretty_food(s):
    """Prettify an ingredient name, e.g. 'lemon_juice' -> 'Lemon juice'."""
    # Strip leading whitespace *before* capitalizing: capitalizing first
    # leaves a name with a leading underscore entirely lower-case.
    return s.replace('_', ' ').lstrip().capitalize()


def pretty_cuisine(s):
    """Prettify a cuisine name, e.g. 'MiddleEastern' -> 'Middle Eastern'.

    A space is inserted before every character that is not lower-case, and
    leading whitespace is then removed.
    """
    return ''.join(c if c.islower() else ' ' + c for c in s).lstrip()


def make_plot(name, points, labels, legend_labels, legend_order, legend_label_to_color, pretty_legend_label):
    """Draw an interactive scatter plot with one trace per legend label.

    Args:
        name: passed to `offline.iplot` as `filename`.
        points: sequence of points; index 0 is plotted as x, index 1 as y.
        labels: hover text of each point.
        legend_labels: legend label (group) of each point.
        legend_order: legend labels in the order their traces should appear.
            Labels that occur in `legend_labels` but are missing here are
            appended in sorted order instead of being dropped from the plot.
        legend_label_to_color: map from legend label to marker color.
        pretty_legend_label: function mapping a legend label to its display
            form; used for the legend entry and inside the hover text.
    """
    # Group points and hover labels by legend label, keeping the input order
    # inside each group.
    grouped = {}
    for point, label, legend_label in zip(points, labels, legend_labels):
        group_points, group_labels = grouped.setdefault(legend_label, ([], []))
        group_points.append(point)
        group_labels.append(label)

    # Requested legend order first, then any label the caller did not list.
    ordered_labels = []
    for legend_label in legend_order:
        if legend_label in grouped and legend_label not in ordered_labels:
            ordered_labels.append(legend_label)
    ordered_labels.extend(sorted(lab for lab in grouped if lab not in ordered_labels))

    traces = []
    for legend_label in ordered_labels:
        group_points, group_labels = grouped[legend_label]
        group_points = np.stack(group_points)
        pretty = pretty_legend_label(legend_label)
        traces.append(go.Scattergl(
            x=group_points[:, 0],
            y=group_points[:, 1],
            mode='markers',
            marker=dict(
                color=legend_label_to_color[legend_label],
                size=8,
                opacity=0.6,
            ),
            text=['{} ({})'.format(label, pretty) for label in group_labels],
            hoverinfo='text',
            name=pretty,
        ))

    # Both axes share the same settings: no grid, lines, ticks or tick labels.
    hidden_axis = dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False,
    )
    layout = go.Layout(xaxis=dict(hidden_axis), yaxis=dict(hidden_axis))
    fig = go.Figure(data=traces, layout=layout)
    offline.iplot(fig, filename=name)

In [39]:
# path = '/home/jaan/fit/food2vec'
path = '/Users/jaanaltosaar/fit/food2vec'
nemb = load_embeddings(os.path.join(path, 'embeddings.h5'))
vocab = load_vocab(os.path.join(path, 'vocab.txt'))
# load_vocab drops the UNK entry, but nemb still holds the UNK embedding at
# row 0 (it is sliced off with nemb[1:] before plotting), so vocab[i]
# corresponds to row i + 1 of nemb.
food2id = {tup[0]: i for i, tup in enumerate(vocab, start=1)}

## Plot ingredients

In [3]:
# Project the ingredient embeddings to 2-d with t-SNE.
# don't plot UNK at position 0
low_dim_embs = run_tsne(nemb[1:])

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 347 / 347
[t-SNE] Mean sigma: 0.347558
[t-SNE] Error after 100 iterations with early exaggeration: 1.257875
[t-SNE] Error after 175 iterations: 1.993717


In [4]:
# Load the recipes from the default CSV path and assign each vocabulary food
# to a cuisine (build_food2cuisine also prints its top two candidates per food).
recipes = load_recipes()
food2cuisine = build_food2cuisine(recipes, vocab)

lemon_juice [('MiddleEastern', 0.24806201550387597), ('African', 0.23011363636363635)]
brussels_sprout [('EasternEuropean', 0.005249343832020997), ('SoutheastAsian', 0.002188183807439825)]
sunflower_oil [('SouthAsian', 0.00644122383252818), ('African', 0.002840909090909091)]
mackerel [('EastAsian', 0.01592356687898089), ('WesternEuropean', 0.001504324934185784)]
porcini [('SouthernEuropean', 0.011004784688995215), ('MiddleEastern', 0.006201550387596899)]
anise_seed [('NorthernEuropean', 0.016), ('LatinAmerican', 0.005827905382242029)]
milk [('NorthAmerican', 0.26481071187746846), ('WesternEuropean', 0.2606242948476871)]
grape [('SouthAsian', 0.00966183574879227), ('NorthAmerican', 0.006815335709469223)]
camembert_cheese [('WesternEuropean', 0.003008649868371568), ('NorthAmerican', 9.632983334938831e-05)]
peanut [('SoutheastAsian', 0.09190371991247265), ('EastAsian', 0.018312101910828025)]
melon [('NorthAmerican', 0.0032270494172045084), ('SouthAsian', 0.00322061191626409)]
matsutake [(

In [5]:
# Distinct cuisines that were assigned to at least one food.
cuisines = list(set(food2cuisine.values()))

# Hand-picked xkcd color for every cuisine.
cuisine2color = {
    cuisine: sns.xkcd_rgb[color_name]
    for cuisine, color_name in [
        ('African', 'grey'),
        ('LatinAmerican', 'forest green'),
        ('NorthAmerican', 'light pink'),
        ('MiddleEastern', 'mustard yellow'),
        ('EastAsian', 'orange'),
        ('SouthAsian', 'magenta'),
        ('SoutheastAsian', 'purple'),
        ('NorthernEuropean', 'blue'),
        ('EasternEuropean', 'deep blue'),
        ('WesternEuropean', 'sky blue'),
        ('SouthernEuropean', 'olive'),
    ]
}

# Color of each food: the color of the cuisine it was assigned to.
food2color = {food: cuisine2color[cuisine] for food, cuisine in food2cuisine.items()}

In [6]:
# Order in which the cuisines appear in the plot legends.
legend_order = [
    # Africa and the Americas
    'African',
    'LatinAmerican',
    'NorthAmerican',
    # Asia and the Middle East
    'EastAsian',
    'SouthAsian',
    'SoutheastAsian',
    'MiddleEastern',
    # Europe
    'NorthernEuropean',
    'EasternEuropean',
    'WesternEuropean',
    'SouthernEuropean',
]

In [40]:
# The legend group of each food is looked up by its raw vocabulary name;
# the prettified name is what is shown on hover.
legend_labels = [food2cuisine[item[0]] for item in vocab]
labels = [pretty_food(item[0]) for item in vocab]
make_plot(
    name='food2vec_food_embeddings_tsne',
    points=low_dim_embs,
    labels=labels,
    legend_labels=legend_labels,
    legend_order=legend_order,
    legend_label_to_color=cuisine2color,
    pretty_legend_label=pretty_cuisine,
)

## Plot recipes
NB: t-SNE takes ~10-30 minutes on the ~56k recipes

In [8]:
def build_recipe_embedding(recipes, nemb, food2id):
    """Get the recipe embedding.

    A recipe's embedding is the mean of the embeddings of those of its
    ingredients that are present in `food2id`. Recipes without any such
    ingredient are skipped. `recipes` is not modified.

    Args:
        recipes: list of recipes in the form [cuisine, food1, food2, ...]
        nemb: normalized embeddings
        food2id: map from food string to index in normalized embeddings
    Returns:
        List of tuples, each tuple has form (cuisine, ingredients, recipe embedding);
        `ingredients` lists all foods of the recipe, including those without
        an embedding.
    """
    recipe_embeddings = []
    for line in recipes:
        # Slice rather than pop(0): popping would strip the cuisine from the
        # caller's recipes and break every later use (or re-run) of them.
        cuisine, foods = line[0], line[1:]
        # keep only the ingredients we have learned an embedding for
        food_ids = [food2id[food] for food in foods if food in food2id]
        if food_ids:
            embedding = np.mean(nemb[food_ids], axis=0)
            recipe_embeddings.append((cuisine, foods, embedding))
    return recipe_embeddings

In [9]:
# One (cuisine, ingredients, embedding) tuple per recipe that has at least
# one ingredient in food2id.
recipe_embeddings = build_recipe_embedding(recipes, nemb, food2id)

In [10]:
# subset = np.random.choice(range(len(recipe_embeddings)), 2000, replace=False)
# small = [recipe_embeddings[idx] for idx in subset]

In [11]:
# Split the (cuisine, ingredients, embedding) tuples into parallel sequences
# and stack the embeddings into one array with a row per recipe.
cuisine_labels = [recipe[0] for recipe in recipe_embeddings]
ingredients = tuple(recipe[1] for recipe in recipe_embeddings)
embeddings = tuple(recipe[2] for recipe in recipe_embeddings)
recipe_nemb = np.vstack(embeddings)

In [12]:
# Number of embedded recipes per cuisine.
cuisine_counter = collections.Counter(cuisine_labels)

In [13]:
# Display the per-cuisine recipe counts.
cuisine_counter

Counter({'African': 352,
         'EastAsian': 2512,
         'EasternEuropean': 381,
         'LatinAmerican': 2917,
         'MiddleEastern': 645,
         'NorthAmerican': 41523,
         'NorthernEuropean': 250,
         'SouthAsian': 621,
         'SoutheastAsian': 457,
         'SouthernEuropean': 4180,
         'WesternEuropean': 2659})

In [26]:
# Run t-SNE on the recipe embeddings and report the wall-clock time in minutes.
# NOTE(review): a later cell overwrites low_dim_recipe_embs with the copy
# cached in recipe_emb_path.
t0 = time.time()
low_dim_recipe_embs = run_tsne(recipe_nemb)
print 'time to run tsne on %d points: %.3f mins' % (len(recipe_nemb), (time.time() - t0) / 60.)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 56497
[t-SNE] Computed conditional probabilities for sample 2000 / 56497
[t-SNE] Computed conditional probabilities for sample 3000 / 56497
[t-SNE] Computed conditional probabilities for sample 4000 / 56497
[t-SNE] Computed conditional probabilities for sample 5000 / 56497
[t-SNE] Computed conditional probabilities for sample 6000 / 56497
[t-SNE] Computed conditional probabilities for sample 7000 / 56497
[t-SNE] Computed conditional probabilities for sample 8000 / 56497
[t-SNE] Computed conditional probabilities for sample 9000 / 56497
[t-SNE] Computed conditional probabilities for sample 10000 / 56497
[t-SNE] Computed conditional probabilities for sample 11000 / 56497
[t-SNE] Computed conditional probabilities for sample 12000 / 56497
[t-SNE] Computed conditional probabilities for sample 13000 / 56497
[t-SNE] Computed conditional probabilities 

In [14]:
# Location of the cached 2-d recipe embeddings. The save call is commented
# out; uncomment it after running t-SNE to (re)write the cache.
recipe_emb_path = os.path.join(path, 'low_dim_recipe_embs.npz')
# np.savez_compressed(recipe_emb_path, low_dim_recipe_embs)

In [30]:
# Read the cached 2-d recipe embeddings back; an array saved positionally
# into an .npz archive is stored under the key 'arr_0'.
with open(recipe_emb_path, 'rb') as npz_file:
    npz_archive = np.load(npz_file)
    low_dim_recipe_embs = npz_archive['arr_0']

In [31]:
# Alternative t-SNE implementations that were tried (kept disabled):
# low_dim_recipe_embs = run_fast_tsne(embeddings)
# low_dim_recipe_embs = tsne.bh_sne(embeddings)
# t0 = time.time()
# low_dim_recipe_embs = bhtsne.run_bh_tsne(nemb, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False,initial_dims=50, use_pca=True, max_iter=1000)
# print 'time to run tsne on %d points: %.3f mins' % (len(recipe_nemb), (time.time() - t0) / 60.)
# Convert the array of 2-d points into nested Python lists for plotting.
low_dim_recipe_embs_list = low_dim_recipe_embs.tolist()

In [36]:
# Hover text of each recipe: its prettified ingredients, comma-separated.
# pretty_food is the ingredient prettifier defined in this notebook; the
# previously used name pretty_label is not defined anywhere in it.
recipe_labels = [', '.join([pretty_food(food) for food in foods]).lower().capitalize() for foods in ingredients]

In [37]:
# Scatter plot of the 2-d recipe embeddings, one trace per cuisine; hovering
# over a point shows the recipe's ingredients.
make_plot(name='food2vec_recipe_embeddings_tsne',
          points=low_dim_recipe_embs_list, 
          labels=recipe_labels, 
          legend_labels=cuisine_labels, 
          legend_order=legend_order, 
          legend_label_to_color=cuisine2color, 
          pretty_legend_label=pretty_cuisine)

## Write foods to json

In [44]:
# Map every raw vocabulary name to its prettified form and store it as JSON.
foods = [tup[0] for tup in vocab]
food2prettyfood = {food: pretty_food(food) for food in foods}
# json.dump serializes into the file object. json.dumps only returns a string
# (its second positional argument is `skipkeys`), which left the file empty.
# Text mode, because json.dump writes str.
with open(os.path.join(path, 'food2prettyfood.json'), 'w') as f:
    json.dump(food2prettyfood, f)