### Embeddings taken from keras examples

In [1]:
# -*- coding: utf-8 -*-

%load_ext autoreload
%matplotlib inline
%autoreload 2

from __future__ import absolute_import

import os
# os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,optimizer=fast_compile,device=gpu0,floatX=float32"

import sys
sys.path.insert(0, '../')

import numpy as np
import theano
import theano.tensor as T
import six.moves.cPickle as pickle
import os, re, json
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing import sequence, text
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils, generic_utils
from keras.models import Sequential
from keras.layers.core import Layer,Merge,Reshape,Dense,Dropout
from keras.layers.embeddings import Embedding
from keras import activations, initializations
from six.moves import range
from six.moves import zip

from utils_pack.utils import pickle_in,pickle_out
from utils_pack.word_embeddings_utils import NearestFood

from ipywidgets import interact
# from .utils_pack.utils import ensure_dir

Using Theano backend.


### Load model dict

In [2]:
# Read the stored embedding lookup for model_1. Later cells index it by
# ingredient token and treat a KeyError as "token has no embedding".
embedding_dict_path = os.path.join("../models",
                                   "food_terms_embedding_models",
                                   "model_1",
                                   "word_embedding_dict.pkl")
word_embedding_dict = pickle_in(embedding_dict_path)

### Load data

In [3]:
# Two stored views of the recipe corpus: a mapping that later cells iterate as
# (key, record) pairs, and a list of per-recipe records that is unzipped into
# four parallel fields.
datasets_dir = os.path.join("../","datasets")
full_dataset_dict = pickle_in(os.path.join(datasets_dir,"full_dataset_dict.pkl"))
full_dataset_list = pickle_in(os.path.join(datasets_dir,"full_dataset_list.pkl"))

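### Calculate TF-IDF for ingredients
Note: the weight computed below is each ingredient token's relative frequency over all ingredient tokens (count / total); no inverse-document-frequency term is applied.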

In [4]:
# Transpose the dataset records into four parallel tuples, one per field.
titles,token_titles,recipies,ingredients = zip(*full_dataset_list)

# Keep only the first element of every title entry, UTF-8 encoded.
titles = [entry[0].encode("utf-8") for entry in titles]

# The ingredient field is nested two levels below the recipe level; flatten
# both levels and UTF-8 encode every token.
ingredients_flat = [
    token.encode("utf-8")
    for recipe_ingredients in ingredients
    for token_group in recipe_ingredients
    for token in token_group
]
ingredients_unique = list(set(ingredients_flat))

# Number of occurrences of each token across the whole corpus.
wordcount = {}
for token in ingredients_flat:
    wordcount[token] = wordcount.get(token, 0) + 1

# Despite the name, each weight is the token's share of all ingredient tokens
# (count / total); no inverse-document-frequency term is involved.
total_tokens = len(ingredients_flat)
ingredients_tf_idf = {}
for token in ingredients_unique:
    ingredients_tf_idf[token] = 1.0 * wordcount[token] / total_tokens

# Persist the weights next to the embedding model.
tf_idf_path = os.path.join("../models",
                           "food_terms_embedding_models",
                           "model_1",
                           "tf_idf_dict.pkl")
pickle_out(tf_idf_path, ingredients_tf_idf)

## Calculate model flavours

### Simple vector mean
Each recipe vector is the plain (unweighted) average of the embedding vectors of its ingredient tokens.

In [5]:
# Build one embedding per recipe as the unweighted mean of the embeddings of
# its ingredient tokens, then persist the vectors and a NearestFood search
# object built from them.
recipe_vectors={}
skipped_recipes=[]  # keys of recipes with no ingredient token in word_embedding_dict
for key,values in full_dataset_dict.iteritems():

    # values["ingredient"] is a list of token lists: flatten and UTF-8 encode.
    # A dedicated name is used so the `ingredients` tuple from the TF-IDF cell
    # is not overwritten.
    recipe_tokens = [item.encode("utf-8") for sublist in values["ingredient"] for item in sublist]
    token_vectors=[]
    for token in recipe_tokens:
        try:
            token_vectors.append(word_embedding_dict[token])
        except KeyError:
            # Best effort: tokens without an embedding do not contribute.
            continue

    if not token_vectors:
        # np.mean over an empty array yields a scalar NaN (plus a
        # RuntimeWarning), which is not a usable recipe vector; leave the
        # recipe out instead of storing NaN.
        skipped_recipes.append(key)
        continue

    recipe_vectors[key] = np.mean(np.array(token_vectors),axis=0)

if skipped_recipes:
    print("Skipped %d recipes without any embedded ingredient" % len(skipped_recipes))

pickle_out(os.path.join("../models",
                         "food_terms_embedding_models",
                         "model_1","recipe_embeddings_simple_mean_on_ingredients.pkl"
                       ),recipe_vectors)

# NOTE(review): how NearestFood.add_data pairs the two mappings is not visible
# here, so when recipes were skipped the dataset is restricted to the keys that
# have a vector; otherwise the original mapping is passed through untouched.
if skipped_recipes:
    searchable_dataset = dict((key, full_dataset_dict[key]) for key in recipe_vectors)
else:
    searchable_dataset = full_dataset_dict

NF = NearestFood()
NF.add_data(searchable_dataset,recipe_vectors)

pickle_out(os.path.join("../models",
                         "food_terms_embedding_models",
                         "model_1","search_class_recipe_embeddings_simple_mean_on_ingredients.pkl"
                       ),NF)

### TF-IDF vector mean
Each recipe vector is a TF-IDF-weighted mean of its ingredient embeddings, with the weights computed from the ingredient lists alone.

In [6]:
# Build one embedding per recipe as the weighted mean of the embeddings of its
# ingredient tokens, using the per-token weights in ingredients_tf_idf, then
# persist the vectors and a NearestFood search object built from them.
recipe_vectors={}
skipped_recipes=[]  # keys of recipes where no token has both an embedding and a weight
for key,values in full_dataset_dict.iteritems():

    # values["ingredient"] is a list of token lists: flatten and UTF-8 encode.
    # A dedicated name is used so the `ingredients` tuple from the TF-IDF cell
    # is not overwritten.
    recipe_tokens = [item.encode("utf-8") for sublist in values["ingredient"] for item in sublist]
    token_vectors=[]
    token_weights=[]
    for token in recipe_tokens:
        try:
            token_vector = word_embedding_dict[token]
            token_weight = ingredients_tf_idf[token]
        except KeyError:
            # Best effort: a token needs both an embedding and a weight.
            continue
        token_vectors.append(token_vector)
        token_weights.append(token_weight)

    if not token_vectors:
        # Nothing to average; np.average would raise on an empty weight sum,
        # so leave the recipe out.
        skipped_recipes.append(key)
        continue

    # True weighted mean: sum(w_i * v_i) / sum(w_i). Averaging the products
    # with np.mean would divide by the token count instead and rescale every
    # recipe by its average weight.
    # Assumes all embedding vectors share one length -- TODO confirm.
    recipe_vectors[key] = np.average(np.array(token_vectors),
                                     axis=0,
                                     weights=np.array(token_weights))

if skipped_recipes:
    print("Skipped %d recipes without any weighted, embedded ingredient" % len(skipped_recipes))

pickle_out(os.path.join("../models",
                         "food_terms_embedding_models",
                         "model_1","recipe_embeddings_tf_idf_mean_on_ingredients.pkl"
                       ),recipe_vectors)

# NOTE(review): how NearestFood.add_data pairs the two mappings is not visible
# here, so when recipes were skipped the dataset is restricted to the keys that
# have a vector; otherwise the original mapping is passed through untouched.
if skipped_recipes:
    searchable_dataset = dict((key, full_dataset_dict[key]) for key in recipe_vectors)
else:
    searchable_dataset = full_dataset_dict

NF = NearestFood()
NF.add_data(searchable_dataset,recipe_vectors)

pickle_out(os.path.join("../models",
                         "food_terms_embedding_models",
                         "model_1","search_class_recipe_embeddings_tf_idf_mean_on_ingredients.pkl"
                       ),NF)