In [54]:
import os
import sys
import logging

import numpy as np
import pandas as pd 

from gensim.models import Word2Vec

import config
from ingredient_parser import ingredient_parser

# Make the parent directory importable for any imports executed after this point.
# NOTE(review): this runs after `import config` above, so it cannot help that
# import resolve — confirm the intended ordering.
sys.path.append('..')

# Switch the working directory to the location configured for this platform;
# the relative paths used later ('input/...', 'models/...') resolve against it.
path = config.LINUX_PATH if sys.platform == 'linux' else config.OS_PATH
os.chdir(path)

In [55]:
# Load the recipe table and run the ingredient parser over the raw
# `ingredients` column, storing the result in a new `parsed_new` column.
data = pd.read_csv('input/df_parsed.csv')
data['parsed_new'] = data['ingredients'].apply(ingredient_parser)
data.head()

Unnamed: 0,recipe_urls,recipe_name,ingredients,ingredients_parsed,parsed_new
0,https://www.jamieoliver.com/recipes/duck-recip...,Roast duck with Marsala gravy,"['1 x 1.6kg whole duck', '2 heaped teaspoons C...",duck chinese five clementine gravy carrot onio...,"[duck, chinese five, clementine, gravy, carrot..."
1,https://www.jamieoliver.com/recipes/vegetable-...,Best-ever Brussels sprouts,"['800 g Brussels sprouts', '2 higher-welfare C...",brussels sprout cumberland sausage butter onio...,"[brussels sprout, cumberland sausage, butter, ..."
2,https://www.jamieoliver.com/recipes/pasta-reci...,Beautiful courgette carbonara,"['6 medium green and yellow courgettes', '500 ...",courgette penne egg single cream parmesan chee...,"[courgette, penne, egg, single cream, parmesan..."
3,https://www.jamieoliver.com/recipes/vegetable-...,Roasted black bean burgers,"['1½ red onions', '200 g mixed mushrooms', '10...",onion mushroom rye bread bean mature cheddar c...,"[onion, mushroom, rye bread, bean, mature ched..."
4,https://www.jamieoliver.com/recipes/chicken-re...,Chicken & tofu noodle soup,"['2 shallots', '2 cloves of garlic', '2 cm pie...",shallot chicken thigh sesame star soy rice noo...,"[shallot, chicken thigh, sesame, star, soy, ri..."


In [56]:
# get corpus with the documents sorted in alphabetical order
def get_and_sort_corpus(data):
    """Build the training corpus from the ``parsed_new`` column.

    Each document (one recipe's collection of ingredient tokens) is returned
    as a new, alphabetically sorted list. The documents stored in ``data``
    are left untouched, so re-running this cell or inspecting ``data``
    afterwards shows the original token order.

    :param data: object exposing a ``parsed_new`` column whose ``.values``
        are iterables of sortable tokens (e.g. a pandas DataFrame).
    :return: list of sorted token lists, one per row, in row order.
    """
    # sorted() copies each document; list.sort() would mutate the objects
    # held inside the DataFrame in place.
    return [sorted(doc) for doc in data.parsed_new.values]

# Build the corpus (one sorted token list per recipe row) and report how many
# documents it contains.
corpus = get_and_sort_corpus(data)
print(f"Length of corpus: {len(corpus)}")

Length of corpus: 4647


In [57]:
# Average document length, i.e. the mean number of tokens per document in the corpus
lengths = list(map(len, corpus))
avg_len = float(sum(lengths)) / len(lengths)
avg_len

5.984506132989025

In [58]:
# Train the Word2Vec model on the sorted ingredient corpus.
# Hyper-parameters:
sg = 0         # 0 selects CBOW: predict the center word from its surrounding context words
workers = 8    # number of worker threads used for training
window = 6     # context window, chosen to match the average document length computed above
min_count = 1  # keep every token, since rare ingredients are important to decide recipes

model_cbow = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
)

In [59]:
# Summarize the trained model
print(model_cbow)

# Alphabetically sorted vocabulary, kept for inspection (not printed: it is long).
# A single word's vector can be inspected with e.g. model_cbow.wv['chicken stock'].
words = sorted(model_cbow.wv.index_to_key)

# Nearest neighbours of one token. Printed explicitly: a notebook cell only
# displays its last expression, so a bare call here would be silently discarded.
print(model_cbow.wv.most_similar('cauliflower just larger than potato'))

# Similarity between two related tokens (displayed as the cell result)
model_cbow.wv.similarity('cauliflower', 'cauliflower just larger than potato')

Word2Vec(vocab=3893, vector_size=100, alpha=0.025)


0.868625

In [60]:
# Persist the trained CBOW model; the relative path resolves against the
# working directory set via os.chdir() in the first cell. The same file is
# read back later with Word2Vec.load().
model_cbow.save('models/model_cbow.bin')

In [61]:
class MeanEmbeddingVectorizer(object):
    """Encode tokenized documents as the mean of their word embeddings.

    Exposes ``fit`` / ``transform`` in the style of a scikit-learn
    transformer. There is nothing to learn: the embeddings come from the
    already-trained ``word_model``.

    :param word_model: trained embedding model exposing a ``wv`` attribute
        that provides ``vector_size``, ``key_to_index`` and ``get_vector``
        (e.g. a gensim ``Word2Vec`` instance).
    """

    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs=None, y=None):
        """No-op fit, present to comply with the scikit-learn transformer API.

        ``docs`` and ``y`` are accepted (and ignored) so the object can be
        called as ``fit(X, y)``; calling ``fit()`` with no arguments still works.

        :return: self
        """
        return self

    def transform(self, docs):
        """Encode every document in ``docs`` as its mean word vector.

        :param docs: iterable of documents, each an iterable of tokens
        :return: array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            1-D array of length ``vector_size``: the element-wise mean of the
            vectors of all in-vocabulary tokens, or all zeros when no token
            of ``sent`` is in the vocabulary (a warning is logged).
        """
        keyed_vectors = self.word_model.wv
        # key_to_index is a dict, so the membership test is O(1) per token
        # instead of a linear scan over the index_to_key list.
        vectors = [
            keyed_vectors.get_vector(word)
            for word in sent
            if word in keyed_vectors.key_to_index
        ]

        if not vectors:  # empty doc, or no known token
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vectors in shape (len(docs), vector_size);
            an empty (0, vector_size) array when ``docs`` is empty.
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence; return a well-shaped empty result.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

In [62]:
# encode document by averaging word embeddings

# Reload the model saved earlier from 'models/model_cbow.bin'.
loaded_model = Word2Vec.load('models/model_cbow.bin')
# NOTE(review): this truthiness check presumably always passes when load()
# returns at all (a failed load would be expected to raise) — confirm.
if loaded_model:
    print("Successfully loaded model")

# Encode each document in the corpus as the mean of its word vectors;
# transform() stacks one row per document.
mean_vec_tr = MeanEmbeddingVectorizer(loaded_model)
doc_vec = mean_vec_tr.transform(corpus)



Successfully loaded model
