# Imports

In [33]:
from gensim.models import Word2Vec
import pandas as pd
import ast
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [34]:
# Helper that parses the string form of a tuple (e.g. '("a", "b")') into a Python tuple
def parseTupleFunc(tuple_str: str):
    """Parse the string form of a tuple of ingredients into a Python object.

    Args:
        tuple_str: Text containing a Python literal, e.g. '("a", "b")'.

    Returns:
        The value produced by ``ast.literal_eval``. A bare string result
        (which is what a parenthesised single value such as '("flour")'
        evaluates to) is wrapped in a 1-tuple so that iterating the result
        yields whole ingredient names rather than single characters.
        ``None`` is returned when the text cannot be parsed; in that case
        the offending text is printed.
    """
    try:
        parsed = ast.literal_eval(tuple_str)
    except Exception:
        # Best-effort parsing: show the input that failed and keep going
        # instead of aborting the whole column.
        print(tuple_str)
        return None

    # '("flour")' is just a parenthesised string, not a tuple.
    if isinstance(parsed, str):
        return (parsed,)

    return parsed

# Load the ingredient column, keep only the rows written as R-style
# c("...", "...") vectors, strip the leading "c" so that the remainder is a
# parenthesised literal, and parse each row with parseTupleFunc.
recipes = (
    pd.read_csv("../data/recipes.csv", usecols=["RecipeIngredientParts"])
    .squeeze("columns")
    .loc[lambda parts: parts.str[:2] == "c("]
    .str[1:]
    .apply(parseTupleFunc)
)

In [35]:
# Swap the pandas Series for its underlying NumPy array of parsed recipes.
recipes = recipes.to_numpy()

In [36]:
# Sanity check: display the first parsed recipe.
recipes[0]

('blueberries', 'granulated sugar', 'vanilla yogurt', 'lemon juice')

In [37]:
# Create the model without a corpus: handing the corpus to the constructor
# would also run an implicit training pass, and the explicit
# model.train(...) call further down would then train the same model a
# second time with the learning-rate schedule restarted.
model = Word2Vec(
    # use skip-gram, not CBOW
    sg=1,
    # ensures rarely-occurring ingredients still are given a vector
    min_count=1,
)

# Scan the recipes once to build the ingredient vocabulary. This also sets
# model.corpus_count, which model.train(...) is given as total_examples.
model.build_vocab(recipes, progress_per=10000)

INFO - 15:06:31: collecting all words and their counts
INFO - 15:06:31: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:06:31: PROGRESS: at sentence #10000, processed 78783 words, keeping 3498 word types
INFO - 15:06:31: PROGRESS: at sentence #20000, processed 156666 words, keeping 4106 word types
INFO - 15:06:31: PROGRESS: at sentence #30000, processed 234118 words, keeping 4459 word types
INFO - 15:06:31: PROGRESS: at sentence #40000, processed 312517 words, keeping 4694 word types
INFO - 15:06:31: PROGRESS: at sentence #50000, processed 392108 words, keeping 4858 word types
INFO - 15:06:31: PROGRESS: at sentence #60000, processed 471699 words, keeping 5007 word types
INFO - 15:06:31: PROGRESS: at sentence #70000, processed 550299 words, keeping 5126 word types
INFO - 15:06:31: PROGRESS: at sentence #80000, processed 631287 words, keeping 5227 word types
INFO - 15:06:31: PROGRESS: at sentence #90000, processed 712934 words, keeping 5395 word types
INFO - 1

In [38]:
# Not needed: the vocabulary is already built in the cell above.
# model.build_vocab(recipes, progress_per=10000)


In [39]:
# Train for 30 passes over the recipes, logging progress every second.
# The cell displays the tuple returned by train().
model.train(
    recipes,
    epochs=30,
    total_examples=model.corpus_count,
    report_delay=1,
)

INFO - 15:06:55: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 7358 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-03-22T15:06:55.779629', 'gensim': '4.3.1', 'python': '3.8.3 (default, Jul  2 2020, 16:21:59) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.0-67-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 15:06:56: EPOCH 0 - PROGRESS: at 19.21% examples, 541726 words/s, in_qsize 5, out_qsize 0
INFO - 15:06:57: EPOCH 0 - PROGRESS: at 39.96% examples, 569060 words/s, in_qsize 5, out_qsize 0
INFO - 15:06:58: EPOCH 0 - PROGRESS: at 51.61% examples, 487907 words/s, in_qsize 5, out_qsize 0
INFO - 15:06:59: EPOCH 0 - PROGRESS: at 69.02% examples, 491685 words/s, in_qsize 5, out_qsize 0
INFO - 15:07:00: EPOCH 0 - PROGRESS: at 89.73% examples, 516995 words/s, in_qsize 6, out_qsize 0
INFO - 15:07:01: EPOCH 0: training on 4120761 raw words (2922734 effective words) took 5.6s, 523333 effective words/s
IN

(87687663, 123622830)

In [40]:
# Look up the learned embedding for a single ingredient.
model.wv.get_vector("plain tomato juice")

array([-0.20123923,  0.05010962, -0.1516256 , -0.159913  ,  0.17697452,
       -0.21962458, -0.26083273,  0.54683363, -0.32839498,  0.06224587,
        0.0900903 ,  0.03506778,  0.09311107,  0.1355969 ,  0.21669076,
        0.08601019, -0.14975776, -0.20450114, -0.3422258 , -0.2763498 ,
        0.43450385, -0.21946886, -0.00867581,  0.171336  , -0.38241684,
       -0.29506716,  0.26185876, -0.25560662, -0.05153555, -0.07044638,
        0.14139736, -0.05067601, -0.18647344, -0.05818619, -0.07033163,
        0.02623757,  0.24014017, -0.19685867, -0.2597786 , -0.3696434 ,
        0.05100844, -0.193165  , -0.23040444, -0.13720913, -0.00102963,
        0.07204737, -0.02009816, -0.20352918, -0.13351886,  0.12135656,
       -0.01390866, -0.30453137, -0.14768335,  0.2061019 ,  0.16105384,
       -0.14375035, -0.2023864 ,  0.01728152, -0.24214457,  0.3883214 ,
        0.14996931,  0.04812564, -0.03132119,  0.03303896, -0.2610986 ,
        0.02217694, -0.05133477,  0.13466693, -0.29261792,  0.14

In [41]:
# Save only the word vectors (model.wv), not the whole Word2Vec model,
# using a path relative to the current working directory.
model.wv.save("recipe2vec.wordvectors")

INFO - 15:09:25: KeyedVectors lifecycle event {'fname_or_handle': 'recipe2vec.wordvectors', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-22T15:09:25.314827', 'gensim': '4.3.1', 'python': '3.8.3 (default, Jul  2 2020, 16:21:59) \n[GCC 7.3.0]', 'platform': 'Linux-5.15.0-67-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 15:09:25: saved recipe2vec.wordvectors
