In [67]:
import pandas as pd
import numpy as np
import json
import pickle
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import csv
from sklearn.metrics.pairwise import cosine_similarity
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kratikakothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kratikakothari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocessing

#### Kaggle and Nature dataset (to be used for predicting ingredients)

In [58]:
# Load the combined Kaggle + Nature dataset.
# Every line is read as a single record: its first comma-separated field is
# the cuisine and the remaining fields are the ingredients, with underscores
# standing in for spaces.
id_ingredients_cuisine = []
cuisines = []
# The context manager closes the file even if parsing fails part-way through.
with open('kaggle_and_nature.csv', newline='') as f_kaggleNature:
    csv_reader = csv.reader(f_kaggleNature, delimiter='\t')
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; nothing to parse.
            continue
        # Split once instead of re-splitting the same string for every field.
        cuisine, *raw_ingredients = row[0].split(",")
        id_ingredients_cuisine.append({
            # Ids are consecutive positions in the output list.
            'id': len(id_ingredients_cuisine),
            'ingredients': [name.replace("_", " ") for name in raw_ingredients],
            'cuisine': cuisine,
        })
        cuisines.append(cuisine)
# Preview a handful of records rather than the whole list.
id_ingredients_cuisine[:5]
# Normalise every ingredient of every recipe: tokenise, drop English stop
# words, strip punctuation, lower-case and lemmatise the remaining tokens.

stop_words = set(stopwords.words('english'))
# The backslash is written as "\\" because a bare "\," is an invalid escape
# sequence that newer Python versions warn about. The characters in the string
# (a backslash followed by a comma) are exactly the same as before.
intab = '''!()-[]{};:'"\\,<>?@#$%^&*_~'''
outtab = "_" * len(intab)
trantab = str.maketrans(intab, outtab)

lemmatizer = WordNetLemmatizer()

new_id_ingredients_cuisine = []

for recipe_id, recipe in enumerate(id_ingredients_cuisine):
    cleaned_ingredients = []
    for ingredient in recipe["ingredients"]:
        tokens = [
            # Punctuation is mapped to "_" and then every "_" is removed, so
            # the listed characters (and literal underscores) disappear.
            lemmatizer.lemmatize(word.translate(trantab).replace("_", "").lower())
            for word in word_tokenize(ingredient)
            # The stop-word test runs on the token as tokenised (before
            # lower-casing), matching the vocabulary already produced.
            if word not in stop_words
        ]
        cleaned_ingredients.append(" ".join(tokens))
    new_id_ingredients_cuisine.append({
        'id': recipe_id,
        # Ingredients are stored in sorted order.
        'ingredients': sorted(cleaned_ingredients),
        'cuisine': recipe["cuisine"],
    })

# Largest number of ingredients in any single recipe (0 if there are no recipes).
max_ingredients = max(
    (len(entry['ingredients']) for entry in new_id_ingredients_cuisine),
    default=0,
)

###### Building the corpus for the model

In [64]:
# Corpus for the Kaggle + Nature data: one entry per recipe, holding that
# recipe's list of cleaned ingredient strings.
corpus = [entry["ingredients"] for entry in new_id_ingredients_cuisine]

In [65]:
# Persist the Kaggle + Nature corpus to disk as a pickle.
with open('kaggle&nature_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [78]:
def compute_recipe_vectors():
    """Average ingredient vectors per Kaggle + Nature recipe and pickle the results.

    Loads the word2vec model stored in 'word2vec_skipgram_kg_ng7.model' and
    walks the module-level ``new_id_ingredients_cuisine`` list. Each recipe is
    keyed by its position in that list and represented by the mean of the
    model vectors of its ingredients.

    Side effects:
        Writes 'kaggle&nature_Id_vectors.pkl', 'kaggle&nature_Id_ingredients.pkl'
        and 'kaggle&nature_Id_cuisine.pkl' in the current working directory.

    Returns:
        tuple: ``(id_vector, id_ingredients, id_cuisine)`` — three dicts keyed
        by recipe id, holding the mean vector, the ingredient list and the
        cuisine respectively.

    Note:
        Ingredient lookups are not guarded; an ingredient the model does not
        know propagates the lookup error to the caller.
        This notebook also contains a later definition of
        ``compute_recipe_vectors`` that returns only two dicts and reads a
        module-level ``model``; whichever cell ran last is the one in effect.
    """
    model = Word2Vec.load('word2vec_skipgram_kg_ng7.model')
    # Take the dimensionality from the loaded model instead of assuming 100,
    # so the function keeps working if the embedding size changes.
    vector_size = model.wv.vector_size
    id_vector = dict()
    id_ingredients = dict()
    id_cuisine = dict()
    for recipe_id, recipe in enumerate(new_id_ingredients_cuisine):
        ingredients = recipe['ingredients']
        total = np.zeros(vector_size)
        for ingredient in ingredients:
            total = np.add(total, model.wv[ingredient])
        # A recipe without ingredients keeps the zero vector; dividing by a
        # zero count would fill the vector with NaN.
        id_vector[recipe_id] = total / len(ingredients) if ingredients else total
        id_ingredients[recipe_id] = ingredients
        id_cuisine[recipe_id] = recipe['cuisine']
    with open('kaggle&nature_Id_vectors.pkl', 'wb') as f:
        pickle.dump(id_vector, f)
    with open('kaggle&nature_Id_ingredients.pkl', 'wb') as g:
        pickle.dump(id_ingredients, g)
    with open('kaggle&nature_Id_cuisine.pkl', 'wb') as h:
        pickle.dump(id_cuisine, h)
    return id_vector, id_ingredients, id_cuisine

#### CulinaryDB

In [87]:
# Build the culinaryDB corpus from the pickled recipe-id -> ingredients data.
import pickle
with open('culinaryDB_new_recipes.pkl', 'rb') as recipes_file:
    id_ingredients = pickle.load(recipes_file)
# One corpus entry per recipe id, in the container's iteration order.
corpus = [id_ingredients[key] for key in id_ingredients]
# Length of the longest entry (stays 0 when there are no recipes).
max_ingredients = max(map(len, corpus), default=0)

In [88]:
# Persist the culinaryDB corpus; the training cell reads this same file name.
with open('culinarDB_new_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [None]:
def compute_recipe_vectors_for_culinaryDB():
    """Map every culinaryDB recipe id to the mean of its ingredient vectors.

    Reads two module-level names: ``id_ingredients`` (recipe id -> list of
    ingredients) and ``model`` (a trained word2vec model). Both must be
    defined before this function is called.

    Returns:
        dict: recipe id -> averaged ingredient vector. A recipe with no
        ingredients maps to the zero vector.

    Note:
        Ingredient lookups are not guarded; an ingredient the model does not
        know propagates the lookup error to the caller.
        This notebook defines a function with this name in more than one cell;
        whichever cell ran last is the one in effect.
    """
    # Use the model's own dimensionality rather than a hard-coded 100.
    vector_size = model.wv.vector_size
    id_vector = dict()
    for recipe_id in id_ingredients:
        ingredients = id_ingredients[recipe_id]
        total = np.zeros(vector_size)
        for ingredient in ingredients:
            total = np.add(total, model.wv[ingredient])
        # Avoid 0/0 (NaN) for an empty ingredient list.
        id_vector[recipe_id] = total / len(ingredients) if ingredients else total
    return id_vector

#### KAGGLE2

In [27]:
# Load the kaggle2 dataset (RAW_recipes.csv) and pull out the columns used below.
# NOTE: `recipe_id` and `id_ingredients` are rebound here; other sections of
# this notebook use the same names for different objects.
data = pd.read_csv('RAW_recipes.csv')
recipe_id, recipe_names, ingredients = data['id'], data['name'], data['ingredients']
# Two-column frames pairing each recipe id with its name / its ingredients.
id_recipeName = pd.concat([recipe_id, recipe_names], axis=1)
id_ingredients = pd.concat([recipe_id, ingredients], axis=1)

In [28]:
# Collect the raw kaggle2 `ingredients` values into a corpus list.
corpus = list(ingredients)
# NOTE(review): len() is applied to each raw value. A later cell calls
# .split(",") on these same values, so they look like strings, in which case
# this is a character count rather than an ingredient count — confirm.
max_ingredients = max(map(len, corpus), default=0)

In [36]:
# Normalise the kaggle2 ingredients: split each recipe's value on commas, then
# tokenise, drop English stop words, strip punctuation, lower-case and
# lemmatise every piece.
stop_words = set(stopwords.words('english'))
# The backslash is written as "\\" because a bare "\," is an invalid escape
# sequence that newer Python versions warn about. The characters in the string
# (a backslash followed by a comma) are exactly the same as before.
intab = '''!()-[]{};:'"\\,<>?@#$%^&*_~'''
outtab = "_" * len(intab)
trantab = str.maketrans(intab, outtab)

lemmatizer = WordNetLemmatizer()

new_ingredients = []

for recipe in ingredients:
    cleaned_ingredients = []
    # The split result gets its own loop; the token list below uses a separate
    # name so the sequence being iterated is never rebound mid-loop.
    for ingredient in recipe.split(","):
        tokens = [
            # Punctuation is mapped to "_" and then every "_" is removed.
            lemmatizer.lemmatize(word.translate(trantab).replace("_", "").lower())
            for word in word_tokenize(ingredient)
            if word not in stop_words
        ]
        cleaned_ingredients.append(" ".join(tokens).strip())
    new_ingredients.append(cleaned_ingredients)

# Largest number of ingredients in any single recipe (0 if there are no recipes).
max_ingredients = max(map(len, new_ingredients), default=0)

In [44]:
def compute_recipe_vectors_for_kaggle2():
    """Map every kaggle2 recipe id to its mean ingredient vector and ingredient list.

    Reads three module-level names: ``new_ingredients`` (one list of cleaned
    ingredients per recipe), ``recipe_id`` (indexable by the same position to
    give that recipe's id) and ``model`` (a trained word2vec model).

    Returns:
        tuple: ``(id_vector, id_ingredients)`` — dicts keyed by recipe id
        holding the averaged vector and the ingredient list. A recipe with no
        ingredients maps to the zero vector.

    Note:
        Ingredient lookups are not guarded; an ingredient the model does not
        know propagates the lookup error to the caller.
    """
    # Use the model's own dimensionality rather than a hard-coded 100.
    vector_size = model.wv.vector_size
    id_vector = dict()
    id_ingredients = dict()
    for i, recipe_ingredients in enumerate(new_ingredients):
        total = np.zeros(vector_size)
        for ingredient in recipe_ingredients:
            total = np.add(total, model.wv[ingredient])
        key = recipe_id[i]
        # Avoid 0/0 (NaN) for an empty ingredient list.
        id_vector[key] = total / len(recipe_ingredients) if recipe_ingredients else total
        id_ingredients[key] = recipe_ingredients
    return id_vector, id_ingredients

In [52]:
# Recipe id -> recipe name, pairing the two columns position by position.
id_names = {recipe_id[row]: recipe_names[row] for row in range(len(recipe_id))}

In [55]:
# Persist the kaggle2 lookups (vectors, ingredients, names) as pickles.
# NOTE(review): `id_vector` is never assigned at top level in the visible
# notebook; presumably it and `id_ingredients` come from calling
# compute_recipe_vectors_for_kaggle2() — confirm before a fresh run.
import pickle
with open('kaggle2_Id_vectors.pkl', 'wb') as vectors_file:
    pickle.dump(id_vector, vectors_file)
with open('kaggle2_Id_ingredients.pkl', 'wb') as ingredients_file:
    pickle.dump(id_ingredients, ingredients_file)
with open('kaggle2_Id_names.pkl', 'wb') as names_file:
    pickle.dump(id_names, names_file)

### MODEL

In [None]:

# Training word2vec model
# The corpus is read back from the pickle written by the culinaryDB section.
with open('culinarDB_new_corpus.pkl','rb') as f:
    corpus = pickle.load(f)
# corpus = new_ingredients
embedding_size = 100
no_of_workers = 2 # better to have as many workers as the number of cores on the machine
# Derive the window from the corpus actually being trained on. Relying on the
# `max_ingredients` global would pick up whatever dataset cell ran last, which
# need not be the one this corpus came from.
window_size = max(map(len, corpus), default=0)
sg_ = 1 # 0 for CBOW and 1 for skip-gram
no_of_negative_samples = 7

# NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
model = Word2Vec(corpus, min_count = 1, size = embedding_size, workers = no_of_workers, window = window_size, sg = sg_,negative = no_of_negative_samples)

model.save("word2vec_cl_new_ng7.model")
# model.wv.save_word2vec_format('word2vec_skipgram_kg_ng7.txt', binary=False)


In [14]:
def compute_recipe_vectors_for_culinaryDB():
    """Return a dict mapping each culinaryDB recipe id to its mean ingredient vector.

    Depends on the module-level ``id_ingredients`` (recipe id -> ingredient
    list) and ``model`` (trained word2vec model) being defined beforehand.

    Returns:
        dict: recipe id -> averaged ingredient vector; the zero vector for a
        recipe that has no ingredients.

    Note:
        Ingredient lookups are not guarded; an ingredient the model does not
        know propagates the lookup error to the caller.
    """
    # The vector length follows the model instead of being fixed at 100.
    vector_size = model.wv.vector_size
    id_vector = dict()
    for recipe_id in id_ingredients:
        ingredients = id_ingredients[recipe_id]
        total = np.zeros(vector_size)
        for ingredient in ingredients:
            total = np.add(total, model.wv[ingredient])
        # Dividing by a zero count would produce NaN, so keep the zero vector.
        id_vector[recipe_id] = total / len(ingredients) if ingredients else total
    return id_vector

In [156]:
def compute_recipe_vectors():
    """Average ingredient vectors for every Kaggle + Nature recipe.

    Reads the module-level ``new_id_ingredients_cuisine`` list and the
    module-level ``model`` (trained word2vec model). Each recipe is keyed by
    its position in the list.

    Returns:
        tuple: ``(id_vector, id_ingredients)`` — dicts keyed by recipe id
        holding the averaged vector and the ingredient list. A recipe with no
        ingredients maps to the zero vector.

    Note:
        Ingredient lookups are not guarded; an ingredient the model does not
        know propagates the lookup error to the caller.
        An earlier cell of this notebook defines ``compute_recipe_vectors``
        with a three-dict return value and pickle output; whichever cell ran
        last is the one in effect.
    """
    # The vector length follows the model instead of being fixed at 100.
    vector_size = model.wv.vector_size
    id_vector = dict()
    id_ingredients = dict()
    for recipe_id, recipe in enumerate(new_id_ingredients_cuisine):
        ingredients = recipe['ingredients']
        total = np.zeros(vector_size)
        for ingredient in ingredients:
            total = np.add(total, model.wv[ingredient])
        # Dividing by a zero count would produce NaN, so keep the zero vector.
        id_vector[recipe_id] = total / len(ingredients) if ingredients else total
        id_ingredients[recipe_id] = ingredients
    return id_vector, id_ingredients

In [72]:
# Path of the model used for suggestions, and a cache so the model is read
# from disk once per kernel session instead of on every call.
_SUGGESTION_MODEL_PATH = 'word2vec_skipgram_kg_ng7.model'
_suggestion_models = {}


def suggest_ingredients(current_ingredients, topn=10):
    """Suggest ingredients that fit with the ones already chosen.

    Args:
        current_ingredients: list of ingredient strings used as the context
            passed to the model's ``predict_output_word``.
        topn: number of suggestions requested from the model (defaults to 10).

    Returns:
        Whatever ``predict_output_word`` returns for the given context.

    Note:
        The model is loaded on first use and then reused. Re-run this cell to
        pick up a model file that has changed on disk.
    """
    model = _suggestion_models.get(_SUGGESTION_MODEL_PATH)
    if model is None:
        model = Word2Vec.load(_SUGGESTION_MODEL_PATH)
        _suggestion_models[_SUGGESTION_MODEL_PATH] = model
    return model.predict_output_word(current_ingredients, topn=topn)