<a href="https://colab.research.google.com/github/lampealex888/capstone-foodsocial/blob/main/recipe_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Download Dependencies and Load Libraries

In [None]:
# Install the `unidecode` package, which the import cell below relies on.
!pip install unidecode

In [None]:
import json
import pandas as pd
import nltk
import string
import ast
import re
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [None]:
# Download the NLTK data used below: 'wordnet' backs WordNetLemmatizer.
# NOTE(review): no use of 'punkt' is visible in this part of the notebook -
# confirm it is still needed.
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Mount Google Drive at /content/drive; the dataset cell reads its JSON files
# from /content/drive/MyDrive. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Dataset

In [None]:
# Read the raw recipe and ingredient tables from Google Drive.
with open('/content/drive/MyDrive/cs1980/recipes.json', 'r') as f:
    recipes = json.load(f)
with open('/content/drive/MyDrive/cs1980/ingredients.json', 'r') as f:
    ingredients = json.load(f)

# Lookup table: ingredient id (as a string) -> ingredient name.
ingredient_names = {
    str(ingredient_key): entry['ingredient_name']
    for ingredient_key, entry in ingredients.items()
}

# Swap each recipe's ingredient ids for their names. An id with no entry in
# the lookup table is kept unchanged.
for recipe in recipes.values():
    recipe['ingredients'] = [
        ingredient_names.get(str(ingredient_id), ingredient_id)
        for ingredient_id in recipe['ingredients']
    ]

# Write the name-resolved recipes to a local `recipes.json` in the working
# directory (the files on Drive are not modified).
with open('recipes.json', 'w') as f:
    json.dump(recipes, f, indent=4)

In [None]:
# Build the working DataFrame from the local `recipes.json` in one chain:
#   1. read the JSON with its top-level keys as the row index,
#   2. drop the columns the recommender does not use,
#   3. discard rows that have any missing value,
#   4. move the index into a regular column named `recipe_id`.
recipe_df = (
    pd.read_json('recipes.json', orient='index')
    .drop(columns=['author_id', 'date_published', 'picture_path'])
    .dropna()
    .reset_index()
    .rename(columns={'index': 'recipe_id'})
)

In [None]:
# Preview the first rows of the cleaned recipe table.
recipe_df.head()

Unnamed: 0,recipe_id,title,instructions,ingredients,recipe_link
0,112565,Mixed Berry Streusel Bars,Mix the cornstarch with the warm water until t...,"[Cornstarch, Water, Strawberries, Maple Syrup,...",https://devpitt2.foodsocial.io/recipe/mixed-be...
1,112553,The Best Vegetarian Chili,Heat the olive oil over medium-high heat in a ...,"[Extra Virgin Olive Oil, Yellow Onion, Green B...",https://devpitt2.foodsocial.io/recipe/the-best...
2,112563,My Favorite Carrot Cake,Preheat the oven to 350˚F. Grease three 9-inch...,"[Section Ingredient, All Purpose Flour, Baking...",https://devpitt2.foodsocial.io/recipe/my-favor...
3,112575,Extra Crispy Baked Chicken Wings,Preheat the oven to 425˚F. Line a large baking...,"[Section Ingredient, Chicken Wings, Salt, Blac...",https://devpitt2.foodsocial.io/recipe/extra-cr...
4,112567,Quick & Easy Banana Muffins,Preheat the oven to 425˚F. Spray a 12-count mu...,"[All Purpose Flour, Baking Powder, Baking Soda...",https://devpitt2.foodsocial.io/recipe/quick-ea...


# Backup Model

In [None]:
# cv = CountVectorizer()
# count_matrix = cv.fit_transform(recipes_df['ingredients'])

In [None]:
# cosine_sim = cosine_similarity(count_matrix)

In [None]:
# def recommend_recipe(recipe_name):
#     recipe_index = recipes_df[recipes_df['title']==recipe_name].index[0]
#     similar_recipes = list(enumerate(cosine_sim[recipe_index]))
#     sorted_recipes = sorted(similar_recipes,key=lambda x:x[1],reverse=True)[1:]
#     recommended_recipes = []
#     for i in range(10):
#         recommended_recipes.append(recipes_df.iloc[sorted_recipes[i][0]]['title'])
#     return recommended_recipes

In [None]:
# recommend_recipe('Spaghetti and Meatballs')

# 3. Preprocessing and Parsing of Ingredients

In [None]:
# Count how often each lowercase word occurs across all ingredient names, then
# collect the most frequent words so they can be removed during parsing.
vocabulary = nltk.FreqDist()
lemmatizer = WordNetLemmatizer()

# A dedicated loop variable is used so the module-level `ingredients` dict
# loaded from ingredients.json is not overwritten by this cell.
for ingredient_list in recipe_df['ingredients']:
    # str() guards against ingredient ids that could not be resolved to a name
    # when the dataset was loaded and may therefore not be strings.
    words = ' '.join(str(name) for name in ingredient_list).lower().split()
    vocabulary.update(words)

# Words occurring at least this many times are treated as too generic to help
# tell recipes apart. Modify the frequency threshold as needed.
threshold = 1000

# Lemmatized so they can be compared against the lemmatized tokens produced by
# the parsing step.
common_ingredients = [
    lemmatizer.lemmatize(word)
    for word, frequency in vocabulary.items()
    if frequency >= threshold
]

print("Common Ingredients to Remove:")
print(common_ingredients)

Common Ingredients to Remove:
['ground', 'coconut', 'oil', 'vanilla', 'extract', 'sugar', 'onion', 'pepper', 'garlic', 'salt', 'black', 'powder', 'section', 'ingredient', 'flour', 'sea']


In [None]:
# Measurement words (already lemmatized). Tokens are compared against this list
# after being lowercased, stripped of punctuation and split into single words,
# so the upper-case, punctuated and multi-word entries can never match; the
# list is left unchanged in case other cells read `measures`.
measures = ['teaspoon', 't', 'tsp.', 'tablespoon', 'T', 'tbl.', 'tb', 'tbsp.', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter', 'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo']
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)
# Built once here instead of once per recipe; neither object changes in the loop.
lemmatizer = WordNetLemmatizer()
# Every word that must be dropped from a parsed ingredient, as a set for
# constant-time membership tests.
_removable_words = set(measures) | set(common_ingredients)


def _parse_ingredient_name(name):
    """Normalise one ingredient name into a space-joined string of kept words.

    Words are split on spaces and hyphens, stripped of punctuation, restricted
    to purely alphabetic tokens, lowercased, de-accented and lemmatized;
    measurement words and common ingredient words are then removed.

    :param name: a single ingredient name (non-strings are converted with str())
    :return: the cleaned name, or an empty string if no word survives
    """
    kept = []
    # Split BEFORE stripping punctuation: '-' is part of string.punctuation, so
    # stripping first would fuse hyphenated words instead of separating them.
    for piece in re.split(' |-', str(name)):
        piece = piece.translate(translator)
        # Get rid of words containing non alphabet letters (and empty pieces)
        if not piece.isalpha():
            continue
        # Lowercase, then remove accents
        piece = unidecode.unidecode(piece.lower())
        # Lemmatize so the word can be compared to the measuring/common words
        piece = lemmatizer.lemmatize(piece)
        if piece not in _removable_words:
            kept.append(piece)
    return ' '.join(kept)


parsed_ingredients = []
for raw_ingredients in recipe_df['ingredients']:
    # A row holds either a real list or the string representation of one.
    if not isinstance(raw_ingredients, list):
        raw_ingredients = ast.literal_eval(raw_ingredients)
    cleaned_names = (_parse_ingredient_name(name) for name in raw_ingredients)
    # Ingredients whose words were all filtered out are omitted entirely.
    parsed_ingredients.append([cleaned for cleaned in cleaned_names if cleaned])
recipe_df['parsed'] = parsed_ingredients

In [None]:
# Preview the table again, now including the new `parsed` column.
recipe_df.head()

Unnamed: 0,recipe_id,title,instructions,ingredients,recipe_link,parsed
0,112565,Mixed Berry Streusel Bars,Mix the cornstarch with the warm water until t...,"[Cornstarch, Water, Strawberries, Maple Syrup,...",https://devpitt2.foodsocial.io/recipe/mixed-be...,"[cornstarch, water, strawberry, maple syrup pu..."
1,112553,The Best Vegetarian Chili,Heat the olive oil over medium-high heat in a ...,"[Extra Virgin Olive Oil, Yellow Onion, Green B...",https://devpitt2.foodsocial.io/recipe/the-best...,"[extra virgin olive, yellow, green bell, red b..."
2,112563,My Favorite Carrot Cake,Preheat the oven to 350˚F. Grease three 9-inch...,"[Section Ingredient, All Purpose Flour, Baking...",https://devpitt2.foodsocial.io/recipe/my-favor...,"[all purpose, baking, baking soda, cinnamon, g..."
3,112575,Extra Crispy Baked Chicken Wings,Preheat the oven to 425˚F. Line a large baking...,"[Section Ingredient, Chicken Wings, Salt, Blac...",https://devpitt2.foodsocial.io/recipe/extra-cr...,"[chicken wing, white, smoked paprika, rosemary..."
4,112567,Quick & Easy Banana Muffins,Preheat the oven to 425˚F. Spray a 12-count mu...,"[All Purpose Flour, Baking Powder, Baking Soda...",https://devpitt2.foodsocial.io/recipe/quick-ea...,"[all purpose, baking, baking soda, cinnamon, b..."


# 4. Word Embeddings Using Word2Vec

In [None]:
# get corpus with the documents sorted in alphabetical order
def get_and_sort_corpus(data):
    """Return the parsed ingredient lists of ``data``, each sorted alphabetically.

    Every document is copied with ``sorted`` rather than sorted in place, so
    the lists stored in ``data.parsed`` are left untouched.

    :param data: object exposing a ``parsed`` column whose values are lists of
        strings (e.g. the recipe DataFrame)
    :return: list of new, sorted lists, in the same row order as ``data``
    """
    return [sorted(doc) for doc in data.parsed.values]

In [None]:
# calculate average length of each document
def get_window(corpus):
    """Return the average document length of ``corpus`` rounded to an integer.

    Rounding uses the built-in ``round``, which rounds exact halves to the
    nearest even integer. An empty corpus raises ``ZeroDivisionError``.

    :param corpus: iterable of documents, each supporting ``len()``
    :return: rounded mean number of items per document
    """
    doc_count = 0
    token_total = 0
    for doc in corpus:
        doc_count += 1
        token_total += len(doc)
    return round(float(token_total) / doc_count)

In [None]:
# Build the training corpus: one alphabetically sorted list of parsed
# ingredients per recipe.
corpus = get_and_sort_corpus(recipe_df)
print(f"Length of corpus: {len(corpus)}")

# Train a CBOW (sg=0) Word2Vec model on the corpus and save it to disk. The
# context window is set to the average recipe length, and min_count=1 keeps
# every ingredient in the vocabulary.
model_cbow = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=get_window(corpus),
    min_count=1,
    workers=8,
    sg=0,
)
model_cbow.save('model_cbow.bin')
print("Word2Vec model successfully trained")

Length of corpus: 4002
Word2Vec model successfully trained


# 5. Document Embeddings

In [None]:
class MeanEmbeddingVectorizer(object):
    """Embed a document as the unweighted mean of its tokens' word vectors."""

    def __init__(self, model_cbow):
        """
        :param model_cbow: trained model exposing its vectors through ``.wv``
        """
        self.model_cbow = model_cbow
        self.vector_size = model_cbow.wv.vector_size

    def fit(self, docs=None):
        """
        No-op; nothing is learned for a plain mean.

        :param docs: ignored; accepted so the call matches
            ``TfidfEmbeddingVectorizer.fit(docs)``
        :return: self
        """
        return self

    def transform(self, docs):
        """
        Embed every document in ``docs``.

        :param docs: iterable of documents, each an iterable of tokens
        :return: array with one row per document and ``vector_size`` columns
        """
        return self.doc_average_list(docs)

    def doc_average(self, doc):
        """
        Compute the mean of the embeddings of the known tokens in ``doc``.

        :param doc: iterable of tokens
        :return: vector of length ``vector_size``; all zeros when no token of
            ``doc`` is in the model vocabulary
        """
        wv = self.model_cbow.wv
        # key_to_index is a dict, so membership is a hash lookup instead of a
        # scan over the index_to_key list.
        vectors = [wv.get_vector(word) for word in doc if word in wv.key_to_index]
        if not vectors:
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def doc_average_list(self, docs):
        """
        Stack the per-document averages into a single 2-D array.

        :param docs: iterable of documents
        :return: array of shape (number of documents, vector_size)
        """
        averages = [self.doc_average(doc) for doc in docs]
        if not averages:
            # np.vstack rejects an empty sequence; return an empty matrix.
            return np.zeros((0, self.vector_size))
        return np.vstack(averages)

# 6. Using TF-IDF to Aggregate Embeddings

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Embed a document as the mean of its tokens' word vectors, each scaled by the token's IDF."""

    def __init__(self, model_cbow):
        """
        :param model_cbow: trained model exposing its vectors through ``.wv``
        """
        self.model_cbow = model_cbow
        # Mapping token -> idf weight; populated by fit().
        self.word_idf_weight = None
        self.vector_size = model_cbow.wv.vector_size

    def fit(self, docs):
        """
        Build a tfidf model to compute each token's idf as its weight.

        :param docs: iterable of documents, each a list of tokens
        :return: self
        """
        # The documents are already tokenised, so an identity analyzer is used.
        # This makes the tf-idf vocabulary consist of exactly the tokens that
        # doc_average() later looks up. The default analyzer would split a
        # multi-word token into single words, none of which match the lookup.
        tfidf = TfidfVectorizer(analyzer=lambda doc: doc)
        tfidf.fit(docs)
        # if a word was never seen it is given idf of the max of known idf value
        max_idf = max(tfidf.idf_)
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()],
        )
        return self

    def transform(self, docs):
        """
        Embed every document in ``docs``; fit() must have been called first.

        :param docs: iterable of documents, each an iterable of tokens
        :return: array with one row per document and ``vector_size`` columns
        """
        return self.doc_average_list(docs)

    def doc_average(self, doc):
        """
        Compute the mean of the idf-scaled embeddings of the known tokens in ``doc``.

        The scaled vectors are divided by the number of known tokens, not by
        the sum of the weights.

        :param doc: iterable of tokens
        :return: vector of length ``vector_size``; all zeros when no token of
            ``doc`` is in the model vocabulary
        """
        wv = self.model_cbow.wv
        # key_to_index is a dict, so membership is a hash lookup instead of a
        # scan over the index_to_key list.
        weighted = [
            wv.get_vector(word) * self.word_idf_weight[word]
            for word in doc
            if word in wv.key_to_index
        ]
        if not weighted:
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    def doc_average_list(self, docs):
        """
        Stack the per-document averages into a single 2-D array.

        :param docs: iterable of documents
        :return: array of shape (number of documents, vector_size)
        """
        averages = [self.doc_average(doc) for doc in docs]
        if not averages:
            # np.vstack rejects an empty sequence; return an empty matrix.
            return np.zeros((0, self.vector_size))
        return np.vstack(averages)

# 7. Recommendation System

In [None]:
def get_recommendations(N, scores):
    """
    Rank scores and output a pandas data frame containing all the details of the top N recipes.

    NOTE(review): ``title_parser`` and ``ingredient_parser_final`` are not
    defined in the visible part of this notebook - confirm they exist before
    this function is called on a fresh kernel.

    :param N: maximum number of recipes to return
    :param scores: list of cosine similarities, one per row of ``recipe_df``
        in row order
    :return: DataFrame with columns "recipe", "ingredients", "score" (the
        similarity formatted as a string) and "url", best match first
    """
    # positions of the N highest scores, best first
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]

    # `scores` is positional, so rows are fetched with .iloc; this stays
    # correct even if recipe_df's index is not 0..n-1.
    rows = [
        {
            "recipe": title_parser(recipe_df["title"].iloc[i]),
            "ingredients": ingredient_parser_final(recipe_df["ingredients"].iloc[i]),
            "score": f"{scores[i]}",
            "url": recipe_df["recipe_link"].iloc[i],
        }
        for i in top
    ]
    # Build the frame in one step instead of growing it cell by cell;
    # dtype=object keeps the column dtype of the cell-by-cell version.
    return pd.DataFrame(
        rows, columns=["recipe", "ingredients", "score", "url"], dtype=object
    )

def get_recs(ingredients, N=5, mean=False):
    """
    Get the top N recipe recommendations.

    The saved Word2Vec model is loaded from "model_cbow.bin" and every recipe
    in the module-level ``corpus`` is embedded on each call.

    NOTE(review): ``ingredient_parser`` is not defined in the visible part of
    this notebook - confirm it exists before this function is called on a
    fresh kernel.

    :param ingredients: comma separated string listing ingredients
    :param N: number of recommendations
    :param mean: False if using tfidf weighted embeddings, True if using simple mean
    :return: DataFrame produced by ``get_recommendations``
    """
    # load in word2vec model
    model = Word2Vec.load("model_cbow.bin")
    # Normalize embeddings to unit length. This is what the deprecated
    # model.init_sims(replace=True) did, without the deprecation warning.
    model.wv.unit_normalize_all()
    print("Successfully loaded model")

    if mean:
        # plain average of the word embeddings of each document
        vectorizer = MeanEmbeddingVectorizer(model)
    else:
        # use TF-IDF as weights for each word embedding
        vectorizer = TfidfEmbeddingVectorizer(model)
        vectorizer.fit(corpus)

    # one embedding row per recipe
    doc_vec = vectorizer.transform(corpus)
    assert len(doc_vec) == len(corpus)

    # split the query on commas, parse it like the recipes, and embed it
    query_tokens = ingredient_parser(ingredients.split(","))
    input_embedding = vectorizer.transform([query_tokens])[0].reshape(1, -1)

    # cosine similarity between the query and all recipes in a single call;
    # the result has shape (1, number of recipes)
    scores = list(cosine_similarity(input_embedding, doc_vec)[0])

    # Filter top N recommendations
    return get_recommendations(N, scores)

if __name__ == "__main__":
    # Smoke test with a small sample query. The variable is deliberately not
    # called `input`, which would replace the built-in input() for the rest of
    # the kernel session.
    query = "Salt, Yeast, Breadcrumbs"
    rec = get_recs(query)
    print(rec)

  model.init_sims(replace=True)


Successfully loaded model
                                              recipe  \
0                          Easy Vegan Mac and Cheese   
1         Whole30 Broccoli & Bacon Open-Faced Omelet   
2            Lemon Pepper Chicken Pasta with Burrata   
3  Comforting Paleo Meatloaf topped with a Chipot...   
4                  Roasted Red Pepper and Walnut Dip   

                                         ingredients               score  \
0  Pasta,Potatoes, White,Cashews, Raw,Almond Milk...  0.9984462585363983   
1  Bacon,Tomato,Egg Whites,Eggs,Salt,Black Pepper...  0.9983660039337334   
2  Section Ingredient,Pasta,Extra Virgin Olive Oi...  0.9983633595043416   
3  Section Ingredient,Ground Beef,Ground Pork,Egg...  0.9983612233665506   
4  Red Bell Pepper,Molasses,Chili Flakes,Breadcru...  0.9983579798499652   

                                                 url  
0  https://devpitt2.foodsocial.io/recipe/easy-veg...  
1  https://devpitt2.foodsocial.io/recipe/whole30-...  
2  https://devp