# Content-Based Recommendation

In [9]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [10]:
# Load the recipe table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so "food.pkl" should only
# come from a trusted source.
food_df =pd.read_pickle("food.pkl")
# Preview the first rows to confirm the load worked.
food_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,submitted_month,submitted_year,dairy-free,gluten-free,low-carb,vegan,vegetarian,recipe_id,average_rating,votes
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",...,Sep,2005,0,0,0,0,1,137739,5.0,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",...,Jun,2002,0,0,0,0,0,31490,4.666667,3
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",...,Feb,2005,0,0,0,0,0,112140,4.0,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",...,Apr,2003,0,0,0,0,0,59389,4.5,2
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",...,Oct,2002,0,0,0,0,1,44061,5.0,1


In [11]:
# import libraries
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kelly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Create a function for tokenizer

stemmer = nltk.stem.PorterStemmer()
ENGLISH_STOP_WORDS = stopwords.words('english')

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Same words as ENGLISH_STOP_WORDS, stored as a set so each membership test is
# O(1) instead of a scan over the whole list.  The list itself is kept
# unchanged for any other code that uses it.
_STOP_WORD_SET = frozenset(ENGLISH_STOP_WORDS)


def recipe_tokenizer(sentence):
    """Turn a piece of text into a list of stemmed, lower-cased tokens.

    Steps: delete ASCII punctuation, lower-case, split on whitespace, drop
    English stop words, and Porter-stem what is left.

    Punctuation is stripped before the stop-word check, so stop-list entries
    that contain an apostrophe can only match in their stripped form.

    Parameters
    ----------
    sentence : str
        Text to tokenize.  Any non-string value (for example a missing cell
        such as NaN or None) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if nothing survives.
    """
    if not isinstance(sentence, str):
        return []

    # One translate + one lower instead of a replace/lower pass per punctuation mark.
    cleaned = sentence.translate(_PUNCTUATION_TABLE).lower()

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings.
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in _STOP_WORD_SET
    ]

In [13]:
# Import libraries
import gensim
from gensim.models import Word2Vec
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Function for word embedding using Word2Vec
def word_embedding(food_df, column, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a Word2Vec model on one text column and return its word vectors.

    Parameters
    ----------
    food_df : pandas.DataFrame
        Frame containing the text column.
    column : str
        Name of the column to tokenize with ``recipe_tokenizer`` and train on.
    vector_size, window, min_count, workers : int, optional
        Passed straight to ``gensim.models.Word2Vec``.  The defaults are the
        values this function previously hard-coded.
    seed : int, optional
        Seed passed to Word2Vec.  The default matches gensim's own default, so
        results are unchanged.  Per the gensim documentation, a fully
        repeatable run additionally needs ``workers=1`` and a fixed
        ``PYTHONHASHSEED``.

    Returns
    -------
    dict
        Maps every word in the trained vocabulary to its embedding vector of
        length ``vector_size``.
    """
    # Tokenize once and hand Word2Vec a plain list of token lists, which it can
    # iterate over as many times as training requires.
    tokenized_data = food_df[column].apply(recipe_tokenizer).tolist()

    model = Word2Vec(
        sentences=tokenized_data,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

    # Expose the learned vectors as a plain word -> vector dictionary.
    return {word: model.wv[word] for word in model.wv.index_to_key}

In [15]:
# Function for finding recipes
def _build_recipe_features(food_df):
    """Build one feature row per recipe: TF-IDF of the text columns plus the mean ingredient word vector."""
    # Imported locally so this function does not depend on edits to the import cells.
    from scipy import sparse

    # Word2Vec vectors learned from the 'ingredients' column.
    embeddings = word_embedding(food_df, 'ingredients')

    # Width of the ingredient block, read from the vectors themselves so it
    # always matches whatever size word_embedding produced.
    first_vector = next(iter(embeddings.values()), None)
    vector_size = 0 if first_vector is None else len(first_vector)

    # Join the free-text columns into a local Series; the caller's frame is
    # left untouched (no 'text_data' column is added to it).
    text_data = (
        food_df['name'].astype(str) + ' '
        + food_df['tags'].astype(str) + ' '
        + food_df['description'].astype(str)
    ).str.lower()

    # TF-IDF over the joined text, using the shared tokenizer.
    vectorizer = TfidfVectorizer(min_df=5,
                                 tokenizer=recipe_tokenizer)
    tfidf_matrix = vectorizer.fit_transform(text_data)

    # Mean embedding of each recipe's ingredient tokens; a recipe with no
    # known tokens keeps an all-zero row.
    ingredient_matrix = np.zeros((len(food_df), vector_size))
    for row, ingredients in enumerate(food_df['ingredients']):
        vectors = [embeddings[word] for word in recipe_tokenizer(ingredients) if word in embeddings]
        if vectors:
            ingredient_matrix[row] = np.mean(vectors, axis=0)

    # Keep the TF-IDF part sparse instead of expanding it to a dense array.
    return sparse.hstack([tfidf_matrix, sparse.csr_matrix(ingredient_matrix)], format='csr')


def find_similar_recipes(food_df, recipe_index, num_similar=5, features=None):
    """Return the names of the recipes most similar to one recipe.

    Similarity is the cosine similarity between combined feature rows: TF-IDF
    of 'name' + 'tags' + 'description', concatenated with the mean Word2Vec
    vector of the 'ingredients' tokens.

    Parameters
    ----------
    food_df : pandas.DataFrame
        Must contain the columns 'name', 'tags', 'description' and
        'ingredients'.  The frame is not modified.
    recipe_index : int
        Position (0-based row number, not index label) of the query recipe in
        ``food_df``.  Negative positions count from the end.
    num_similar : int, optional
        Maximum number of recipes to return.
    features : matrix, optional
        Precomputed feature matrix with one row per row of ``food_df``, as
        returned by ``_build_recipe_features(food_df)``.  Building the
        features refits TF-IDF and retrains Word2Vec, so pass this when
        querying the same frame repeatedly.  Built on demand when omitted.

    Returns
    -------
    list of str
        Names of up to ``num_similar`` other recipes, most similar first.

    Raises
    ------
    IndexError
        If ``recipe_index`` is outside the frame.
    ValueError
        If ``features`` does not have one row per recipe.
    """
    # range() indexing validates the position up front and resolves negative
    # values to the matching non-negative row number.
    position = range(len(food_df))[recipe_index]

    if features is None:
        features = _build_recipe_features(food_df)
    elif features.shape[0] != len(food_df):
        raise ValueError("features must have exactly one row per row of food_df")

    # Only the query row is compared against all recipes; the full N x N
    # similarity matrix is never built.
    scores = cosine_similarity(features[position:position + 1], features).ravel()

    # Rank by descending similarity, then remove the query row explicitly.
    # Relying on it being ranked first is unsafe when another recipe ties with it.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != position][:max(num_similar, 0)]

    # The ranks are row positions, so look names up by position (iloc); a
    # label lookup breaks on any frame whose index is not 0..n-1.
    return food_df['name'].iloc[ranked].tolist()


In [16]:
# Run find_similar_recipes for every recipe, one batch at a time, printing each recipe's name and its matches
def batch_processing(data_df, batch_size, num_similar=5):
    """Print the most similar recipes for every recipe, one batch at a time.

    The frame is cut into consecutive batches of ``batch_size`` rows.  Each
    recipe is compared only against the other recipes in its own batch.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Recipe table; must contain a 'name' column plus whatever columns
        ``find_similar_recipes`` needs.  The frame is not modified.
    batch_size : int
        Number of rows per batch; must be positive.
    num_similar : int, optional
        Number of similar recipes to print per recipe.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    num_rows = data_df.shape[0]

    # Stepping through the start offsets yields exactly ceil(num_rows / batch_size)
    # batches, so no empty trailing batch appears when the row count is an
    # exact multiple of batch_size.
    for batch_number, start_idx in enumerate(range(0, num_rows, batch_size), start=1):
        # reset_index gives the batch its own 0..n-1 labels and makes it an
        # independent frame.  That keeps positions and labels in agreement for
        # find_similar_recipes and keeps any writes out of data_df.
        batch_data = data_df.iloc[start_idx:start_idx + batch_size].reset_index(drop=True)

        for recipe_index, recipe_name in enumerate(batch_data['name']):
            similar_recipes = find_similar_recipes(batch_data, recipe_index, num_similar=num_similar)
            print(f"Batch {batch_number}, Recipe Name: {recipe_name}, Similar Recipes:")
            for rank, recipe in enumerate(similar_recipes, start=1):
                print(f"{rank}. {recipe}")


In [17]:
# Test

# Define the batch size
batch_size = 5000

# Call the batch processing function
# NOTE(review): batch_processing calls find_similar_recipes once per recipe, and
# each of those calls refits TF-IDF and retrains Word2Vec on the whole batch, so
# this cell is very slow; the recorded run below ends in a KeyboardInterrupt.
batch_processing(food_df, batch_size)

Batch 1, Recipe Name: arriba   baked winter squash mexican style, Similar Recipes:
1. granny squash   creamed squash
2. acorn squash roasted with applesauce
3. acorn squash stuffed with lamb   curry
4. almost kfc coleslaw
5. acorn or butternut squash risotto
Batch 1, Recipe Name: a bit different  breakfast pizza, Similar Recipes:
1. a different  pizza
2. alfredo deep dish pizza
3. an aussie vegemite pizza
4. alabama egg and sausage souffle
5. all in one breakfast casserole
Batch 1, Recipe Name: all in the kitchen  chili, Similar Recipes:
1. 2 bean chili
2. the works  crock pot chili
3. a beef chili
4. 3 bean veggie chili
5. amarillo chili
Batch 1, Recipe Name: alouette  potatoes, Similar Recipes:
1. american potato salad with hard boiled eggs and sweet pickles
2. adobe vegetable skillet
3. potatoes with garlic and cheese  pommes de terre a l ail
4. accordion potatoes
5. aloo curry  potato curry
Batch 1, Recipe Name: amish  tomato ketchup  for canning, Similar Recipes:
1. african dried 

KeyboardInterrupt: 