# Content-Based Recommendation

In [345]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [346]:
# Read the pickled recipe table from the working directory.
# NOTE: unpickling can execute code embedded in the file, so only load a
# "food.pkl" that comes from a trusted source.
data = pd.read_pickle("food.pkl")

# (rows, columns) of the full dataset
data.shape

(212255, 28)

In [347]:
# Draw a random 25% of the rows; the fixed random_state makes the same rows
# come back on every run, so downstream results are comparable between runs.
sampled_data = data.sample(frac=0.25, random_state=42)

# Preview the first rows of the sample (rich notebook rendering)
display(sampled_data.head())

# Report the sample's (rows, columns) so the reduction is visible
print("Shape of Sampled Dataset:", sampled_data.shape)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,submitted_month,submitted_year,dairy-free,gluten-free,low-carb,vegan,vegetarian,recipe_id,average_rating,votes
168477,sesame dipping sauce,353233,10,406741,2009-01-31,"['lactose', '15-minutes-or-less', 'time-to-mak...",2,"['in a small bowl , whisk together soy sauce ,...",this works well for dipping pot stickers in as...,"['soy sauce', 'rice vinegar', 'sesame oil', 'a...",...,Jan,2009,0,0,0,1,1,353233,5.0,2
201337,unattended rib roast,97973,112,152393,2004-08-17,"['time-to-make', 'main-ingredient', 'preparati...",6,"['at noon , preheat oven to 375 degrees', 'sea...",i found this recipe in the paper and thought i...,"['standing rib roast', 'salt and pepper']",...,Aug,2004,0,0,1,0,0,97973,5.0,1
173114,slow cooker mashed potatoes,265017,135,290107,2007-11-12,"['time-to-make', 'course', 'main-ingredient', ...",6,"['in a mixing bowl , combine cream cheese , so...",this recipe is from taste of home magazine. it...,"['cream cheese', 'sour cream', 'butter', 'drie...",...,Nov,2007,0,0,0,0,0,265017,5.0,2
29931,butterscotch toffee cookies,288112,25,668077,2008-02-23,"['30-minutes-or-less', 'time-to-make', 'course...",8,['mix shortening and both sugars with electric...,"in preparation for a bake sale, i changed a re...","['shortening', 'sugar', 'brown sugar', 'egg', ...",...,Feb,2008,0,0,0,0,0,288112,4.0,1
9300,au gratin hash browns casserole,94740,70,126418,2004-06-30,"['time-to-make', 'course', 'preparation', 'cas...",6,"['oven@ 350', 'in a large bowl combine first 5...","i think i found this recipe in my ""goody bag"" ...","['cream of chicken soup', 'sour cream', 'marga...",...,Jun,2004,0,0,0,0,0,94740,5.0,1


Shape of Sampled Dataset: (53064, 28)


In [348]:
# Text-processing dependencies: `string` supplies the punctuation list and
# NLTK supplies the stop-word list and the stemmer used by the tokenizer.
import string
import nltk
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads later
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kelly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [349]:
# Tokenizer shared by the Word2Vec and TF-IDF steps

stemmer = nltk.stem.PorterStemmer()
ENGLISH_STOP_WORDS = stopwords.words('english')

# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of looping over string.punctuation on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def recipe_tokenizer(sentence):
    """Turn a piece of recipe text into stemmed, stop-word-free tokens.

    Parameters
    ----------
    sentence : str
        Raw text (a name, description, or the string form of a list).

    Returns
    -------
    list of str
        Stemmed tokens in their original order, with ASCII punctuation
        removed, lower-cased, and with NLTK English stop words dropped.
    """
    # Strip punctuation in a single pass, then lower-case once.
    cleaned = sentence.translate(_PUNCTUATION_TABLE).lower()

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so multi-line text no longer
    # produces glued tokens such as "sauce\nadd".
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in ENGLISH_STOP_WORDS
    ]

In [350]:
# Import libraries
import gensim
from gensim.models import Word2Vec
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [351]:
# Function for word embedding using Word2Vec
def word_embedding(sampled_data, column, vector_size=100, window=5, min_count=1, workers=4):
    """Train a Word2Vec model on one text column and return its word vectors.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Frame holding the text column to learn from.
    column : str
        Name of the column; each value is tokenized with ``recipe_tokenizer``.
    vector_size, window, min_count, workers : int, optional
        Passed straight through to ``Word2Vec``. The defaults are the values
        that were previously hard-coded, so existing two-argument calls behave
        as before.

    Returns
    -------
    dict
        Maps every token in the trained vocabulary to its embedding vector.
    """
    # One list of tokens per row; a plain list can be iterated repeatedly.
    tokenized_data = sampled_data[column].apply(recipe_tokenizer).tolist()

    # Train a Word2Vec model on the tokenized rows
    model = Word2Vec(
        tokenized_data,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    # Copy the vectors into a plain dict so callers do not need the model object
    return {word: model.wv[word] for word in model.wv.index_to_key}

In [352]:
# Function to pre-compute and store the combined embeddings
def _mean_ingredient_vector(ingredients, embeddings, embedding_dim):
    """Average the embeddings of the known tokens in one ingredients string."""
    vectors = [embeddings[token] for token in recipe_tokenizer(ingredients) if token in embeddings]
    if not vectors:
        # No token of this recipe is in the vocabulary: fall back to a zero vector
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)


def precompute_embeddings(sampled_data):
    """Build the recipe feature matrix and cache it, with the vectorizer, on disk.

    The feature matrix is the TF-IDF representation of name + tags +
    description, followed column-wise by the mean Word2Vec vector of each
    recipe's ingredients.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Must contain the columns 'name', 'tags', 'description' and
        'ingredients'.

    Side effects
    ------------
    * Adds (or overwrites) a lower-cased 'text_data' column on ``sampled_data``.
    * Writes 'combined_embeddings.pkl' and 'tfidf_vectorizer.pkl' to the
      current working directory.
    * Prints a confirmation message.

    Returns
    -------
    None
    """
    # Step 1: Process 'ingredients' using word2vec and create word embeddings
    embeddings = word_embedding(sampled_data, 'ingredients')

    # Take the vector width from the trained embeddings instead of repeating
    # the literal 100, so the zero-vector fallback always matches the real
    # vectors. 100 is only used when the vocabulary is empty.
    embedding_dim = len(next(iter(embeddings.values()))) if embeddings else 100

    # Steps 2-3: Concatenate the free-text columns (excluding 'ingredients')
    # into one lower-cased string per recipe
    sampled_data['text_data'] = (
        sampled_data[['name', 'tags', 'description']]
        .astype(str)
        .agg(' '.join, axis=1)
        .str.lower()
    )

    # Step 4: Vectorize the text data (excluding 'ingredients') using TF-IDF;
    # min_df=5 drops terms that appear in fewer than five recipes
    vectorizer = TfidfVectorizer(min_df=5,
                                 tokenizer=recipe_tokenizer)
    vectorized_data = vectorizer.fit_transform(sampled_data['text_data'])

    # Step 5: One mean ingredient vector per recipe, in row order
    ingredient_embeddings = [
        _mean_ingredient_vector(ingredients, embeddings, embedding_dim)
        for ingredients in sampled_data['ingredients']
    ]

    # Step 6: Combine the vectorized data and ingredient embeddings.
    # toarray() densifies the TF-IDF matrix, so memory grows with
    # rows x vocabulary size.
    combined_embeddings = np.concatenate(
        [vectorized_data.toarray(), np.array(ingredient_embeddings)], axis=1
    )

    # Step 7: Store combined embeddings in pkl file
    with open('combined_embeddings.pkl', 'wb') as f:
        pickle.dump(combined_embeddings, f)

    # Step 8: Store the trained TF-IDF vectorizer model in a separate pkl file
    with open('tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

    # Step 9: Done!
    print("Text data and TF-IDF vectorizer model stored in pkl files!")
    


In [353]:
# Build the feature matrix for the sampled recipes and cache it on disk
# together with the fitted TF-IDF vectorizer (two .pkl files are written)
precompute_embeddings(sampled_data)

Text data and TF-IDF vectorizer model stored in pkl files!


In [354]:
# Function to read the cached embeddings and TF-IDF vectorizer back from disk
def load_embeddings_and_vectorizer():
    """Load the cached feature matrix and the fitted vectorizer.

    Returns
    -------
    tuple
        ``(combined_embeddings, vectorizer)`` unpickled from
        'combined_embeddings.pkl' and 'tfidf_vectorizer.pkl' in the current
        working directory.

    Raises
    ------
    FileNotFoundError
        If either cache file does not exist.
    """
    loaded = []
    # The embeddings file is read first, then the vectorizer, matching the
    # order of the returned pair.
    # NOTE: pickle.load can run code stored in the file; only read caches that
    # were produced by this notebook.
    for cache_path in ('combined_embeddings.pkl', 'tfidf_vectorizer.pkl'):
        with open(cache_path, 'rb') as cache_file:
            loaded.append(pickle.load(cache_file))

    combined_embeddings, vectorizer = loaded
    return combined_embeddings, vectorizer

# Function for finding recipes
def find_similar_recipes(sampled_data, user_input, num_similar=5):
    """Return the names of the recipes most similar to a free-text query.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        The recipes the cached embeddings were (or will be) computed from;
        must contain a 'name' column. Rows are addressed by position.
    user_input : str
        Free-text query, e.g. "japanese dishes vegetarian".
    num_similar : int, optional
        Maximum number of recipe names to return (default 5).

    Returns
    -------
    list of str
        Recipe names, most similar first. Empty when ``num_similar`` is zero
        or negative, or when no term of the query is in the vectorizer's
        vocabulary.

    Raises
    ------
    ValueError
        If the cached embeddings have fewer columns than the vectorizer
        produces, i.e. the two cache files do not belong together.

    Notes
    -----
    On a missing or stale cache this calls ``precompute_embeddings``, which
    rewrites both cache files.
    """
    # A non-positive count can only mean "no results"; without this guard a
    # negative value would slice off the tail of the ranking instead.
    if num_similar is not None and num_similar <= 0:
        return []

    try:
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()
        # A cache built from a different sample has a different row count; its
        # positional indices would point at the wrong recipes (or past the end).
        cache_is_usable = combined_embeddings.shape[0] == len(sampled_data)
    except FileNotFoundError:
        cache_is_usable = False

    if not cache_is_usable:
        precompute_embeddings(sampled_data)
        combined_embeddings, vectorizer = load_embeddings_and_vectorizer()

    # Vectorize the lower-cased query with the fitted TF-IDF vectorizer
    user_vectorized_data = vectorizer.transform([user_input.lower()])

    # No known term in the query: every similarity would be 0 and the
    # "top" results would be arbitrary, so report no matches instead.
    if user_vectorized_data.nnz == 0:
        return []

    query_vector = user_vectorized_data.toarray()

    # The cached matrix has extra trailing columns (the ingredient embeddings
    # appended by precompute_embeddings). The query has no values for them, so
    # pad with zeros; ranking is therefore driven by TF-IDF overlap.
    num_missing_features = combined_embeddings.shape[1] - query_vector.shape[1]
    if num_missing_features < 0:
        raise ValueError(
            "Cached embeddings have fewer columns than the TF-IDF vectorizer "
            "produces; delete the .pkl cache files and recompute."
        )
    if num_missing_features > 0:
        query_vector = np.pad(query_vector, ((0, 0), (0, num_missing_features)))

    # Compute cosine similarity between the query and every recipe
    cosine_sim_matrix = cosine_similarity(query_vector, combined_embeddings)

    # Positions of the highest-scoring recipes, best first
    similar_recipes = cosine_sim_matrix[0].argsort()[::-1][:num_similar]

    # Look the names up by position in sampled_data
    return sampled_data.iloc[similar_recipes]['name'].tolist()

In [357]:
# Smoke test: query with a cuisine and a dietary keyword, using the default of
# five results
find_similar_recipes(sampled_data, "japanese dishes vegetarian")

['japanese noodle soup',
 'okonomiyaki',
 'vegetarian yakisoba',
 'omuraisu  japanese omelette',
 'japanese salad with ginger soy dressing']