In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the recipe table from the local pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code - only load "food.pkl"
# if it comes from a trusted source.
food_df = pd.read_pickle("food.pkl")
food_df.head(n=5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,submitted_month,submitted_year,dairy-free,gluten-free,low-carb,vegan,vegetarian,recipe_id,average_rating,votes
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",...,Sep,2005,0,0,0,0,1,137739,5.0,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",...,Jun,2002,0,0,0,0,0,31490,4.666667,3
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",...,Feb,2005,0,0,0,0,0,112140,4.0,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",...,Apr,2003,0,0,0,0,0,59389,4.5,2
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",...,Oct,2002,0,0,0,0,1,44061,5.0,1


In [4]:
# import libraries
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kelly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Create the tokenizer function used when vectorizing recipe text

stemmer = nltk.stem.PorterStemmer()
ENGLISH_STOP_WORDS = stopwords.words('english')

# Hashed snapshot of the stop-word list: the tokenizer tests membership once per
# token, so a set lookup avoids scanning the whole list each time.
# (Taken at definition time - re-run this cell if ENGLISH_STOP_WORDS is edited.)
_STOP_WORD_SET = frozenset(ENGLISH_STOP_WORDS)

# Translation table that deletes every character in string.punctuation in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def recipe_tokenizer(sentence):
    """Turn a piece of text into a list of lower-cased, stemmed tokens.

    The text has all ``string.punctuation`` characters removed, is lower-cased,
    split on whitespace, filtered against ``ENGLISH_STOP_WORDS`` and finally
    stemmed with the module-level Porter ``stemmer``.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if nothing survives
        the filtering.
    """
    # Remove punctuation and lower-case once, instead of once per punctuation mark.
    cleaned = sentence.translate(_PUNCTUATION_TABLE).lower()

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so no separate '' check is needed.
    # NOTE(review): punctuation is stripped before the stop-word check, so a stop
    # word that contains an apostrophe can never match here - confirm whether
    # that is intended.
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in _STOP_WORD_SET
    ]

In [None]:
# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a function for finding recipes
def find_similar_recipes(food_df, recipe_index, num_similar=5):
    """Return the names of the recipes most similar to one recipe.

    Similarity is the cosine similarity between TF-IDF vectors built from the
    concatenated ``name``, ``tags``, ``steps``, ``description`` and
    ``ingredients`` columns. The vectorizer is fitted on ``food_df`` on every
    call, and ``food_df`` itself is left unmodified.

    Parameters
    ----------
    food_df : pandas.DataFrame
        Recipes to search; must contain the five text columns listed above.
    recipe_index : int
        Positional (0-based) row of the query recipe within ``food_df``.
        Negative values count from the end.
    num_similar : int, default 5
        Maximum number of similar recipes to return.

    Returns
    -------
    list
        Values of the ``name`` column for the best matches, most similar
        first, never including the query recipe itself.

    Raises
    ------
    IndexError
        If ``recipe_index`` is outside the rows of ``food_df``.
    """
    # Indexing a range normalises negative positions and raises IndexError for
    # out-of-bounds ones before any expensive work is done.
    position = range(len(food_df))[recipe_index]

    # Steps 1-2: concatenate the text columns and lower-case them. Kept as a
    # local Series so the caller's DataFrame does not gain a 'text_data' column.
    text_columns = ['name', 'tags', 'steps', 'description', 'ingredients']
    text_data = food_df[text_columns].astype(str).agg(' '.join, axis=1).str.lower()

    # Step 3: vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer(min_df=5,
                                 tokenizer=recipe_tokenizer)
    vectorized_data = vectorizer.fit_transform(text_data)

    # Step 4: only the query row of the similarity matrix is needed, so compare
    # that single row against all rows instead of building the full n x n matrix.
    # Slicing (rather than integer indexing) keeps the query two-dimensional.
    query_vector = vectorized_data[position:position + 1]
    scores = cosine_similarity(query_vector, vectorized_data).ravel()

    # Step 5: rank by descending similarity (stable, so ties keep row order) and
    # drop the query recipe explicitly - it is not guaranteed to rank first when
    # another recipe has identical text.
    ranked = (-scores).argsort(kind='stable')
    ranked = ranked[ranked != position][:max(num_similar, 0)]

    # ranked holds row positions, not index labels, so look names up with iloc.
    return food_df['name'].iloc[ranked].tolist()

In [None]:
# Function for batch processing
def batch_processing(data_df, batch_size):
    """Print the similar recipes for every row of ``data_df``, batch by batch.

    The frame is cut into consecutive slices of at most ``batch_size`` rows.
    Each recipe is compared only against the other recipes of its own batch by
    calling ``find_similar_recipes`` with ``num_similar=5``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Recipes to process.
    batch_size : int
        Maximum number of rows per batch; must be at least 1.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    num_rows = data_df.shape[0]

    # Stepping through the start offsets yields exactly ceil(num_rows / batch_size)
    # batches, so an exact multiple (or an empty frame) produces no empty batch.
    for batch_number, start_idx in enumerate(range(0, num_rows, batch_size), start=1):
        end_idx = min(start_idx + batch_size, num_rows)

        # Re-number the batch from 0 so the positional recipe_index passed below
        # lines up with the batch's own index labels; this also gives the batch
        # its own copy rather than a slice of data_df.
        batch_data = data_df.iloc[start_idx:end_idx].reset_index(drop=True)

        # NOTE: find_similar_recipes refits the TF-IDF vectorizer on every call,
        # so this inner loop performs one fit per recipe in the batch.
        for recipe_index in range(batch_data.shape[0]):
            similar_recipes = find_similar_recipes(batch_data, recipe_index, num_similar=5)
            print(f"Batch {batch_number}, Recipe Index {recipe_index}: Similar Recipes:")
            for j, recipe in enumerate(similar_recipes):
                print(f"{j+1}. {recipe}")

# Maximum number of recipes placed in one batch
batch_size = 5000

# Run the batched similarity search over the whole recipe table
batch_processing(food_df, batch_size=batch_size)