In [None]:
# Load the recipe table that the rest of the notebook works on.
# pandas is imported here so this cell runs on a fresh kernel; the import is
# a no-op if `pd` is already in scope.
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# you produced yourself or otherwise trust.
# The relative path presumably assumes Google Drive is mounted under ./drive
# (e.g. in Colab) -- TODO confirm.
food_df = pd.read_pickle("drive/My Drive/food.pkl")
food_df.head()

In [None]:
# Import libraries: `string` supplies the punctuation characters and `nltk`
# the stemmer and stop-word list used by the tokenizer in this notebook.
import string
import nltk
# Fetch the NLTK 'stopwords' corpus so that `stopwords.words(...)` can be
# called below.
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Create a function for tokenizer

stemmer = nltk.stem.PorterStemmer()
ENGLISH_STOP_WORDS = stopwords.words('english')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Set used for O(1) stop-word checks. The tokenizer strips punctuation
# *before* the stop-word test, so a stop word containing an apostrophe
# ("don't") reaches the test as "dont". The punctuation-stripped spelling of
# every stop word is therefore included alongside the original spelling.
_STOP_WORD_LOOKUP = frozenset(ENGLISH_STOP_WORDS) | frozenset(
    stop_word.translate(_PUNCTUATION_TABLE) for stop_word in ENGLISH_STOP_WORDS
)


def recipe_tokenizer(sentence):
    """Turn a piece of recipe text into a list of stemmed tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, English stop words are
    dropped and each remaining word is stemmed with the module-level
    ``stemmer``.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; an empty list when nothing
        survives the filtering.
    """
    # Lower-case once and delete all punctuation in a single pass instead of
    # one replace()/lower() round-trip per punctuation character.
    cleaned = sentence.lower().translate(_PUNCTUATION_TABLE)

    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings, so multi-line text does
    # not produce tokens with embedded line breaks.
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in _STOP_WORD_LOOKUP
    ]

In [None]:
# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a function for finding recipes
def find_similar_recipes(food_df, recipe_index, num_similar=5, min_df=5):
    """Return the names of the recipes most similar to one given recipe.

    Similarity is the cosine similarity between TF-IDF vectors built from
    the concatenated ``name``, ``tags``, ``steps``, ``description`` and
    ``ingredients`` columns, tokenized with ``recipe_tokenizer``.

    Parameters
    ----------
    food_df : pandas.DataFrame
        Recipe table containing the five text columns listed above.
    recipe_index : int
        Row *position* of the query recipe (0-based; negative values count
        from the end). This is a position, not an index label.
    num_similar : int, default 5
        Maximum number of names to return. Values <= 0 give an empty list.
    min_df : int or float, default 5
        Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    list
        Values of the ``name`` column for the best matches, most similar
        first. The query recipe itself is never included.

    Raises
    ------
    IndexError
        If ``recipe_index`` does not address a row of ``food_df``.

    Notes
    -----
    As in the original implementation, a ``text_data`` column is added to
    (or overwritten on) ``food_df`` as a side effect.
    """
    text_columns = ['name', 'tags', 'steps', 'description', 'ingredients']

    # Steps 1-2: one lower-cased document per recipe, built from the text columns.
    food_df['text_data'] = (
        food_df[text_columns].astype(str).agg(' '.join, axis=1).str.lower()
    )

    # Step 3: vectorize the documents using TF-IDF.
    vectorizer = TfidfVectorizer(min_df=min_df, tokenizer=recipe_tokenizer)
    vectorized_data = vectorizer.fit_transform(food_df['text_data'])

    # Resolve the query to a non-negative row position and validate it, so the
    # slice below can never silently select an empty row.
    n_recipes = vectorized_data.shape[0]
    position = recipe_index + n_recipes if recipe_index < 0 else recipe_index
    if not 0 <= position < n_recipes:
        raise IndexError(
            f"recipe_index {recipe_index} is out of range for {n_recipes} recipes"
        )

    # Step 4: compare only the query row against all rows. Building the full
    # N x N similarity matrix needs memory quadratic in the number of recipes
    # and only a single row of it was ever used.
    scores = cosine_similarity(
        vectorized_data[position:position + 1], vectorized_data
    ).ravel()

    # Step 5: rank by descending similarity and drop the query explicitly.
    # Skipping "the first hit" is not reliable: an exact duplicate (or an
    # all-zero vector) can tie with the query and be ranked ahead of it.
    ranked = scores.argsort()[::-1]
    ranked = ranked[ranked != position][:max(num_similar, 0)]

    # argsort yields row positions, so use positional .iloc; label-based .loc
    # picks the wrong rows (or raises) when the index is not 0..N-1.
    return food_df['name'].iloc[ranked].tolist()

In [None]:
# Smoke test: list the five closest matches for the first recipe in the table.
recipe_index = 0  # row of the recipe to find neighbours for
similar_recipes = find_similar_recipes(food_df, recipe_index, num_similar=5)

print("Similar Recipes:")
for rank, recipe_name in enumerate(similar_recipes, start=1):
    print(f"{rank}. {recipe_name}")