In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

from data.preprocess_data import combine_json_to_dataframe

ModuleNotFoundError: No module named 'data'

In [10]:
# Call the function with the path to the zip file
recipe_data = combine_json_to_dataframe("../data/recipes_raw.zip")

recipe_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 124434 entries, p3pKOD6jIHEcjf20CCXohP8uqkG5dGi to 2Q3Zpfgt/PUwn1YABjJ5A9T3ZW8xwVa
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   instructions  124434 non-null  object 
 1   ingredients   124434 non-null  object 
 2   title         124434 non-null  object 
 3   full_text     124434 non-null  object 
 4   num_words     124434 non-null  float64
dtypes: float64(1), object(4)
memory usage: 5.7+ MB


In [11]:
vectoriser = TfidfVectorizer(
    stop_words='english', # default is without it, but this decreases the dictionary size significantly
    min_df = 2, # Ignore terms that have a document frequency strictly lower than the given threshold. When float, proportion of docs.
    max_df = 0.95, # ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    ngram_range=(1,2), # uni and bi-grams
    max_features=30_000, # unigrams are ca. 22K, get top 8000 bigrams
    dtype=np.float32 # Reduces the size of the resulting array without much quality sacrifice, default is float64
)

embeddings = vectoriser.fit_transform(recipe_data.full_text)
print(embeddings.shape)
print("Note that the rows are the number of documents, while the columns equal the number of tokens")

(124434, 30000)
Note that the rows are the number of documents, while the columns equal the number of tokens


In [12]:
print(vectoriser.get_feature_names_out())
print(len(vectoriser.get_feature_names_out()))

['10' '10 11' '10 12' ... 'zucchini yellow' 'zucchinis' 'árbol']
30000


In [13]:
def get_most_similar_doc(text, vectorized_corpus, original_data, vectoriser=vectoriser, top_n = 5):
    new_doc_vector = vectoriser.transform(text)
    sim = cosine_similarity(X = vectorized_corpus, Y = new_doc_vector).flatten()
    argmax = np.argpartition(sim, -top_n)[-top_n:]
    return(
        (original_data.
         iloc[argmax].
         assign(similarity = sim[argmax]).
         loc[:, ['title', 'ingredients', 'similarity']].
         sort_values('similarity', ascending = False).
         reset_index(drop=True)
         )

    )

In [14]:
test_recipe = ["""
Ingredients:

4 veal cutlets (or pork if you can't find veal)
Salt
Pepper
1 cup flour
2 eggs
1 cup breadcrumbs (preferably made from stale bread)
1/2 cup vegetable oil (for frying)
1 lemon (optional)
Instructions:

Pound the cutlets with a meat mallet until they are about 1/4 inch thick. Season both sides with salt and pepper.

Place the flour in a shallow dish. In another dish, beat the eggs. In a third dish, place the breadcrumbs.

Coat each cutlet with flour, shaking off any excess. Dip it into the beaten eggs, and then coat with the breadcrumbs. Repeat this for all of the cutlets.

Heat the vegetable oil in a large frying pan until hot.

Fry each cutlet for about 2-3 minutes on each side, or until golden brown and crispy. Be careful not to overcrowd the pan, you may need to fry them in batches.

Remove the cutlets from the pan with a slotted spoon and place them on paper towels to drain any excess oil.

"""]

get_most_similar_doc(text = test_recipe, 
                     vectorized_corpus = embeddings, 
                     original_data = recipe_data)

Unnamed: 0,title,ingredients,similarity
0,Wiener Schnitzel (Breaded Veal Cutlets),"[8 slices white bread, crusts removed, Flour f...",0.392512
1,Torta Milanesa,[Four 4-ounce chicken cutlets (about 1/4-inch ...,0.371962
2,Schnitzelwiches Holstein,"[1 cup all-purpose flour, 6 large eggs, 1 cup ...",0.368389
3,Turkey Cutlets Milanese,[1 1/2 pounds turkey cutlets (about 1/4-inch t...,0.36749
4,Turkey Cutlets Milanese with Watercress Salad,[3/4 pound turkey cutlets (about 1/4-inch thic...,0.36626


In [15]:
test_recipe_2 = [
    """Ingredients:
- 2 cups heavy cream
- 1 vanilla bean, split and scraped or 1 tsp vanilla extract
- 5 egg yolks
- 1/2 cup granulated sugar, plus more for caramelizing

Instructions:

1. Preheat the oven to 325°F.

2. In a medium saucepan, heat the cream and vanilla bean (both the seeds and the pod) over medium heat until it just begins to simmer. Remove from heat and let sit for 15 minutes to infuse the vanilla flavor.

3. Whisk together the egg yolks and sugar in a medium bowl until light and fluffy.

4. Remove the vanilla pod from the cream and scrape the seeds back into the cream. Discard the pod.

5. Slowly add the cream to the egg mixture, whisking constantly, until well combined.

6. Divide the mixture among four 6-ounce ramekins or custard cups.

7. Place the ramekins in a baking dish and add enough hot water to the dish to reach halfway up the sides of the ramekins.

8. Bake for 30-35 minutes or until the custard is set but still slightly jiggly in the center.

9. Remove the ramekins from the water bath and let cool to room temperature. Refrigerate for at least 2 hours or overnight.

10. When ready to serve, sprinkle a thin layer of sugar over the top of each custard. Either use a culinary torch to caramelize the sugar or place the ramekins under a broiler until the sugar is melted and caramelized. Serve immediately.
"""
]

get_most_similar_doc(text = test_recipe_2, 
                     vectorized_corpus = embeddings, 
                     original_data = recipe_data)

Unnamed: 0,title,ingredients,similarity
0,Creme Brulee,"[1 quart heavy cream, 1 vanilla bean, split, o...",0.452146
1,Vanilla Creme Brulee,"[1 vanilla bean, 2 cups heavy cream, 2 cups ha...",0.450293
2,Chocolate Creme Brulee,"[4 cups heavy whipping cream ADVERTISEMENT, 1 ...",0.439187
3,Vanilla Creme Brulee,"[1 vanilla bean, 2 cups heavy cream, 2 cups ha...",0.437355
4,Chocolate Sprinkled Creme Brulee,"[2 1/3 cups heavy cream, 1/3 cup half-and-half...",0.435901


## Save model and embeddings

In [16]:
dump(vectoriser, "../models/tfidf.joblib")
dump(embeddings, "../embeddings/tfidf_embeddings.joblib")

['../embeddings/tfidf_embeddings.joblib']