In [1]:
from pathlib import Path

from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from simsity import create_index, load_index

load_dotenv()

from data.preprocess_data import combine_json_to_dataframe

In [2]:
# Build the recipe DataFrame from the raw zip archive (relative to this notebook).
recipes_zip_path = "../data/recipes_raw.zip"
recipe_data = combine_json_to_dataframe(recipes_zip_path)

# Quick sanity check: list every column with its non-null count and dtype.
recipe_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 124434 entries, p3pKOD6jIHEcjf20CCXohP8uqkG5dGi to 2Q3Zpfgt/PUwn1YABjJ5A9T3ZW8xwVa
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   instructions  124434 non-null  object 
 1   ingredients   124434 non-null  object 
 2   title         124434 non-null  object 
 3   full_text     124434 non-null  object 
 4   num_words     124434 non-null  float64
dtypes: float64(1), object(4)
memory usage: 5.7+ MB


# SimSity

`simsity` is a [lightweight indexing tool by Vincent Warmerdam](https://github.com/koaning/simsity).

*"The goal of simsity is to be minimal, to make rapid prototyping very easy and to be "just enough" for medium sized datasets."*

It is built to interact easily with scikit-learn transformers (like `TfidfTransformer`) or with another of Vincent's packages, [embetter](https://github.com/koaning/embetter).


In [3]:
# simsity requires the encoder class to have a `transform` method, thus this simple wrapper.
class Encoder:
    """Adapter that exposes a SentenceTransformer through a `transform` method.

    Args:
        model_name: Name passed to `SentenceTransformer` to select the model.
        max_seq_length: Value assigned to the model's `max_seq_length`
            attribute (defaults to 512).
    """

    def __init__(self, model_name, max_seq_length = 512):
        sentence_model = SentenceTransformer(model_name)
        # Raise the sequence-length limit (512 by default here) so that the
        # long recipe texts can be handled.
        sentence_model.max_seq_length = max_seq_length
        self.model = sentence_model

    def transform(self, data:list):
        """Encode `data` with the wrapped model and return the result of `encode`."""
        embeddings = self.model.encode(data)
        return embeddings

encoder = Encoder(model_name="all-MiniLM-L12-v2")

# Directory in which simsity persists the ANN index.
EMBEDDINGS_PATH = "../embeddings/"

# Embedding the full recipe collection took close to five hours in the recorded
# run, so reuse an index that was already saved instead of rebuilding it on
# every Restart & Run All. Hidden files (e.g. a `.gitkeep`) do not count as a
# saved index. Delete the directory's contents to force a rebuild, for example
# after changing the recipe data or the encoder model.
embeddings_dir = Path(EMBEDDINGS_PATH)
index_is_saved = embeddings_dir.is_dir() and any(
    not entry.name.startswith(".") for entry in embeddings_dir.iterdir()
)

if index_is_saved:
    # The encoder passed here must be the same one the index was built with.
    index = load_index(path=EMBEDDINGS_PATH, encoder=encoder)
else:
    # Populate the ANN vector index and persist it for later runs.
    index = create_index(
        recipe_data["full_text"].tolist(),
        encoder,
        path=EMBEDDINGS_PATH,
    )

indexing: 100%|██████████| 249/249 [4:54:50<00:00, 71.05s/it]  


`index.query()` returns cosine distances rather than similarities. Compute `1 - distance` if you prefer similarity scores.

In [4]:
# Free-text query recipe (a breaded, pan-fried cutlet) used to probe the index.
test_recipe = """
Ingredients:

4 veal cutlets (or pork if you can't find veal)
Salt
Pepper
1 cup flour
2 eggs
1 cup breadcrumbs (preferably made from stale bread)
1/2 cup vegetable oil (for frying)
1 lemon (optional)
Instructions:

Pound the cutlets with a meat mallet until they are about 1/4 inch thick. Season both sides with salt and pepper.

Place the flour in a shallow dish. In another dish, beat the eggs. In a third dish, place the breadcrumbs.

Coat each cutlet with flour, shaking off any excess. Dip it into the beaten eggs, and then coat with the breadcrumbs. Repeat this for all of the cutlets.

Heat the vegetable oil in a large frying pan until hot.

Fry each cutlet for about 2-3 minutes on each side, or until golden brown and crispy. Be careful not to overcrowd the pan, you may need to fry them in batches.

Remove the cutlets from the pan with a slotted spoon and place them on paper towels to drain any excess oil.

"""

# Ask the index for the three closest recipes; the bare name on the last line
# makes the notebook display the query result.
test_recipe_matches = index.query(test_recipe, n=3)
test_recipe_matches

(['Recipe title: Pork Cutlets. Ingredients: 1 egg white plus 1 tablespoon water; 1 teaspoon dry mustard; 1 (4-ounce) boneless pork cutlet, pounded thin; 1/2 cup flour; 1/2 cup dry bread crumbs; 1 tablespoon olive oil; 1 lime. Instructions: In a mixing bowl, beat the egg white and water until it starts to foam. Stir in a teaspoon of dry mustard. Season cutlet with salt and pepper. Dip the cutlet in flour, then the egg mixture, coating thoroughly, and then in the breadcrumbs. Be sure that both sides are covered well. Set aside, or refrigerate, covered by a piece of waxed paper or plastic wrap.\nHeat the oil in a heavy frying pan until hot. Add the cutlet, lower the heat slightly, and cook for about 2 minutes. Turn with a spatula, and brown the other side. Continue cooking, turning once again if necessary, until the pork is cooked through, about 5 minutes in total. When done, the flesh will be white. Drain on a paper bag. Serve with a squeeze of lime.\nServe with potato and leeks',
  'Rec

In [5]:
# Second free-text query recipe (a baked vanilla custard with caramelized sugar).
test_recipe_2 = """Ingredients:
- 2 cups heavy cream
- 1 vanilla bean, split and scraped or 1 tsp vanilla extract
- 5 egg yolks
- 1/2 cup granulated sugar, plus more for caramelizing

Instructions:

1. Preheat the oven to 325°F.

2. In a medium saucepan, heat the cream and vanilla bean (both the seeds and the pod) over medium heat until it just begins to simmer. Remove from heat and let sit for 15 minutes to infuse the vanilla flavor.

3. Whisk together the egg yolks and sugar in a medium bowl until light and fluffy.

4. Remove the vanilla pod from the cream and scrape the seeds back into the cream. Discard the pod.

5. Slowly add the cream to the egg mixture, whisking constantly, until well combined.

6. Divide the mixture among four 6-ounce ramekins or custard cups.

7. Place the ramekins in a baking dish and add enough hot water to the dish to reach halfway up the sides of the ramekins.

8. Bake for 30-35 minutes or until the custard is set but still slightly jiggly in the center.

9. Remove the ramekins from the water bath and let cool to room temperature. Refrigerate for at least 2 hours or overnight.

10. When ready to serve, sprinkle a thin layer of sugar over the top of each custard. Either use a culinary torch to caramelize the sugar or place the ramekins under a broiler until the sugar is melted and caramelized. Serve immediately.
"""

# Ask the index for the three closest recipes; the bare name on the last line
# makes the notebook display the query result.
test_recipe_2_matches = index.query(test_recipe_2, n=3)
test_recipe_2_matches

(['Recipe title: Crème Brûlée . Ingredients: 2 1/2 cups heavy cream; 3/4 cup sugar; 1/2 vanilla bean, split lengthwise and seeds scraped; Pinch salt; 5 large egg yolks, lightly beaten; 6 tablespoons sanding sugar. Instructions: 1. Preheat the oven to 325°F and arrange the ramekins on a large rimmed baking sheet. 2. In a medium saucepan over moderately high heat, combine the heavy cream, 1/2 of the sugar, the vanilla bean seeds and pod, and the salt and bring to a boil, stirring to dissolve the sugar. 3. While the cream is heating, combine the egg yolks with the remaining sugar and whisk to combine. Gradually add about 1/3 of the hot cream to the yolks, whisking constantly, then add the remaining hot cream and stir to fully combine. Strain the custard through a fine-mesh sieve into a clean container. Carefully ladle or pour the custard into the ramekins, filling them to the rim. 4. Place the baking sheet in the oven and carefully pour enough hot water into the pan to come halfway up the