# Save recipes

Just a quick notebook to scrape a load of recipes to train on.

In [1]:
from recipe_scrapers import scrape_me
from pathlib import Path
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re

def _list_dir(path):
    """Return the entries of the directory `path` as a list."""
    return list(path.iterdir())


# Convenience: make `some_path.ls()` available on every Path object.
Path.ls = _list_dir

In [2]:
# All locations are expressed relative to the project root.
PROJECT = Path("../../")
PROMPT = PROJECT.joinpath("prompt.txt")
DATA = PROJECT.joinpath("data")
# Dataset of recipe records used as the source of recipe URLs.
RECIPES_URLS = DATA.joinpath("20170107-061401-recipeitems.json")

In [3]:
def extract_ingredients_and_steps(scraper):
    """Pull the ingredient list and method steps out of a scraped recipe.

    Args:
        scraper: object whose ``to_json()`` returns a mapping with the keys
            ``"ingredients"`` and ``"instructions_list"``.

    Returns:
        A ``(ingredients, instructions)`` tuple. ``ingredients`` is returned
        as provided by the scraper. ``instructions`` is a list of steps; when
        the scraper returned the whole method as a single entry, it is split
        into sentences, each ending in ".".

    Raises:
        KeyError: if either expected key is missing from the scraped data.
    """
    recipe_dict = scraper.to_json()

    ingredients = recipe_dict["ingredients"]

    instructions = recipe_dict["instructions_list"]
    if len(instructions) == 1:  # the method split didn't work
        # Split only on periods that end a sentence (followed by whitespace or
        # the end of the text). A plain split(".") would also cut decimals
        # such as "1.5 cups" or "180.5C" into separate, meaningless steps.
        sentences = re.split(r"\.(?=\s|$)", instructions[0])
        # Drop empty fragments (e.g. after the final period) and restore the
        # terminating period that the split consumed.
        instructions = [s.strip() + "." for s in sentences if s.strip()]

    return ingredients, instructions


def format_recipe(ingredients, instructions):
    """Render a recipe as plain text.

    The result has an "Ingredients" section with one ingredient per line,
    followed by a "Method" section whose steps are numbered from 1.

    Args:
        ingredients: iterable of ingredient strings.
        instructions: iterable of method steps.

    Returns:
        The formatted recipe as a single string.
    """
    ingredient_lines = [item + "\n" for item in ingredients]
    method_lines = [
        f"{number}. {step}\n" for number, step in enumerate(instructions, start=1)
    ]
    pieces = ["Ingredients\n\n", *ingredient_lines, "\nMethod\n\n", *method_lines]
    return "".join(pieces)

## Scraper site setup

These are the sites that are supported by this scraper:

In [4]:
sites_file = DATA / "recipe-sites.txt"
# The first line of the file is skipped; the rest is one entry per line.
sites = sites_file.read_text().splitlines()[1:]
len(sites)

296

In [5]:
# Captures the host part of a URL, without the scheme or a leading "www.".
pattern = r"https?://(?:www\.)?([^/]+)"

# Search each URL once and keep the captured host of those that matched.
url_matches = (re.search(pattern, url) for url in sites)
site_names = [found.group(1) for found in url_matches if found]
print(site_names)

['claudia.abril.com.br', 'abuelascounter.com', 'acouplecooks.com', 'addapinch.com', 'afghankitchenrecipes.com', 'akispetretzikis.com', 'ah.nl', 'allrecipes.com', 'alltommat.se', 'altonbrown.com', 'amazingribs.com', 'ambitiouskitchen.com', 'archanaskitchen.com', 'arla.se', 'atelierdeschefs.fr', 'averiecooks.com', 'barefootcontessa.com', 'baking-sense.com', 'bakingmischief.com', 'bbc.com', 'bbc.co.uk', 'bbcgoodfood.com', 'bettybossi.ch', 'bettycrocker.com', 'biancazapatka.com', 'bigoven.com', 'blueapron.com', 'bluejeanchef.com', 'bonappetit.com', 'bodybuilding.com', 'bongeats.com', 'bowlofdelicious.com', 'briceletbaklava.ch', 'budgetbytes.com', 'carlsbadcravings.com', 'castironketo.net', 'cdkitchen.com', 'chefkoch.de', 'chefnini.com', 'chefsavvy.com', 'closetcooking.com', 'comidinhasdochef.com', 'cookeatshare.com', 'cookieandkate.com', 'cookingcircle.com', 'cookinglight.com', 'cookpad.com', 'cookstr.com', 'cook-talk.com', 'coop.se', 'copykat.com', 'costco.com', 'countryliving.com', 'crea

## Get recipe urls

We have a list of sites but we want actual recipe URLs - I've found a dataset with some of these.

In [6]:
# lines=True: the file is read as one JSON record per line.
rdf = pd.read_json(RECIPES_URLS, lines=True)

Let's do the same regex as before to get our main domains:

In [7]:
# Reduce every recipe URL to its bare domain using the same regex as above.
url_domains = rdf["url"].str.extract(pattern, expand=False)
rdf["simple_url"] = url_domains
rdf["simple_url"].value_counts()

simple_url
tastykitchen.com            66700
allrecipes.com              47086
bbcgoodfood.com             15550
epicurious.com               9787
chow.com                     8312
bbc.co.uk                    6844
williams-sonoma.com          5366
jamieoliver.com              1797
lovefood.com                 1746
browneyedbaker.com            945
smittenkitchen.com            894
cookincanuck.com              767
biggirlssmallkitchen.com      749
steamykitchen.com             735
thepioneerwoman.com           705
whatsgabycooking.com          666
aspicyperspective.com         585
101cookbooks.com              582
recipage.com                  572
naturallyella.com             418
bonappetit.com                361
cookieandkate.com             323
elanaspantry.com              290
bunkycooks.com                264
picky-palate.com              259
eatthelove.com                230
seriouseats.com               220
thevintagemixer.com           180
delishhh.com                  161
pan

Filter to the sites that are supported by the scraper:

In [8]:
# Domains present in the dataset (as strings) vs. domains the scraper supports.
dataset_domains = rdf["simple_url"].unique().astype(str)
supported_domains = np.array(site_names)
common_sites = np.intersect1d(dataset_domains, supported_domains)
len(common_sites)

17

In [9]:
# Keep only rows whose domain is in common_sites, then renumber the index.
is_supported = rdf["simple_url"].isin(common_sites)
rdf = rdf.loc[is_supported].reset_index(drop=True)
rdf["simple_url"].value_counts()

simple_url
tastykitchen.com        66700
allrecipes.com          47086
bbcgoodfood.com         15550
epicurious.com           9787
bbc.co.uk                6844
williams-sonoma.com      5366
jamieoliver.com          1797
steamykitchen.com         735
thepioneerwoman.com       705
whatsgabycooking.com      666
101cookbooks.com          582
bonappetit.com            361
cookieandkate.com         323
seriouseats.com           220
thevintagemixer.com       180
paninihappy.com            71
foodnetwork.com            59
Name: count, dtype: int64

Anecdotally, I've found that these sites just cause problems, so I'll remove them:

In [10]:
problematic_sites = [
    "allrecipes.com",
    "foodnetwork.com",
    "epicurious.com",
    "williams-sonoma.com",
]

# Drop every row that belongs to one of the sites listed above.
is_problematic = rdf["simple_url"].isin(problematic_sites)
rdf = rdf.loc[~is_problematic].reset_index(drop=True)

In [11]:
# Recipes per site in the current rdf.
rdf["simple_url"].value_counts()

simple_url
tastykitchen.com        66700
bbcgoodfood.com         15550
bbc.co.uk                6844
jamieoliver.com          1797
steamykitchen.com         735
thepioneerwoman.com       705
whatsgabycooking.com      666
101cookbooks.com          582
bonappetit.com            361
cookieandkate.com         323
seriouseats.com           220
thevintagemixer.com       180
paninihappy.com            71
Name: count, dtype: int64

Large sites like tastykitchen.com need cutting down, otherwise the model might get really good at certain types of recipe and not others. Here every site is sampled down to the size of the smallest one.

In [12]:
# Downsample every site to the size of the smallest one so that no single
# site dominates the training data.
SAMPLE_SEED = 42  # fixed so a fresh Restart & Run All picks the same recipes
n_per_site = rdf["simple_url"].value_counts().min()  # computed once, not per group

# groupby(...).sample keeps every original column (including "simple_url"),
# which groupby(...).apply does not guarantee across pandas versions.
sdf = (
    rdf.groupby("simple_url")
    .sample(n=n_per_site, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
sdf["simple_url"].value_counts()

simple_url
101cookbooks.com        71
bbc.co.uk               71
bbcgoodfood.com         71
bonappetit.com          71
cookieandkate.com       71
jamieoliver.com         71
paninihappy.com         71
seriouseats.com         71
steamykitchen.com       71
tastykitchen.com        71
thepioneerwoman.com     71
thevintagemixer.com     71
whatsgabycooking.com    71
Name: count, dtype: int64

In [13]:
# Total number of rows in the per-site sample.
len(sdf)

923

Remove recipes whose title appears more than once (every copy is dropped, not just the repeats):

In [14]:
# Count each title once, then drop every row whose title occurs more than once.
name_counts = sdf["name"].value_counts()
repeated_names = name_counts[name_counts > 1].index
sdf = sdf.loc[~sdf["name"].isin(repeated_names)]
len(sdf)

905

Maybe this is enough to get going with (will take a while to create my gantt dataset otherwise).

In [16]:
# Scrape every sampled recipe and write it to DATA/recipes/<name>/recipe.txt.
# URLs that raise or yield too little content are collected in `failed`.
MIN_INGREDIENTS = 3  # fewer ingredients than this is treated as a bad scrape
MIN_STEPS = 4  # likewise for method steps

recipes_root = DATA / "recipes"
# Without this, a missing root directory makes every per-recipe mkdir raise,
# and the blanket except below would silently count every recipe as failed.
recipes_root.mkdir(parents=True, exist_ok=True)

failed = []
failure_reasons = {}  # url -> why it was rejected, kept for debugging
for _, row in tqdm(sdf.iterrows(), total=len(sdf)):
    url = row["url"]
    try:
        # Built inside the try so an unusable name fails only this row rather
        # than aborting the whole run. Named recipe_dir to avoid shadowing the
        # builtin dir().
        recipe_dir = recipes_root / row["name"]
        scraper = scrape_me(url)
        ingredients, steps = extract_ingredients_and_steps(scraper)
        if len(ingredients) < MIN_INGREDIENTS or len(steps) < MIN_STEPS:
            failed.append(url)
            failure_reasons[url] = "too few ingredients or steps"
            continue
        recipe = format_recipe(ingredients, steps)
        recipe_dir.mkdir(exist_ok=True)
        with open(recipe_dir / "recipe.txt", "w") as f:
            f.write(recipe)
    except Exception as exc:  # best effort: one bad page must not stop the run
        failed.append(url)
        failure_reasons[url] = repr(exc)

  0%|          | 0/905 [00:00<?, ?it/s]

In [17]:
# Number of URLs that raised during scraping or had too few ingredients/steps.
len(failed)

479

In [18]:
# A recipe counts as successful if DATA/recipes/<name>/recipe.txt exists;
# the directory name is the recipe name.
recipe_files = (DATA / "recipes").glob("*/recipe.txt")
succ_recipes = [path.parent.name for path in recipe_files if path.is_file()]
len(succ_recipes)

426

In [21]:
# Keep only the rows whose recipe was actually written to disk.
was_saved = sdf["name"].isin(succ_recipes)
sdf = sdf.loc[was_saved].reset_index(drop=True)
len(sdf)

426

Now I need to save the metadata.

In [22]:
# NOTE(review): the export below is commented out, so running this cell does
# not write recipes-meta.csv. Uncomment it to save the metadata.
# sdf.to_csv(DATA / "recipes-meta.csv", index=False)

## Sample quality

Eyeball a few just to make sure it went ok.

In [39]:
# Print ten randomly chosen recipes, each preceded by its title.
sample = sdf.sample(10)

for recipe_name in sample["name"]:
    fp = DATA / "recipes" / recipe_name / "recipe.txt"
    print(recipe_name + "\n")
    print(fp.read_text())

Silverdollar Socca

Ingredients

1 1/2 cup / 6 3/4 ounces chickpea flour
1 1/2 teaspoons fine sea salt
2 large eggs
1/2 cup / 120 ml water
1 cup / 240 ml buttermilk
1/4 cup / 60 ml extra virgin olive oil
2 tablespoons black sesame seeds
1 teaspoon yellow mustard seeds (optional)

Method

1. Combine the chickpea flour, salt, eggs, water, buttermilk, olive oil and sesame seeds in a large bowl.
2. Whisk until the batter is smooth.
3. You're after a batter that is on the thin side, similar to a crepe batter.
4. Allow it to sit for at least 15 minutes, so the chickpea flour absorbs the buttermilk, resulting in a more tender pancake.
5. Stir again.

Kofta hara masala (Indian meatballs in a green sauce)

Ingredients

500g/1 lb minced lamb
2 or 3 spring onions, finely chopped
2 green chillies, finely chopped
1 tbsp fresh coriander, chopped
1 tsp ground allspice
1 tsp ground cinnamon
1 tsp garam masala
1 tsp freshly ground black pepper
1 tsp garlic paste
1 tsp ginger paste
salt to taste
75ml/2½

These look good to me. Some formatting issues but maybe we train the model to deal with those.