# Libraries

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from recipe_scrapers import scrape_html
import pandas as pd
import time

# Code

## Bon Appetit

In [28]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from recipe_scrapers import scrape_html
from tqdm import tqdm
import time
import random
from urllib.parse import urljoin

def get_recipe_links(url, timeout=30):
    """Download a Bon Appetit listing page and return the recipe URLs on it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        list[str]: Recipe URLs resolved against the site root, with
        duplicates removed and first-seen order preserved.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status (``raise_for_status``).
    """
    site_root = "https://www.bonappetit.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Anchors carrying one of the listed classes whose href contains '/recipe/'.
    links = soup.find_all('a',
                          class_=['BaseLink-eNWuiM', 'Link-ehXskl'],
                          href=lambda x: x and '/recipe/' in x)

    # urljoin handles root-relative, protocol-relative and already-absolute
    # hrefs alike; plain string concatenation mangles anything but the first.
    recipe_urls = [
        urljoin(site_root, link.get('href'))
        for link in links
        if link.get('href')
    ]

    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
    return list(dict.fromkeys(recipe_urls))

def get_next_page_url(soup, base_url):
    """Return the absolute URL of the listing's "Next" page, or None.

    Args:
        soup: Parsed listing page; only its ``find`` method is used.
        base_url: URL against which the button's href is resolved.

    Returns:
        The joined URL when a matching anchor with a non-empty href exists,
        otherwise None.
    """
    anchor = soup.find(
        'a',
        class_=['BaseButton-bLlsy', 'ButtonWrapper-xCepQ'],
        string='Next',
    )
    # Guard clauses: no button, or a button without a usable href.
    if not anchor:
        return None
    target = anchor.get('href')
    if not target:
        return None
    return urljoin(base_url, target)

def clean_recipe_data(data):
    """Return a copy of ``data`` restricted to the recipe fields of interest.

    Args:
        data: Mapping of recipe field name to value.

    Returns:
        dict: The entries of ``data`` whose key is one of the retained
        fields, in the same order as in ``data``. ``data`` is not modified.
    """
    wanted_fields = frozenset((
        'canonical_url', 'description', 'ingredient_groups',
        'ingredients', 'instructions', 'instructions_list',
        'keywords', 'ratings', 'ratings_count', 'site_name', 'title',
    ))
    trimmed = {}
    for field, value in data.items():
        if field in wanted_fields:
            trimmed[field] = value
    return trimmed

def main():
    """Scrape the Bon Appetit weeknight-meals listing and its recipes.

    Walks the listing pages by following the "Next" link, gathers recipe
    URLs, then downloads and parses each recipe. Failures on individual
    recipes are collected and summarised at the end instead of aborting.

    Returns:
        dict: Maps the part of each recipe URL after '/recipe/' to the
        cleaned recipe data for that recipe.
    """
    base_url = "https://www.bonappetit.com/simple-cooking/weeknight-meals"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    all_recipe_urls = []
    failed_recipes = []
    visited_pages = set()
    current_url = base_url

    # First progress bar for collecting URLs. The context manager closes the
    # bar even if something unexpected propagates.
    with tqdm(desc="Collecting recipe URLs", unit=" pages") as pbar_urls:
        # The visited set stops the walk if pagination ever links back to a
        # page that was already processed.
        while current_url and current_url not in visited_pages:
            visited_pages.add(current_url)
            try:
                response = requests.get(current_url, headers=headers,
                                        timeout=request_timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # NOTE: get_recipe_links() downloads the page itself, so each
                # listing page is currently requested twice.
                all_recipe_urls.extend(get_recipe_links(current_url))

                failed_page = current_url
                current_url = get_next_page_url(soup, base_url)
                time.sleep(random.uniform(1, 3))
                pbar_urls.update(1)

            except Exception as e:
                # Report why collection stopped instead of ending silently.
                print(f"Error while collecting URLs from {current_url}: {e}")
                break

    # Order-preserving de-duplication keeps runs reproducible.
    all_recipe_urls = list(dict.fromkeys(all_recipe_urls))

    # Second progress bar for scraping recipes
    recipes_dict = {}
    with tqdm(total=len(all_recipe_urls), desc="Scraping recipes",
              unit=" recipes") as pbar_scraping:
        for url in all_recipe_urls:
            recipe_key = url.split('/recipe/')[-1]
            try:
                # `with` closes the HTTP response once the body is read.
                with urlopen(url, timeout=request_timeout) as page:
                    html = page.read().decode("utf-8")
                scraper = scrape_html(html, org_url=url)
                recipe_data = scraper.to_json()

                # Clean data and store with URL key
                recipes_dict[recipe_key] = clean_recipe_data(recipe_data)

            except Exception as e:
                failed_recipes.append({
                    'url': url,
                    'name': recipe_key,
                    'error': str(e)
                })

            time.sleep(random.uniform(1, 3))
            pbar_scraping.update(1)

    # Print summary of failed recipes, including the recorded error text
    if failed_recipes:
        print(f"\nFailed to scrape {len(failed_recipes)} recipes:")
        for failed in failed_recipes:
            print(f"- {failed['name']}: {failed['url']} ({failed['error']})")

    return recipes_dict

# Run the Bon Appetit scrape when executed as the main module and keep the
# result in the notebook-level name `recipes`, which the parquet export cell
# further down reads.
if __name__ == "__main__":
    recipes = main()

Collecting recipe URLs: 51 pages [04:12,  4.95s/ pages]
Scraping recipes: 100%|██████████| 1192/1192 [50:25<00:00,  2.54s/ recipes] 


In [30]:
import pyarrow as pa
import pyarrow.parquet as pq

# Persist the scraped recipes: one DataFrame row per key of `recipes`
# (orient='index'), converted to an Arrow table and written to
# 'bonappetit_recipes.parquet'.
# NOTE(review): this relies on `recipes` still holding the Bon Appetit result;
# a later cell rebinds the same name — run cells in order.
df = pd.DataFrame.from_dict(recipes, orient='index')
table = pa.Table.from_pandas(df)
pq.write_table(table, 'bonappetit_recipes.parquet')

## Chefkoch

In [33]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from recipe_scrapers import scrape_html
from tqdm import tqdm
import time
import random
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

def get_recipe_links(url, timeout=30):
    """Download a Chefkoch search-result page and return its recipe links.

    Args:
        url: Address of the result page to fetch.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block forever on a stalled connection.

    Returns:
        list[str]: hrefs of the recipe-card anchors with any '#fragment'
        removed, without duplicates, in first-seen order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status (``raise_for_status``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'  # force UTF-8 decoding of response.text
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', class_='ds-recipe-card__link')

    recipe_urls = []
    for link in links:
        href = link.get('href')
        if not href:
            continue
        clean_url = href.split('#')[0]
        # An href that is only a fragment ("#...") leaves an empty string
        # after stripping; skip it rather than recording a blank URL.
        if clean_url:
            recipe_urls.append(clean_url)

    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
    return list(dict.fromkeys(recipe_urls))

def clean_recipe_data(data):
    """Filter a recipe mapping down to the fields that get stored.

    Args:
        data: Mapping of recipe field name to value.

    Returns:
        dict: A new dict holding only the retained fields present in
        ``data``, in ``data``'s own order.
    """
    retained = {
        'canonical_url', 'description', 'ingredient_groups',
        'ingredients', 'instructions', 'instructions_list',
        'keywords', 'ratings', 'ratings_count', 'site_name', 'title',
    }
    return dict(
        (name, content)
        for name, content in data.items()
        if name in retained
    )

def _get_links_with_retry(url, attempts=3):
    """Call get_recipe_links(url), retrying failed requests before giving up."""
    for attempt in range(1, attempts + 1):
        try:
            return get_recipe_links(url)
        except requests.RequestException:
            if attempt == attempts:
                raise
            # The pause grows with each attempt so a briefly failing server
            # (e.g. a 502 from a gateway) has time to recover.
            time.sleep(random.uniform(2, 4) * attempt)


def main():
    """Scrape up to ``max_recipes`` Chefkoch recipes and save them to parquet.

    Recipe URLs are cached in 'chefkoch_recipe_urls.json' and reloaded on the
    next run; more result pages are only fetched while fewer than
    ``max_recipes`` URLs are known. Each recipe is then downloaded and parsed;
    failures are listed and written to 'failed_recipes.json'. The cleaned
    recipes are written to a timestamped parquet file.

    Returns:
        dict: Maps the last URL path segment (without '.html') of each
        recipe to its cleaned recipe data.
    """
    base_url = "https://www.chefkoch.de/rs/s{page}t49,50r4.5p30/Schnell-Einfach-Rezepte.html"
    urls_file = 'chefkoch_recipe_urls.json'
    max_recipes = 2000
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    # Try to load existing URLs
    try:
        with open(urls_file, 'r', encoding='utf-8') as f:
            all_recipe_urls = json.load(f)
        print(f"Loaded {len(all_recipe_urls)} existing URLs")
    except FileNotFoundError:
        all_recipe_urls = []

    # Only collect new URLs if we don't have enough
    if len(all_recipe_urls) < max_recipes:
        page = 0
        with tqdm(desc="Collecting recipe URLs", unit=" pages") as pbar_urls:
            while len(all_recipe_urls) < max_recipes:
                try:
                    current_url = base_url.format(page=page)
                    # A single failed request no longer ends collection;
                    # the page is retried a few times first.
                    page_recipes = _get_links_with_retry(current_url)

                    if not page_recipes:
                        break

                    all_recipe_urls.extend(page_recipes)
                    # Remove duplicates while keeping a stable order
                    all_recipe_urls = list(dict.fromkeys(all_recipe_urls))

                    # Save URLs after each page
                    with open(urls_file, 'w', encoding='utf-8') as f:
                        json.dump(all_recipe_urls, f, ensure_ascii=False, indent=2)

                    page += 1
                    time.sleep(random.uniform(2, 4))
                    pbar_urls.update(1)

                except Exception as e:
                    print(f"Error on page {page}: {str(e)}")
                    break

    # Limit to max_recipes; ordered de-duplication makes the cut deterministic
    # (slicing a set() would keep an arbitrary subset).
    all_recipe_urls = list(dict.fromkeys(all_recipe_urls))[:max_recipes]

    # Second progress bar for scraping recipes
    recipes_dict = {}
    failed_recipes = []
    with tqdm(total=len(all_recipe_urls), desc="Scraping recipes",
              unit=" recipes") as pbar_scraping:
        for url in all_recipe_urls:
            try:
                # `with` closes the HTTP response once the body is read.
                with urlopen(url, timeout=request_timeout) as page_response:
                    html = page_response.read().decode("utf-8")
                scraper = scrape_html(html, org_url=url)
                recipe_data = scraper.to_json()

                recipe_key = url.split('/')[-1].replace('.html', '')
                recipes_dict[recipe_key] = clean_recipe_data(recipe_data)

            except Exception as e:
                failed_recipes.append({
                    'url': url,
                    'name': url.split('/')[-1],
                    'error': str(e)
                })

            time.sleep(random.uniform(2, 4))
            pbar_scraping.update(1)

    # Print summary of failed recipes
    if failed_recipes:
        print(f"\nFailed to scrape {len(failed_recipes)} recipes:")
        for failed in failed_recipes:
            print(f"- {failed['name']}: {failed['url']}")

        # Save failed recipes
        with open('failed_recipes.json', 'w', encoding='utf-8') as f:
            json.dump(failed_recipes, f, ensure_ascii=False, indent=2)

    print(f"\nSuccessfully scraped {len(recipes_dict)} recipes")

    # Save to parquet with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    df = pd.DataFrame.from_dict(recipes_dict, orient='index')
    table = pa.Table.from_pandas(df)
    output_file = f'chefkoch_recipes_{timestamp}.parquet'
    pq.write_table(table, output_file)
    print(f"Saved recipes to {output_file}")

    return recipes_dict

# Run the Chefkoch scrape when executed as the main module. This rebinds the
# notebook-level name `recipes`, replacing whatever an earlier cell stored
# under it.
if __name__ == "__main__":
    recipes = main()

Collecting recipe URLs: 24 pages [01:26,  3.60s/ pages]


Error on page 24: 502 Server Error: Bad Gateway for url: https://www.chefkoch.de/rs/s24t49,50r4.5p30/Schnell-Einfach-Rezepte.html


Scraping recipes: 100%|██████████| 1051/1051 [1:06:48<00:00,  3.81s/ recipes]


Successfully scraped 1043 recipes
Saved recipes to chefkoch_recipes_20250120_171102.parquet





In [34]:
# Show the full, uncleaned field set of one scraped recipe.
# NOTE(review): in the cells shown here `scraper` is only assigned inside
# main(), so it would not exist at notebook scope on a fresh kernel —
# confirm where this name comes from before relying on this cell.
scraper.to_json()

{'author': 'Feline86',
 'canonical_url': 'https://www.chefkoch.de/rezepte/1748901284210089/Dicke-Obst-Pfannkuchen.html',
 'category': 'Dessert',
 'cook_time': 20,
 'description': 'Dicke Obst-Pfannkuchen - wie auf der Kirmes. Über 1162 Bewertungen und für vorzüglich befunden. Mit ► Portionsrechner ► Kochbuch ► Video-Tipps!',
 'host': 'chefkoch.de',
 'image': 'https://img.chefkoch-cdn.de/rezepte/1748901284210089/bilder/1394501/crop-960x540/dicke-obst-pfannkuchen.jpg',
 'ingredient_groups': [{'ingredients': ['250 g Mehl',
    '75 g Zucker',
    '3 m.-große Ei(er)',
    '2 Pck. Vanillinzucker',
    '200 ml Milch',
    '½ TL Salz',
    '2 große Äpfel oder 400 g Heidelbeeren oder 1 Glas Kirschen oder 400 g Johannisbeeren',
    'Zucker zum Bestreuen',
    'Butter , neutrales Öl oder Margarine'],
   'purpose': None}],
 'ingredients': ['250 g Mehl',
  '75 g Zucker',
  '3 m.-große Ei(er)',
  '2 Pck. Vanillinzucker',
  '200 ml Milch',
  '½ TL Salz',
  '2 große Äpfel oder 400 g Heidelbeeren oder 1