In [14]:
from bs4 import BeautifulSoup, SoupStrainer
from recipe_scrapers import scrape_me
import pandas as pd
import requests
import os

In [43]:
def scrape_page(url_, recipe_url_list, rules):
    req = requests.get(url_)
    soup = BeautifulSoup(req.content)
    for item in soup.find_all('a', href=True):
        link = item['href']
        if rules(link):
            recipe_url_list.add(link)

def get_recipes(url, pages, rules, black_list):
    recipe_url_list = set()
    
    for i in range(1, pages+1):
        if i > 1:
            url_ = f'{url}/page/{i}'
        else:
            url_ = url
        scrape_page(url_, recipe_url_list, rules)
    
    recipe_url_list -= black_list
    print(f'got {len(recipe_url_list)} recipes from {url}')
    return recipe_url_list

def save_csv(recipe_url_list, website, name):
    all_ = []

    for link in recipe_url_list:
        try:
            scraper = scrape_me(link, wild_mode = True)
            x = [scraper.title(), scraper.author(), scraper.canonical_url(), scraper.cuisine(), 
                 scraper.image(), scraper.ingredients(), scraper.instructions(), scraper.language(),
                 scraper.nutrients(), scraper.ratings(), scraper.reviews(), 
                 scraper.site_name(), scraper.total_time(), scraper.yields()]
            all_.append(x)
        # default to chinese cuisine
        except AttributeError:
            scraper = scrape_me(link, wild_mode = True)
            x = [scraper.title(), scraper.author(), scraper.canonical_url(), 'Chinese', 
                 scraper.image(), scraper.ingredients(), scraper.instructions(), scraper.language(),
                 scraper.nutrients(), scraper.ratings(), scraper.reviews(), 
                 scraper.site_name(), scraper.total_time(), scraper.yields()]
            all_.append(x)
        except:
            print(f'error in {link}')
            continue

    df = pd.DataFrame(all_, columns = ['title', 'author', 'url', 'cuisine', 'image_url',
                                 'ingredients', 'instructions', 'language', 'nutrients', 'ratings',
                                 'reviews', 'site_name', 'total_time', 'yields'])
    df = df.set_index('title')
    print(f'saved {len(df)} recipes to {name}')
    if not os.path.exists(f'data/{website}'):
        os.makedirs(f'data/{website}')
    df.to_csv(f'data/{website}/{name}.csv')

# Omnivore

In [None]:
urls = [
    ('https://omnivorescookbook.com/category/recipe/main/', 'omnivore_mains', 11),
    ('https://omnivorescookbook.com/category/recipe/appetizer/', 'omnivore_appetizers', 4),
    ('https://omnivorescookbook.com/category/recipe/bakery/', 'omnivore_bakery', 1),
    ('https://omnivorescookbook.com/category/recipe/breakfast-brunch/', 'omnivore_breakfast_brunch', 1),
    ('https://omnivorescookbook.com/category/recipe/dessert/', 'omnivore_dessert', 2),
    ('https://omnivorescookbook.com/category/recipe/side/', 'omnivore_side', 5),
    ('https://omnivorescookbook.com/category/recipe/soup-recipe/', 'omnivore_soup_stew', 4),
]

rules = lambda link: 'https://omnivorescookbook.com' in link and '-' in link and len(link.split('/')) == 5
black_list = {'https://omnivorescookbook.com/how-to/', 
              'https://omnivorescookbook.com/start-here/',
                 }

for url, name, pages in urls:
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'omnivore', name)

got 210 recipes from https://omnivorescookbook.com/category/recipe/main/
error in https://omnivorescookbook.com/best-chinese-noodles-recipes/
error in https://omnivorescookbook.com/century-egg/
error in https://omnivorescookbook.com/chinese-freezer-meals/
error in https://omnivorescookbook.com/chinese-vegetable-stir-fry/
error in https://omnivorescookbook.com/easy-chicken-curry/


# My korean kitchen

In [17]:
rules = lambda link: 'https://mykoreankitchen.com/' in link and '-' in link and len(link.split('/')) == 5
black_list = {
    'https://mykoreankitchen.com/terms-and-conditions/',
    'https://mykoreankitchen.com/food-blogger-resources/',
    'https://mykoreankitchen.com/how-to-enjoy-korean-bbq-at-home/',
}

urls = [
    ('https://mykoreankitchen.com/recipes/#recipes+category:appetizers', 'korean_appetizers', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:beef', 'korean_beef', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:chicken', 'korean_chicken', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:desserts', 'korean_desserts', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:kimchi', 'korean_kimchi', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:korean-bbq-side-dishes', 'korean_bbq_side_dish', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:korean-bbq', 'korean_bbq', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:drinks', 'korean_drinks', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:side-dishes-banchan', 'korean_banchan', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:side-dishes-soup', 'korean_soup', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:noodles', 'korean_noodles', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:pork', 'korean_pork', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:rice', 'korean_rice', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:seafood', 'korean_seafood', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:snacks', 'korean_snacks', 1),
    ('https://mykoreankitchen.com/recipes/#recipes+category:vegetarian', 'korean_vegetarian', 1)
]

for url, name, pages in urls:
    print(url, name, pages)
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'korean', name)

got 26 recipes from https://mykoreankitchen.com/recipes/#recipes+category:appetizers
https://mykoreankitchen.com/recipes/#recipes+category:appetizers korean_appetizers 1
got 23 recipes from https://mykoreankitchen.com/recipes/#recipes+category:appetizers
error in https://mykoreankitchen.com/the-banchan-cookbook/
error in https://mykoreankitchen.com/essential-korean-cooking-ingredients/
saved {len(df)} recipes to {name}
https://mykoreankitchen.com/recipes/#recipes+category:beef korean_beef 1
got 23 recipes from https://mykoreankitchen.com/recipes/#recipes+category:beef
error in https://mykoreankitchen.com/the-banchan-cookbook/
error in https://mykoreankitchen.com/essential-korean-cooking-ingredients/
saved {len(df)} recipes to {name}
https://mykoreankitchen.com/recipes/#recipes+category:chicken korean_chicken 1
got 23 recipes from https://mykoreankitchen.com/recipes/#recipes+category:chicken
error in https://mykoreankitchen.com/the-banchan-cookbook/
error in https://mykoreankitchen.com/

# Just one cookbook

In [13]:
rules = lambda link: 'https://www.justonecookbook.com/' in link and '-' in link and len(link.split('/')) == 5
black_list = {
    ''
}

urls = [
    ('https://www.justonecookbook.com/categories/recipes/appetizer/', 'just_one_appetizer', 7),
    ('https://www.justonecookbook.com/categories/recipes/beverage/', 'just_one_beverage', 3),
    ('https://www.justonecookbook.com/categories/recipes/breakfast/', 'just_one_breakfast', 3),
    ('https://www.justonecookbook.com/categories/recipes/dessert/', 'just_one_dessert', 13),
    ('https://www.justonecookbook.com/categories/recipes/salad/', 'just_one_beverage', 6),
    ('https://www.justonecookbook.com/categories/recipes/side/', 'just_one_side', 11),
    ('https://www.justonecookbook.com/categories/recipes/snack/', 'just_one_snack', 2),
    ('https://www.justonecookbook.com/categories/recipes/soup-stew/', 'just_one_soup_stew', 7),
]

for url, name, pages in urls:
    print(url, name, pages)
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'just_one_cookbook', name)

https://www.justonecookbook.com/categories/recipes/appetizer/ just_one_appetizer 7
got 77 recipes from https://www.justonecookbook.com/categories/recipes/appetizer/
{'https://www.justonecookbook.com/seared-tuna-aburi-toro/', 'https://www.justonecookbook.com/easy-holiday-appetizers-salads-desserts/', 'https://www.justonecookbook.com/chicken-spring-rolls/', 'https://www.justonecookbook.com/gyoza-with-wings-hanetsuki-gyoza/', 'https://www.justonecookbook.com/sauteed-abalone/', 'https://www.justonecookbook.com/grilled-oysters-ponzu-sauce/', 'https://www.justonecookbook.com/kamaboko-japanese-fish-cake/#comments', 'https://www.justonecookbook.com/terms-and-conditions/', 'https://www.justonecookbook.com/japanese-clams-sake-steamed-clams/', 'https://www.justonecookbook.com/miso-dengaku/', 'https://www.justonecookbook.com/thai-chicken-lettuce-wraps/', 'https://www.justonecookbook.com/shrimp-egg-rolls/', 'https://www.justonecookbook.com/agedashi-tofu-2/', 'https://www.justonecookbook.com/tazukur

# Woks of Life

In [44]:
rules = lambda link: 'https://thewoksoflife.com/' in link and '-' in link and len(link.split('/')) == 5
black_list = {
    ''
}

urls = [
    ('https://thewoksoflife.com/category/recipes/beef-recipes/', 'woks_beef', 5),
    ('https://thewoksoflife.com/category/recipes/chicken/', 'woks_chicken', 9),
    ('https://thewoksoflife.com/category/recipes/pork/', 'woks_pork', 9),
    ('https://thewoksoflife.com/category/recipes/lamb/', 'woks_lamb', 1),
    ('https://thewoksoflife.com/category/recipes/fish-and-seafood/', 'woks_seafood', 6),
    ('https://thewoksoflife.com/category/recipes/eggs/', 'woks_eggs', 2),
    ('https://thewoksoflife.com/category/recipes/vegetables/', 'woks_veg', 8),
    ('https://thewoksoflife.com/category/recipes/tofu/', 'woks_tofu', 4),
    ('https://thewoksoflife.com/category/recipes/bread-and-pizza/', 'woks_bread', 3),
    ('https://thewoksoflife.com/category/recipes/noodles/', 'woks_noodles', 8),
    ('https://thewoksoflife.com/category/recipes/rice-recipes/', 'woks_noodles', 5),
    ('https://thewoksoflife.com/category/recipes/soups-and-stocks/', 'woks_soup', 5),
    ('https://thewoksoflife.com/category/recipes/beverages/', 'woks_beverage', 1),
    ('https://thewoksoflife.com/category/recipes/dessert/', 'woks_dessert', 6),
]

for url, name, pages in urls:
    print(f'url: {url} category: {name} pages: {pages}')
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'woks_of_life', name)

https://thewoksoflife.com/category/recipes/beef-recipes/ woks_beef 5
got 94 recipes from https://thewoksoflife.com/category/recipes/beef-recipes/
error in https://thewoksoflife.com/chinese-vegetables-asian-leafy-greens/
error in https://thewoksoflife.com/chinese-chives-scallions-aromatics-peppers/
error in https://thewoksoflife.com/wok-guide/
error in https://thewoksoflife.com/chinese-sauces-vinegars-oils/
error in https://thewoksoflife.com/rice-grains-and-flours/
error in https://thewoksoflife.com/chinese-dried-preserved-ingredients/
error in https://thewoksoflife.com/visual-recipe-index/
error in https://thewoksoflife.com/chinese-ingredients-glossary/
error in https://thewoksoflife.com/chinese-tofu-bean-curd/
error in https://thewoksoflife.com/how-to/chinese-cooking-tools
error in https://thewoksoflife.com/privacy-policy/
error in https://thewoksoflife.com/chinese-noodles-wrappers/
error in https://thewoksoflife.com/seasoned-soy-sauce/
error in https://thewoksoflife.com/work-with-us/

# Thai

In [82]:
rules = lambda link: 'https://www.eatingthaifood.com/' in link and '-' in link and len(link.split('/')) == 5
black_list = {
    ''
}

urls = [
    ('https://www.eatingthaifood.com/thai-recipes/', 'thai', 1)
]

for url, name, pages in urls:
    print(f'url: {url}, category: {name}, pages: {pages}')
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'thai_food', name)

url: https://www.eatingthaifood.com/thai-recipes/ category: thai pages: 1
got 30 recipes from https://www.eatingthaifood.com/thai-recipes/
error in https://www.eatingthaifood.com/contact-us/
error in https://www.eatingthaifood.com/thai-recipes/
error in https://www.eatingthaifood.com/start-here/
saved 27 recipes to thai


# spice the plate

In [86]:
rules = lambda link: 'https://www.spicetheplate.com/' in link and '-' in link and len(link.split('/')) == 6
black_list = {
    ''
}

urls = [
    ('https://www.spicetheplate.com/category/chicken/', 'spice_plate_chicken', 1),
    ('https://www.spicetheplate.com/category/beef/', 'spice_plate_beef', 1),
    ('https://www.spicetheplate.com/category/pork/', 'spice_plate_pork', 1),
    ('https://www.spicetheplate.com/category/veggie/', 'spice_plate_veggie', 1),
    ('https://www.spicetheplate.com/category/rice-noodle/', 'spice_plate_rice_noodle', 1),
    ('https://www.spicetheplate.com/category/seafood-and-other/', 'spice_plate_seafood', 1),
]

for url, name, pages in urls:
    print(f'url: {url}, category: {name}, pages: {pages}')
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'spice_plate', name)

url: https://www.spicetheplate.com/category/chicken/, category: spice_plate_chicken, pages: 1
got 38 recipes from https://www.spicetheplate.com/category/chicken/
error in https://www.spicetheplate.com/category/popular-classic/
error in https://www.spicetheplate.com/product-category/seasoning-sauce-and-spice/
error in https://www.spicetheplate.com/category/rice-noodle/
error in https://www.spicetheplate.com/guides/5-essential-cooking-guides-for-beginner-cooks/
error in https://www.spicetheplate.com/category/light-healthy/
error in https://www.spicetheplate.com/product-category/cooking/
error in https://www.spicetheplate.com/category/seafood-and-other/
error in https://www.spicetheplate.com/product-category/serving/
error in https://www.spicetheplate.com/category/budget-friendly/
error in https://www.spicetheplate.com/category/quick-easy/
error in https://www.spicetheplate.com/category/comfort-food/
error in https://www.spicetheplate.com/product-category/storage/
saved 26 recipes to spic

# glebe

In [94]:
rules = lambda link: 'https://glebekitchen.com/' in link and '-' in link and len(link.split('/')) == 5
black_list = {
    ''
}

urls = [
    ('https://glebekitchen.com/thai/', 'glebe_thai', 1),
    ('https://glebekitchen.com/vietnamese/', 'glebe_viet', 1),
    ('https://glebekitchen.com/korean/', 'glebe_korean', 1),
    ('https://glebekitchen.com/japanese/', 'glebe_japanese', 1),
    ('https://glebekitchen.com/indian/', 'glebe_indian', 1),

]

for url, name, pages in urls:
    print(f'url: {url}, category: {name}, pages: {pages}')
    recipe_url_list = get_recipes(url, pages, rules, black_list)
    save_csv(recipe_url_list, 'glebe_kitchen', name)

url: https://glebekitchen.com/thai/, category: glebe_thai, pages: 1
got 35 recipes from https://glebekitchen.com/thai/
error in https://glebekitchen.com/mexican-2/
error in https://glebekitchen.com/privacy-policy/
error in https://glebekitchen.com/recipe-index/
error in https://glebekitchen.com/main-course/
error in https://glebekitchen.com/sides-and-stuff/
error in https://glebekitchen.com/bbq-and-grilling/
saved 29 recipes to glebe_thai
url: https://glebekitchen.com/vietnamese/, category: glebe_viet, pages: 1
got 27 recipes from https://glebekitchen.com/vietnamese/
error in https://glebekitchen.com/mexican-2/
error in https://glebekitchen.com/privacy-policy/
error in https://glebekitchen.com/recipe-index/
error in https://glebekitchen.com/main-course/
error in https://glebekitchen.com/sides-and-stuff/
error in https://glebekitchen.com/bbq-and-grilling/
saved 21 recipes to glebe_viet
url: https://glebekitchen.com/korean/, category: glebe_korean, pages: 1
got 23 recipes from https://gl

In [93]:
url_ = 'https://glebekitchen.com/thai/'

req = requests.get(url_)
soup = BeautifulSoup(req.content)
for item in soup.find_all('a', href=True):
    link = item['href']
    print(link)
    if rules(link):
        print(link)

https://glebekitchen.com/
#content
https://glebekitchen.com
https://glebekitchen.com/recipe-index/
https://glebekitchen.com/mexican/
https://glebekitchen.com/bbq-and-grilling/
https://glebekitchen.com/french/
https://glebekitchen.com/indian/
https://glebekitchen.com/italian/
https://glebekitchen.com/japanese/
https://glebekitchen.com/korean/
https://glebekitchen.com/mexican-2/
https://glebekitchen.com/thai/
https://glebekitchen.com/vietnamese/
https://glebekitchen.com/appetizers/
https://glebekitchen.com/main-course/
https://glebekitchen.com/noodles/
https://glebekitchen.com/poultry/
https://glebekitchen.com/meat/
https://glebekitchen.com/seafood/
https://glebekitchen.com/vegetarian/
https://glebekitchen.com/sides-and-stuff/
https://glebekitchen.com/about/
https://glebekitchen.com/
https://glebekitchen.com/thai-beef-noodles/
https://glebekitchen.com/thai-beef-noodles/
https://glebekitchen.com/beef-and-broccoli-thai-style/
https://glebekitchen.com/beef-and-broccoli-thai-style/
https://g

In [87]:
link = 'https://glebekitchen.com/laksa-spicy-coconut-noodle-soup/'
scraper = scrape_me(link, wild_mode = True)