# BWB Recipe Data Extraction

## Setup

In [2]:
import json
import re
from collections import namedtuple
from string import capwords
from unicodedata import numeric

import bs4
import requests
from functional import seq

## Sample data for validation

In [62]:
# Download one known episode page to develop the parser against.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page flow into the parsing cells below.
raw_episode = requests.get(
    'https://www.bingingwithbabish.com/recipes/2017/8/22/parksandrecburger',
    timeout=30,
)
raw_episode.raise_for_status()

In [63]:
# Build the parse tree from the downloaded bytes with the 'html.parser' backend.
soup = bs4.BeautifulSoup(markup=raw_episode.content, features='html.parser')

In [64]:
# Display the text of the page's <title> element.
soup.title.string

'Parks & Rec Burger Cookoff — Binging With Babish'

In [65]:
# Headings (h1-h5) whose text is exactly 'Ingredients' mark the start of a recipe;
# keep the first one as the sample to experiment with.
heading_tags = ['h1','h2','h3','h4','h5']
recipe_locations = soup.find_all(heading_tags, string='Ingredients')
loc1 = recipe_locations[0]

In [66]:
# The next heading after the sample 'Ingredients' heading carries the method title.
next_heading = loc1.find_next(['h1','h2','h3','h4','h5'])
method = next_heading.string
method

"Method: Chris Traeger's East Meets West Turkey Burger"

## Parsing ingredients in recipes

In [67]:
# One parsed ingredient line: quantity, unit, display name and the original text.
Ingredient = namedtuple('Ingredient', 'qty unit name raw')

# Optional captured unit, then an optional plural suffix, an optional trailing
# period and a word boundary (e.g. 'cup', 'cups', 'tbsp.').
# Note that the leading `\s?` belongs to the first alternative (`g`) only.
units_pattern = r'(\s?g|kg|oz|ounce|tbsp|Tbsp|tablespoon|tsp|teaspoon|cup|lb|pound|small|medium|large|whole|half)?(?:s|es)?\.?\b'

# Optional captured quantity: digits, whitespace, '-', '.', '/' and the vulgar
# fraction characters U+2150..U+215E and U+00BC..U+00BE.
_qty_pattern = r'([-\.\/\s0-9\u2150-\u215E\u00BC-\u00BE]+)?'

# Groups: (1) quantity, (2) unit, (3) name.  Anything up to and including
# ' of ' is skipped before the name, and the name stops at the first comma.
pattern = re.compile(
    r'^(?:{QTY}{UNITS})?(?:.*\sof\s)?\s?(.+?)(?:,|$)'.format(QTY=_qty_pattern, UNITS=units_pattern),
    flags=re.UNICODE)


def _parse_quantity(text):
    """Convert the matched quantity text to a float where possible.

    Returns None for blank text, a float for plain numbers ('4', '10', '1.5'),
    single numeric characters ('½') and a number followed by a vulgar fraction
    ('2 ½'), and otherwise the stripped text unchanged (e.g. '1/2', '2-3').
    """
    text = text.strip()
    if not text:
        return None

    # Plain numbers first; summing the last digit onto the rest would turn
    # '10' into 1.0 and '12' into 3.0.
    try:
        return float(text)
    except ValueError:
        pass

    # Otherwise the last character may be a vulgar fraction such as '½'.
    try:
        fraction = numeric(text[-1])
    except ValueError:
        return text  # e.g. a lone '-' or '/': keep it as a string

    whole = text[:-1].strip()
    if not whole:
        return fraction
    try:
        return float(whole) + fraction
    except ValueError:
        return text  # e.g. '1/2' or '2-3': let it be a string


def parse_ingredient(i):
    """Parse one ingredient line into an ``Ingredient`` tuple.

    Parameters
    ----------
    i : str or bs4 element
        Either the ingredient text itself or a node exposing ``.string`` and
        ``.stripped_strings``.

    Returns
    -------
    Ingredient
        ``qty`` is a float when the quantity could be converted, the matched
        text when it could not, or None when no quantity was found; ``unit`` is
        the matched unit or None; ``name`` is the title-cased ingredient name;
        ``raw`` is the cleaned-up input text.  When the line cannot be parsed
        a warning is printed and only ``raw`` is populated.
    """
    if isinstance(i, str):
        s = i
    elif i.string is None:
        # multiple tags in child,
        # bs4 gets confused per https://www.crummy.com/software/BeautifulSoup/bs4/doc/#string
        s = ' '.join(i.stripped_strings)
    else:
        s = i.string.strip()
    raw = s.replace('\xa0','').strip()

    # Drop parenthesised asides and normalise the typographic apostrophe.
    clean = re.sub(r'\(.+?\)', '', raw).replace('’',"'")

    parsed = pattern.match(clean)
    if not parsed:
        print("WARN: Unable to parse ingredient '{}'".format(raw))
        return Ingredient(None, None, None, raw)

    qty, unit, name = parsed.groups()
    if qty is not None:
        qty = _parse_quantity(qty)

    return Ingredient(qty, unit, capwords(name), raw)

# Tests to validate parse_ingredient!!!
assert parse_ingredient('4 eggs') == Ingredient(4.0, None, 'Eggs', '4 eggs')
assert parse_ingredient('2 ½ pounds of full fat cream cheese, cut') == Ingredient(2.5, 'pound', 'Full Fat Cream Cheese', '2 ½ pounds of full fat cream cheese, cut')
assert parse_ingredient('6 stalks celery') == Ingredient(6.0, None, 'Stalks Celery', '6 stalks celery')
# Multi-digit quantities must be read as a whole number, not digit-by-digit.
assert parse_ingredient('10 eggs') == Ingredient(10.0, None, 'Eggs', '10 eggs')
assert parse_ingredient('12 ounces of bacon') == Ingredient(12.0, 'ounce', 'Bacon', '12 ounces of bacon')

# Parse every child node of the first list (<ul>/<ol>) that follows the sample
# 'Ingredients' heading, then display the result.
ingredient_nodes = loc1.find_next_sibling(['ul','ol']).children
ingredients = seq(ingredient_nodes).map(parse_ingredient)

ingredients

qty,unit,name,raw
1.0,small,Eggplant,1 small eggplant
,,Olive Oil,Olive oil
,,Kosher Salt,Kosher salt
,,Ground Black Pepper,Ground black pepper
1.0,large,Papaya,1 large papaya
0.333333,cup,Apple Cider Vinegar,⅓ cup apple cider vinegar
0.333333,cup,Golden Raisins,⅓ cup golden raisins
,,Saffron Threads,Saffron threads
4.0,,Egg Yolks,4 egg yolks
2.0,,Lemon,2 squeezes of lemon


## Fetch the current list of BWB episodes

In [14]:
# Download the recipe index page.  The timeout prevents an indefinite hang and
# raise_for_status() stops on an HTTP error rather than parsing an error page.
raw_episode_list = requests.get('https://www.bingingwithbabish.com/recipes/', timeout=30)
raw_episode_list.raise_for_status()
soup = bs4.BeautifulSoup(raw_episode_list.content, 'html5lib')

In [15]:
# Display the index page's <title> text with surrounding whitespace removed.
soup.title.string.strip()

'Recipes — Binging With Babish'

In [16]:
# Collect the href of every anchor under '.main-image-wrapper' inside the
# first <div class="recipe-row"> of the index page.
recipe_row = soup.find('div', class_='recipe-row')
episode_anchors = recipe_row.select('.main-image-wrapper a')
episode_links = seq(episode_anchors).map(lambda atag: atag.get('href'))

## Cache episodes/recipes content locally

```python
# This cell is commented out to prevent accidental runs
import time
import os

# Download every episode page once and store the raw bytes under
# tmp/raw-episodes/, mirroring the site-relative path of each link.
for link in episode_links:
    response = requests.get('https://www.bingingwithbabish.com'+link, timeout=30)
    # Never cache an HTTP error page as if it were an episode.
    response.raise_for_status()
    episodeHTML = response.content
    filename = 'tmp/raw-episodes/'+link.lstrip('/')
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as f:
        f.write(episodeHTML)
    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(3)
```

## Parse all episodes into babish.json

In [59]:
def parse_episode(link):
    """Parse one locally cached episode page into a dict.

    Parameters
    ----------
    link : str
        Site-relative episode path; the cached HTML is read from
        ``tmp/raw-episodes/<link without leading slash>``.

    Returns
    -------
    dict
        Keys ``episode_name``, ``episode_link``, ``youtube_link`` (None when
        the page has no usable video block) and ``recipes``, a list of
        ``{'method': str, 'ingredients': list}`` dicts, one per heading whose
        text contains 'Ingredients'.
    """
    heading_tags = ['h1','h2','h3','h4','h5']

    path = 'tmp/raw-episodes/'+link.lstrip('/')
    with open(path, 'rb') as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')

    episode_name = soup.title.string.strip().replace(' — Binging With Babish','')

    # find() returns None when the page has no video block; warn instead of crashing.
    video_block = soup.find('div', class_='video-block')
    block_json = video_block.get('data-block-json') if video_block is not None else None
    if block_json:
        youtube_link = json.loads(block_json).get('url')
    else:
        print("WARN: Could not locate video block (Episode {0})".format(episode_name))
        youtube_link = None

    ep = {
        'episode_name': episode_name,
        'episode_link': 'https://www.bingingwithbabish.com'+link,
        'youtube_link': youtube_link,
        'recipes': []
    }

    for loc in soup.find_all(heading_tags, string=re.compile('Ingredients')):
        # The heading after 'Ingredients' names the method.  Joining
        # stripped_strings also copes with headings that hold several nodes,
        # where .string would be None.
        method_heading = loc.find_next(heading_tags)
        method = ' '.join(method_heading.stripped_strings) if method_heading is not None else ''
        if not method:
            method = 'Default - {}'.format(episode_name)

        # find_next_sibling() returns None when no list follows the heading.
        ingredient_list = loc.find_next_sibling(['ul','ol'])
        nodes = list(ingredient_list.children) if ingredient_list is not None else []

        if nodes:
            ingredients = [parse_ingredient(node) for node in nodes]
        else:
            print("WARN: Could not locate ingredients for {0} (Episode {1})".format(method, episode_name))
            ingredients = []

        ep['recipes'].append({
            'method': method,
            'ingredients': ingredients,
        })

    return ep


episodes = [parse_episode(link) for link in episode_links]

In [60]:
# Write all parsed episodes to babish.json.  `json` comes from the setup
# imports at the top of the notebook, so cells that use it earlier also work
# on a fresh kernel.  Ingredient namedtuples are tuples, so the json module
# serialises them as arrays in field order.
with open('babish.json', 'w') as f:
    json.dump(episodes, f, indent=2)

## Parse a single episode

In [58]:
# Parse one cached episode end-to-end and display the resulting dict.
link = '/recipes/2017/2/20/il-timpano-inspired-by-big-night'
path = 'tmp/raw-episodes/'+link.lstrip('/')
with open(path, 'rb') as f:
    soup = bs4.BeautifulSoup(f, 'html.parser')

episode_name = soup.title.string.strip().replace(' — Binging With Babish','')

# find() returns None when the page has no video block; warn instead of crashing.
video_block = soup.find('div', class_='video-block')
block_json = video_block.get('data-block-json') if video_block is not None else None
if block_json:
    youtube_link = json.loads(block_json).get('url')
else:
    print("WARN: Could not locate video block (Episode {0})".format(episode_name))
    youtube_link = None

ep = {
    'episode_name': episode_name,
    'episode_link': 'https://www.bingingwithbabish.com'+link,
    # Stored so this dict has the same keys as the all-episodes parse.
    'youtube_link': youtube_link,
    'recipes': []
}

recipe_locations = soup.find_all(['h1','h2','h3','h4','h5'], string=re.compile('Ingredients'))

for loc in recipe_locations:
    # Joining stripped_strings also copes with headings that hold several
    # nodes, where .string would be None.
    method_heading = loc.find_next(['h1','h2','h3','h4','h5'])
    method = ' '.join(method_heading.stripped_strings) if method_heading is not None else ''
    if not method:
        method = 'Default - {}'.format(episode_name)

    # find_next_sibling() returns None when no list follows the heading.
    ingredient_list = loc.find_next_sibling(['ul','ol'])
    ingredients = list(ingredient_list.children) if ingredient_list is not None else []

    if len(ingredients) > 0:
        ingredients = [parse_ingredient(node) for node in ingredients]
    else:
        print("WARN: Could not locate ingredients for {0} (Episode {1})".format(method, episode_name))

    recipe = {
        'method': method,
        'ingredients': ingredients,
    }

    ep['recipes'].append(recipe)

ep

{'episode_link': 'https://www.bingingwithbabish.com/recipes/2017/2/20/il-timpano-inspired-by-big-night',
 'episode_name': 'Il Timpano inspired by Big Night',
 'recipes': [{'ingredients': [Ingredient(qty=1.0, unit='pound', name='Ground Chuck', raw='1 pound ground chuck'),
    Ingredient(qty='1/2', unit='pound', name='Ground Short Rib', raw='1/2 pound ground short rib'),
    Ingredient(qty=1.0, unit='pound', name='Ground Pork', raw='1 pound ground pork'),
    Ingredient(qty=1.0, unit='pound', name='Ground Veal', raw='1 pound ground veal'),
    Ingredient(qty=3.0, unit='ounce', name='Torn Italian Bread', raw='3 ounces torn Italian bread'),
    Ingredient(qty='1/2', unit='cup', name='Buttermilk', raw='1/2 cup buttermilk, plus more as needed'),
    Ingredient(qty=1.0, unit='medium', name='Onion', raw='1 medium onion, minced'),
    Ingredient(qty=4.0, unit=None, name='Cloves Garlic', raw='4 cloves garlic, finely minced'),
    Ingredient(qty=4.0, unit='ounce', name='Parmesan Cheese', raw='4 o

## Experimenting with units/measures

In [18]:
# (name, space-separated abbreviations, size as stored under the 'oz' key)
_MEASURE_ROWS = (
    ('drop',        'dr gt gtt',   1.0/576),
    ('smidgen',     'smdg smi',    1.0/256),
    ('pinch',       'pn',          1.0/128),
    ('dash',        'ds',          1.0/64),
    ('saltspoon',   'ssp scruple', 1.0/32),
    ('coffeespoon', 'csp',         1.0/16),
    ('dram',        'dr',          1.0/8),
    ('teaspoon',    'tsp t',       1.0/6),
    ('tablespoon',  'tbsp T',      1.0/2),
    ('ounce',       'oz fl.oz',    1.0),
    ('wineglass',   'wgf',         2.0),
    ('teacup',      'tcf gill',    4.0),
    ('cup',         'C',           8.0),
    ('pint',        'pt',          16.0),
    ('quart',       'qt',          32.0),
    ('pottle',      'pot',         64.0),
    ('gallon',      'gal',         128.0),
    ('pounds',      'lbs',         16.0),
)

# Measure name -> {'abrv': abbreviations, 'oz': size}, built from the table above.
measures = {
    name: {'abrv': abbreviations, 'oz': size}
    for name, abbreviations, size in _MEASURE_ROWS
}

In [19]:
items = []
for m, d in measures.items():
    items.append(m)
    items += d['abrv'].split(' ')
items.sort()
print('|'.join(items))

C|T|coffeespoon|csp|cup|dash|dr|dr|dram|drop|ds|fl.oz|gal|gallon|gill|gt|gtt|lbs|ounce|oz|pinch|pint|pn|pot|pottle|pounds|pt|qt|quart|saltspoon|scruple|smdg|smi|smidgen|ssp|t|tablespoon|tbsp|tcf|teacup|teaspoon|tsp|wgf|wineglass
