In [2]:
#Imports
import urllib.request as url
from bs4 import BeautifulSoup
import requests as rq
import nltk as nltk
import pandas as pd
from fractions import Fraction
import time
import inflect
import numpy as np

In [3]:
#For parsing:
#Just interested in the ingredients in the recipie that contribute flavor
#i.e. strawberry cake would consist of strawberries, flour, salt, butter, etc.
#but only strawberry is a flavor
#this is just a way to make the problem simpiler to begin with
engine = inflect.engine()

flavors = ['vanilla','chocolate','cocoa','rum','banana','strawberry',
        'cherry','yam','apple','coconut','raspberry','blueberry',
        'pineapple','peach','pear','walnut','pecan','cashew',
        'hazelnut','coffee','expresso','cinnamon','cardamom','clove',
        'nutmeg','allspice','pistachio','plum','vodka','champagne',
        'tequila','whiskey','whisky','cayenne','pepper','ginger','brandy',
        'tomato','carrot','cucumber','rhubarb', 'seseame','lime','mascarpone',
        'gingerbread','cranberry','pumpkin','blackberry','orange','coconut',
        'zucchini','lemon','tahini','peanut','almond','caramel',
        'lavender','passionfruit','apricot','anise','chestnut',
        'maple','pomegranate','chai','matcha','date','raisin','rose','fig','oat',
        'cayenne','chamomile','pretzel','grape','mochi','basil','mint','graham',
        'lemongrass','earl','avocado','ricotta','marshmallow','cola','mango',
        'hibiscus','honey','oatmeal','saffron','curry','tarragon',
        'rosemary','thyme','oregano','bay','cilantro','fennel','garlic',
        'parsley','sage',]

plurals = list(map(engine.plural,flavors))
sorted(flavors, key=str.lower)

['allspice',
 'almond',
 'anise',
 'apple',
 'apricot',
 'avocado',
 'banana',
 'basil',
 'bay',
 'blackberry',
 'blueberry',
 'brandy',
 'caramel',
 'cardamom',
 'carrot',
 'cashew',
 'cayenne',
 'cayenne',
 'chai',
 'chamomile',
 'champagne',
 'cherry',
 'chestnut',
 'chocolate',
 'cilantro',
 'cinnamon',
 'clove',
 'cocoa',
 'coconut',
 'coconut',
 'coffee',
 'cola',
 'cranberry',
 'cucumber',
 'curry',
 'date',
 'earl',
 'expresso',
 'fennel',
 'fig',
 'garlic',
 'ginger',
 'gingerbread',
 'graham',
 'grape',
 'hazelnut',
 'hibiscus',
 'honey',
 'lavender',
 'lemon',
 'lemongrass',
 'lime',
 'mango',
 'maple',
 'marshmallow',
 'mascarpone',
 'matcha',
 'mint',
 'mochi',
 'nutmeg',
 'oat',
 'oatmeal',
 'orange',
 'oregano',
 'parsley',
 'passionfruit',
 'peach',
 'peanut',
 'pear',
 'pecan',
 'pepper',
 'pineapple',
 'pistachio',
 'plum',
 'pomegranate',
 'pretzel',
 'pumpkin',
 'raisin',
 'raspberry',
 'rhubarb',
 'ricotta',
 'rose',
 'rosemary',
 'rum',
 'saffron',
 'sage',
 'sese

# Define Functions

In [4]:
def findflavor(tokens):
    
    flavor = None
    
    for string in tokens:
        if classifytoken(string) is False:
            continue
        else:
            flavor = classifytoken(string)
    
    return flavor

In [5]:
def classifytoken(string):
    if string.lower() in flavors:
        return string.lower()
    elif string.lower() in plurals:
        return engine.plural(string.lower())
    else:
        return False

In [6]:
def readrecipe(page_url):
    
    page = url.urlopen(page_url)
    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find_all('li', attrs={'class': 'ingredient'})
    recipe_name = soup.find('h1', attrs={'itemprop': 'name'})
    
    flavor_list = []

    for ingred in name_box:
        
        tokens = nltk.word_tokenize(ingred.text)
        flavor = findflavor(tokens)
        
        if flavor is not None:
            flavor_list.append(flavor)
            
    flavor_list = ' '.join(flavor_list)
            
    star_box = soup.find('meta', attrs={'itemprop': 'ratingValue'})
    rating = float(star_box['content'])

    succ_box = soup.find('div', attrs={'class': 'prepare-again-rating'})
    
    try:
        success = float(succ_box.text[0:3])/100
    except:
        success = 0
    
    ingredients = {'rating' : {recipe_name.text : rating},
                   'success' : {recipe_name.text : success},
                   'flavors' : {recipe_name.text : flavor_list}}
    
    df = pd.DataFrame.from_dict(ingredients)
    
    return df

In [7]:
def readsite(site,sitebase,numpages):

    for pagenum in range(numpages):

        print('Reading Page: '+str(pagenum+1))
        
        url_page = site+str(pagenum+1)

        try:
            page = url.urlopen(url_page)
        except Exception as e:
            print(url_page)
            print(e)
            continue
            
        soup = BeautifulSoup(page, 'html.parser')
        recipe_panel = soup.find_all('div', attrs={'class': 'controls'})

        for recipe_ext in recipe_panel:
            recipe_url = sitebase + recipe_ext.a['href']
            
            try:
                ingred = readrecipe(recipe_url)
            except Exception as e:
                print(recipe_url)
                print(e)
                continue
                
            try:
                allrecipes = pd.concat([allrecipes,ingred])
            except:
                allrecipes = ingred

            time.sleep(1)
        
        time.sleep(1)
     
    allrecipes = allrecipes.fillna(False)
    
    return allrecipes

# Run Scrape

In [7]:
cake_url = 'https://www.epicurious.com/search/?meal=dessert&content=recipe&page='
site_base = 'https://www.epicurious.com'
allrecipes = readsite(cake_url,site_base,417)
allrecipes.to_csv('recipes.csv')

Reading Page: 1
Reading Page: 2
Reading Page: 3
Reading Page: 4
Reading Page: 5
Reading Page: 6
Reading Page: 7
Reading Page: 8
Reading Page: 9
Reading Page: 10
Reading Page: 11
Reading Page: 12
Reading Page: 13
Reading Page: 14
Reading Page: 15
Reading Page: 16
Reading Page: 17
Reading Page: 18
Reading Page: 19
Reading Page: 20
Reading Page: 21
Reading Page: 22
Reading Page: 23
Reading Page: 24
Reading Page: 25
https://www.epicurious.com/recipes/food/views/chocolate-plum-cake56390135
HTTP Error 404: Not Found
Reading Page: 26
Reading Page: 27
Reading Page: 28
Reading Page: 29
Reading Page: 30
Reading Page: 31
Reading Page: 32
Reading Page: 33
Reading Page: 34
Reading Page: 35
Reading Page: 36
Reading Page: 37
Reading Page: 38
https://www.epicurious.com/recipes/food/views/almond-pistachio-nougatine-51228610
'NoneType' object is not subscriptable
Reading Page: 39
Reading Page: 40
Reading Page: 41
Reading Page: 42
Reading Page: 43
Reading Page: 44
https://www.epicurious.com/recipes/food/