In [1]:
import json
import requests
import pickle
from bs4 import BeautifulSoup
from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator
from collections import defaultdict
from itertools import islice

### Scraping Yelp Menus

In [None]:
def scrape_menu(business_id):
    """Download and parse the Yelp menu page of one business.

    business_id: Yelp business id slug, e.g. 'garaje-san-francisco'.

    Returns the BeautifulSoup document built from the body of
    https://www.yelp.com/menu/<business_id>.

    Raises requests.exceptions.HTTPError when Yelp answers with a 4xx/5xx
    status, so an error page is never parsed as if it were a menu.
    """
    link = 'https://www.yelp.com/menu/{0}'.format(business_id)
    page = requests.get(link)
    ## fail loudly on a missing/blocked page instead of returning an empty menu later
    page.raise_for_status()
    ## name the parser explicitly so the result does not depend on what is installed
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

def get_menu_items(soup):
    """Map every dish name on a parsed menu page to its description.

    soup: object whose select('.menu-item-details') yields one element per dish;
          each element exposes the name in .h4 and, optionally, a description in .p.

    Returns a dict {dish name: description}; the description is "" when the
    element has no <p>. A later dish with the same name replaces an earlier one.
    """
    menu = {}
    ## section titles live in .menu-section-header, so this selector yields dishes only
    for entry in soup.select('.menu-item-details'):
        name = entry.h4.text.strip()
        menu[name] = entry.p.text.strip() if entry.p else ""
    return menu

## Code to use:

In [3]:
## Read credentials
with open('credentials/yelp_config.json') as cred:
    creds = json.load(cred)

## the authenticator and client only need the parsed dict, not the open file
auth = Oauth1Authenticator(**creds)
client = Client(auth)

In [12]:
## Make initial call to get # responses
params = dict(term='restaurants')
bus_response = client.search('San Francisco', **params)

## businesses returns 20 at a time so will need to repeat calls with offset
pages = bus_response.total // 20 + 1


In [21]:
## Probe a deep offset: page 51 at 20 results per page.
## The output recorded for this cell is an InvalidParameter error.
## NOTE(review): presumably the search API caps how far offset may go -- confirm against the Yelp docs.
params = {
    'term' : 'restaurants',
    'offset' : 51*20   
    }
response = client.search('San Francisco', **params)
    

InvalidParameter: 

In [19]:
## Show the id and name of every business in the current search response
for bus in response.businesses:
    print bus.id, bus.name

frog-hollow-farm-market-and-cafe-san-francisco Frog Hollow Farm Market & Cafe
little-gem-san-francisco Little Gem
new-alternatives-cafe-san-francisco-2 New Alternatives Cafe
higher-grounds-coffee-house-san-francisco Higher Grounds Coffee House
joo-mak-san-francisco Joo Mak
fumi-japanese-curry-and-ramen-san-francisco-2 Fumi Japanese Curry & Ramen
ideale-san-francisco Ideale
calabria-bros-san-francisco-2 Calabria Bros
pissed-off-petes-san-francisco Pissed Off Pete's
chai-bar-by-david-rio-san-francisco Chai Bar By David Rio
perilla-san-francisco-2 Perilla
proposition-chicken-san-francisco Proposition Chicken
pho-huynh-hiep-2-kevins-noodle-house-san-francisco Pho Huynh Hiep 2 - Kevin's Noodle House
orexi-san-francisco Orexi
vikings-giant-submarines-san-francisco-2 Viking's Giant Submarines
cafe-la-flore-san-francisco-3 Cafe La Flore
volare-pizza-and-restaurant-san-francisco Volare Pizza & Restaurant
little-sheep-mongolian-hot-pot-san-francisco-4 Little Sheep Mongolian Hot Pot
eastern-baker

In [None]:

## For every page of responses: get business ids and scrape the menus.
## NOTE(review): an earlier probe with offset 51*20 returned InvalidParameter, so this
## loop will presumably fail once page*20 passes the API's offset limit -- confirm and cap `pages`.
menus_by_business = {}  ## business id -> scraped menu, kept so each menu is not overwritten by the next

for page in xrange(pages):
    params = {
    'term' : 'restaurants',
    'offset' : page*20
    }
    response = client.search('San Francisco', **params)

    ## iterate over the business objects themselves; they are not list indices
    for bus in response.businesses:
        if bus.menu_date_updated: ## check menu available on Yelp
            ## for every business id: go to menu page and scrape dish names and descriptions
            soup = scrape_menu(bus.id)
            menu = get_menu_items(soup)
            menus_by_business[bus.id] = menu

        ##insert into db

## Experimenting:

In [41]:
# read API keys
# Same client setup as the credentials cell earlier in the notebook, but reading
# 'config_secret.json'; `creds`, `auth` and `client` are rebound here.
with open('config_secret.json') as cred:
    creds = json.load(cred)
    auth = Oauth1Authenticator(**creds)
    client = Client(auth)

# Search parameters; the category filter is left commented out, so only 'term' is sent.
params = {
    'term' : 'restaurants'#,
    #'category_filter' :'gastropubs'
    }

# First page of restaurant results for San Francisco.
response = client.search('San Francisco', **params)

In [42]:
response.total ## businesses returns 20 at a time so will need to repeat calls with offset

8190

In [43]:
response.total*1./20

409.5

In [44]:
## ids of the businesses in this response that have a menu on Yelp
num_hits = len(response.businesses)
bus_ids = [
    response.businesses[i].id
    for i in xrange(num_hits)
    if response.businesses[i].menu_date_updated  ## check menu available on Yelp
]

In [45]:
bus_ids

[u'hogwash-san-francisco',
 u'fog-harbor-fish-house-san-francisco-2',
 u'marlowe-san-francisco-2',
 u'gary-danko-san-francisco',
 u'so-san-francisco-4',
 u'chez-maman-san-francisco-9',
 u'hrd-san-francisco-2',
 u'the-italian-homemade-company-san-francisco-4',
 u'causwells-san-francisco-5',
 u'lord-george-san-francisco',
 u'stones-throw-san-francisco-2']

In [29]:
len(bus_ids)

13

In [46]:
test_link = 'https://www.yelp.com/menu/garaje-san-francisco'

In [47]:
page = requests.get(test_link)
soup = BeautifulSoup(page.content)

In [54]:
menu = {}
for ms in soup.select('.menu-item-details'):
    item = ms.h4.text.strip() ## just menu items --> section titles are in .menu-section-header
    #print item
    if ms.p:
        desc= ms.p.text.strip()
        #print desc
        menu[item] = desc
    else: menu[item] = ""    
print menu        

{u'Jose Wong': u'endorsed by travis. hacked marinated chicken thighs, lettuce, cabbage, peppers, cilantro, broken corn chips, fiery hoisin chile pressing, peanuts, sesame seeds', u'Bottled Sweet Tea': '', u'Mordaditas': u'gourmet? no way! great w/ beer? crispy little rolled beef taquitos, fundido sauce, pico, gauc', u'Mex Coke or Mex Pepsi': '', u'Grilled Fish and Guac': u'sustainable tilapia , cabbage, cilantro lime mayo, la palma corn torts', u'Jerritos': '', u"Surfer Leo's Fish and Chip": u'crispy battered sustainable tilapia, fries, avo, jack, chipotle mayo', u"Pete's No Meat*": '', u'Wild Ahi and Aho Salpicon': u'hacked seared line caught ahi, cilantro lime vin, chipotle aioli, our pot chips', u'Cali Steak': u'skirt steak asada, jack, fries, avo, pico', u'Diet Soda': '', u'Wild Tuna Tostadita': u'seared line caught ahi, cabbage, guac, pico, cilantro lime vin, crispy onions, corn tort, chipotle aioli', u'Carnitas': '', u'Skirt Steak': '', u'Garaje Potato Chips': u'hand cut kennebec

In [130]:
## Print the title (<h2>) of every menu section header on the page
for section in soup.select('.menu-section-header'):
    print section.h2.text.strip()  

Goes w/ Beer
Salads
Drive In Cheeseburger
Zapatos
Tacos
Not Beer Or Wine


In [111]:
## For each .menu-section, look at section.div and print its <h4> text when that
## div's class list is exactly ['media-block', 'menu-item'].
## NOTE(review): section.div is presumably only the first <div> in the section, which
## would explain why just one item per matching section is printed -- confirm.
for section in soup.select('.menu-section'):
    if section.div['class'] == ['media-block', 'menu-item']:
        print section.div.h4.text.strip()

## TODO: how to associate each menu item with its section header?
## Item descriptions are also available from <p class='menu-item-details-description'>.

Jose Wong
Drive In Cheeseburger
Pete's No Meat*


In [19]:
def scrape_menu(business_id):
    """Download and parse the Yelp menu page of one business.

    business_id: Yelp business id slug, e.g. 'hogwash-san-francisco'.

    Returns the BeautifulSoup document built from the body of
    https://www.yelp.com/menu/<business_id>.

    Raises requests.exceptions.HTTPError when Yelp answers with a 4xx/5xx
    status, so an error page is never parsed as if it were a menu.
    """
    link = 'https://www.yelp.com/menu/{0}'.format(business_id)
    page = requests.get(link)
    ## fail loudly on a missing/blocked page instead of returning an empty menu later
    page.raise_for_status()
    ## name the parser explicitly so the result does not depend on what is installed
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

def get_menu_items(soup):
    """List the dish names found on a parsed menu page.

    soup: object whose select('.menu-item-details') yields one element per dish,
          each carrying the dish name in .h4.

    Returns the stripped names as a list, in the order select() yields them.
    Descriptions are not collected by this version.
    """
    ## section titles live in .menu-section-header, so this selector yields dishes only
    return [entry.h4.text.strip() for entry in soup.select('.menu-item-details')]

In [20]:
def collect_all_menus(bus_ids):
    """Scrape the menu page of every business id and gather the results.

    bus_ids: iterable of Yelp business id slugs.

    Returns a dict mapping each business id to whatever get_menu_items
    extracts from that business's page (fetched with scrape_menu).
    """
    return {bus_id: get_menu_items(scrape_menu(bus_id)) for bus_id in bus_ids}
        

In [36]:
test_soup = scrape_menu('hogwash-san-francisco')
test_menu = get_menu_items(test_soup)

In [21]:
menu_dict = collect_all_menus(bus_ids[:10])


In [22]:
def take(n, iterable):
    """Return the first n items of `iterable` as a list (fewer if it runs out)."""
    head = islice(iterable, n)
    return [element for element in head]

In [23]:
print take(5, menu_dict.iteritems())

[(u'the-italian-homemade-company-san-francisco-4', [u'Piadina', u'Fresh Pasta', u'Salads', u'Vanini Chocolate', u'Gnocchi Roll']), (u'the-fine-mousse-san-francisco', [u'Dibon Brut Reserva, Cava, Penedes, Spain', u'Cortenova, Prosecco Italy', u'Gruet, Blanc de Blanc, New Mexico', u'Graham Beck Brut Rose, South Africa', u'Graham Beck Brut Zero 2009, South Africa', u'Mumm, Devaux Ranch 2010, Napa, California', u'Moutard, Grand Cuvee, Champagne, France', u'Duck Fat or Rice Bran Oil', u'Hibiscus & Pink Pepper', u'Garlic & Rosemary', u'Chipotle', u'Parmesan & Truffle', u'Mustard & Apple', u'Habanero & Dill', u'Yuzu Pepper', u'Grapefruit & Curry', u'Earl Grey & Peaches', u'Santa Julia Brut, Blanc de Blanc, Argentina', u"Roederer Estate L'Ermitage 2007, Andersen Valley California", u'Schramsberg, Blanc de Blanc, Napa California', u'Schramsberg, Blanc de Noir, Napa California', u'Les Rocailles Brut, Savoie, France', u'Caveau du Mont July, bubbly red Gamay, France', u'AR Lenoble, Brut Intense, C

In [26]:
menu_dict['so-san-francisco-4']

[u'Wings',
 u'Diced',
 u'Mushrooms',
 u'Fish (basa)',
 u'Beef',
 u'Calamari',
 u'Dried Sauteed String Beans',
 u'Spinach With Garlic',
 u'Fried Tofu',
 u'Cherry Pork / Chicken',
 u'Coroque/Curry Coroque/Edmame Coroque',
 u'Potstickers(limited Availability)',
 u'Fried Shrimp Dumplings',
 u'Fried Pork Dumplings',
 u'Fried Chicken Dumplings',
 u'Fried Vegetarian Dumplings',
 u'Three Deluxe',
 u'Black Bean Sauce',
 u'So Black Bean',
 u'Peanut',
 u'Shredded Pork With Garlic (stir Fried)',
 u'Pork With Hot Peppers (stir Fried)',
 u'Mao Pao',
 u'Curry',
 u'Seafood',
 u'Beef',
 u'Chicken',
 u'Pork With Chinese Pickle',
 u'Pork With Mustard Green',
 u'Vegetable',
 u'Seaweed',
 u'Spicy Combination',
 u'Hot & Sour',
 u'Na Ge Mian',
 u'What Do You Know',
 u'Whatever',
 u'Dirty Dastard',
 u"I Don't Know",
 u'Curry',
 u'Fish',
 u'String Beans With Beef, Chicken Or Pork',
 u'Fish',
 u'Broccoli Beef',
 u'Mao Pao Beef, Chicken, Pork Or Veg',
 u'Fish',
 u'Fried Rice',
 u'Curry Fried Rice (veg)',
 u'Hotl

In [None]:
## Problem: menu pages are not as standardized as I'd hoped --> some are split into separate dinner, lunch, full-menu, and appetizer tabs

### Food2Fork API

In [2]:
with open('config.json') as cred:
    creds = json.load(cred)

In [3]:
search_url = 'http://food2fork.com/api/search'
get_url = 'http://food2fork.com/api/get'

Use `creds['api-key']` as the `key` parameter for API key authentication.

Sample request:
http://food2fork.com/api/search?key={API_KEY}&q=shredded%20chicken


We need to extract the recipe ID from each search result and then call `get_url` with that ID to retrieve the ingredients, images, etc.

In [34]:
## Build list of recipe_ids
PAGE = 1
RECIPE_IDS = []

while True:
    payload = {'key' : creds['api-key'],
           'page' : PAGE,
           'sort' : 'r'
    }
    
    r = requests.get(search_url, params = payload) #returns 30 at a time
    
    for result in r.json()['recipes']:
        RECIPE_IDS.append(result['recipe_id'])
    
    if r.json()['count'] < 30: ## end of list
        print "End of recipe list!"
        break
    
    if PAGE % 20 == 0: ## every 20 pages dump to text file
#         with open('recipe_ids.txt', 'a') as out_file:
#             for x in RECIPE_IDS:
#                 out_file.write(x)
#             RECIPE_IDS = []
        print "Just finished page ", PAGE
    PAGE +=1    
    ## add a time.sleep ?        

Just finished page  20
Just finished page  40
Just finished page  60
Just finished page  80
Just finished page  100


KeyError: 'recipes'

In [19]:
len(RECIPE_IDS)

15000

In [38]:
#len(RECIPE_IDS)
#type(RECIPE_IDS[0])

## write recipe_ids to file so don't have to get again
## NOTE(review): the file is opened in text mode ('w'); pickle files are normally
## opened in binary mode -- confirm before running this outside Python 2 / Unix.
with open('recipe_ids.pkl', 'w') as out_file:
    pickle.dump(RECIPE_IDS, out_file)

In [None]:
#len(RECIPE_IDS)
## Save the current RECIPE_IDS list as the second id file.
with open('recipe_ids_pt2.pkl', 'w') as out_file:
    pickle.dump(RECIPE_IDS, out_file)
    
## Reached API limit for the day. Currently have 3030 + 15000 = 18,030 recipe ids. Pick up at page 601 for more.

#### Start here for getting recipe info! But don't forget to read in api-key and URLs first.

In [4]:
## check pickling
with open('recipe_ids.pkl') as infile:
    id_list = pickle.load(infile)

In [5]:
len(id_list)

3030

In [24]:
id_list[0]

u'47024'

In [25]:
## need to initialize json file with 1 entry
## Request the first recipe id; `recipe_dict` is the container filled in a later cell.
## NOTE(review): the output recorded for this cell is a NameError for `id_`, a name this
## code does not use -- the output looks stale relative to the code.
payload_rec  = {'key' : creds['api-key'],
                   'rId': id_list[0]
                 }
recipe_dict = defaultdict(dict)
result = requests.get(get_url, params = payload_rec)
 

NameError: name 'id_' is not defined

In [26]:
print result.json()

{u'recipe': {u'publisher': u'The Pioneer Woman', u'ingredients': [u'1 pound Ground Coffee (good, Rich Roast)', u'8 quarts Cold Water', u'Half-and-half (healthy Splash Per Serving)', u'Sweetened Condensed Milk (2-3 Tablespoons Per Serving)', u'Note: Can Use Skim Milk, 2% Milk, Whole Milk, Sugar, Artificial Sweeteners, Syrups...adapt To Your Liking!'], u'f2f_url': u'http://food2fork.com/view/47024', u'source_url': u'http://thepioneerwoman.com/cooking/2011/06/perfect-iced-coffee/', u'recipe_id': u'47024', u'image_url': u'http://static.food2fork.com/icedcoffee5766.jpg', u'social_rank': 100.0, u'publisher_url': u'http://thepioneerwoman.com', u'title': u'Perfect Iced Coffee'}}


In [27]:
## Store the fields of interest under recipe_dict[<recipe id>].
recipe = result.json()['recipe'] ## parse the response body once, not once per field
rec_id = recipe['recipe_id']
for field in ('ingredients', 'publisher', 'source_url', 'image_url', 'title'):
    recipe_dict[rec_id][field] = recipe[field]

print recipe_dict       

defaultdict(<type 'dict'>, {u'47024': {'publisher': u'The Pioneer Woman', 'source_url': u'http://thepioneerwoman.com/cooking/2011/06/perfect-iced-coffee/', 'image_url': u'http://static.food2fork.com/icedcoffee5766.jpg', 'title': u'Perfect Iced Coffee', 'ingredients': [u'1 pound Ground Coffee (good, Rich Roast)', u'8 quarts Cold Water', u'Half-and-half (healthy Splash Per Serving)', u'Sweetened Condensed Milk (2-3 Tablespoons Per Serving)', u'Note: Can Use Skim Milk, 2% Milk, Whole Milk, Sugar, Artificial Sweeteners, Syrups...adapt To Your Liking!']}})


In [28]:
## Create (or overwrite, because of mode "w") recipe_info.json with recipe_dict
## serialized as a single JSON line.
with open('recipe_info.json', "w") as json_file:
    json_file.write("{}\n".format(json.dumps(recipe_dict)))

In [6]:
def get_recipe_info(recipe_id):
    """Request a single recipe from the Food2Fork `get` endpoint.

    recipe_id: recipe id as returned by the search endpoint.

    Returns the requests response unparsed; nothing is stored here, the
    caller is expected to read it (e.g. with extract_info).
    """
    return requests.get(get_url, params={'key': creds['api-key'], 'rId': recipe_id})
    
    
def extract_info(json_obj):
    """Flatten one Food2Fork `get` response into a single recipe record.

    json_obj: response-like object whose .json() returns {'recipe': {...}}.

    Returns a defaultdict(dict) holding 'rec_id' (the recipe's 'recipe_id') plus
    'ingredients', 'publisher', 'source_url', 'image_url' and 'title'.

    Raises KeyError when 'recipe' or any of those fields is missing from the payload.
    """
    ## parse the body once; the original re-parsed it for every field
    recipe = json_obj.json()['recipe']

    recipe_dict = defaultdict(dict)
    recipe_dict['rec_id'] = recipe['recipe_id']
    for field in ('ingredients', 'publisher', 'source_url', 'image_url', 'title'):
        recipe_dict[field] = recipe[field]
    return recipe_dict

def write_to_json(filepath, dict_obj):
    """Append dict_obj to filepath as one JSON document followed by a newline."""
    with open(filepath, "a") as json_file:
        serialized = json.dumps(dict_obj)
        json_file.write("{}\n".format(serialized))

def write_to_database(dict_obj, db):
    """Insert dict_obj as one document into the `recipes` collection of db."""
    recipes = db.recipes
    recipes.insert_one(dict_obj)

In [48]:
id_list[1]

u'35382'

In [50]:
result = get_recipe_info(id_list[1])

In [51]:
print result.json()

{u'recipe': {u'publisher': u'Closet Cooking', u'ingredients': [u'2 jalapeno peppers, cut in half lengthwise and seeded', u'2 slices sour dough bread', u'1 tablespoon butter, room temperature', u'2 tablespoons cream cheese, room temperature', u'1/2 cup jack and cheddar cheese, shredded', u'1 tablespoon tortilla chips, crumbled\n'], u'f2f_url': u'http://food2fork.com/view/35382', u'source_url': u'http://www.closetcooking.com/2011/04/jalapeno-popper-grilled-cheese-sandwich.html', u'recipe_id': u'35382', u'image_url': u'http://static.food2fork.com/Jalapeno2BPopper2BGrilled2BCheese2BSandwich2B12B500fd186186.jpg', u'social_rank': 100.0, u'publisher_url': u'http://closetcooking.com', u'title': u'Jalapeno Popper Grilled Cheese Sandwich'}}


In [57]:
new_dict = extract_info(result)
print new_dict

defaultdict(<type 'dict'>, {u'35382': {'publisher': u'Closet Cooking', 'source_url': u'http://www.closetcooking.com/2011/04/jalapeno-popper-grilled-cheese-sandwich.html', 'image_url': u'http://static.food2fork.com/Jalapeno2BPopper2BGrilled2BCheese2BSandwich2B12B500fd186186.jpg', 'title': u'Jalapeno Popper Grilled Cheese Sandwich', 'ingredients': [u'2 jalapeno peppers, cut in half lengthwise and seeded', u'2 slices sour dough bread', u'1 tablespoon butter, room temperature', u'2 tablespoons cream cheese, room temperature', u'1/2 cup jack and cheddar cheese, shredded', u'1 tablespoon tortilla chips, crumbled\n']}})


In [59]:
write_to_json('recipe_info.json', new_dict)

In [11]:
with open('recipe_ids_pt2.pkl') as infile:
    id_list = pickle.load(infile)

In [1]:
1459 - 86


1373

In [9]:
## Now iteratively build up json file with other recipes.
## Each id is fetched with get_recipe_info, flattened by extract_info and appended
## to recipe_info.json as one line.
## NOTE(review): nothing here handles a failed or rate-limited request; presumably
## extract_info raises on such a response and stops the loop part-way -- confirm.

for i, num in enumerate(id_list[:1459]): ## still need to run this!
    result = get_recipe_info(num)
    dict_ob = extract_info(result)
    write_to_json('recipe_info.json', dict_ob)
    ## i is the zero-based position, so this reports on the 1st, 501st, 1001st ... id
    if i % 500 == 0:
        print "Just finished number ", i
## 4/17 have first 1989 recipes.  
## 4/18 recipe_ids.pkl file finished

Just finished number  0
Just finished number  500
Just finished number  1000


In [65]:
len(id_list[6:2000])

1994

In [15]:
## Pretty-print a response payload with 4-space indentation.
## NOTE(review): `test` is not assigned in any visible cell -- it presumably holds a
## response left over from a deleted or out-of-order cell; confirm before re-running.
print json.dumps(test.json(), indent=4, separators=(',', ': '))

{
    "recipe": {
        "publisher": "Closet Cooking",
        "ingredients": [
            "4 small chicken breasts, pounded thin",
            "salt and pepper to taste",
            "4 jalapenos, diced",
            "4 ounces cream cheese, room temperature",
            "1 cup cheddar cheese, shredded",
            "8 slices bacon\n"
        ],
        "f2f_url": "http://food2fork.com/view/35120",
        "source_url": "http://www.closetcooking.com/2012/11/bacon-wrapped-jalapeno-popper-stuffed.html",
        "recipe_id": "35120",
        "image_url": "http://static.food2fork.com/Bacon2BWrapped2BJalapeno2BPopper2BStuffed2BChicken2B5002B5909939b0e65.jpg",
        "social_rank": 100.0,
        "publisher_url": "http://closetcooking.com",
        "title": "Bacon Wrapped Jalapeno Popper Stuffed Chicken"
    }
}


In [None]:
## Should be through recipe #1373 in recipe_ids_pt2.pkl

In [1]:
1373+2500 #4/19


3873