Yelp API Guide: https://docs.developer.yelp.com/docs/fusion-intro

## Aggregating Yelp Restaurant Data
Uses the v3/businesses/search endpoint.

In [228]:
import json
import os
import requests
import time
import urllib.request

In [None]:
# Aggregate Manhattan specific restuarants by neighborhood for more granular results.
neighborhoods = [
    "Alphabet City",
    "Battery Park City",
    "Bowery",
    "Bryant Park",
    "Carnegie Hill",
    "Central Park",
    "Chelsea",
    "Chinatown",
    "Civic Center",
    "Clinton",
    "East Harlem",
    "East Village",
    "Financial District",
    "Flatiron",
    "Fort George",
    "Garment District",
    "Gramercy",
    "Greenwich Village",
    "Hamilton Heights",
    "Harlem",
    "Hells Kitchen",
    "Hudson Heights",
    "Hudson Square",
    "Hudson Yards",
    "Inwood",
    "Kips Bay",
    "Lenox Hill",
    "Lincoln Square",
    "Little Italy",
    "Lower East Side",
    "Manhattan Valley",
    "Manhattanville",
    "Meatpacking",
    "Midtown",
    "Midtown East",
    "Midtown South",
    "Midtown West",
    "Morningside Heights",
    "Murray Hill",
    "Noho",
    "Nolita",
    "NoMad",
    "Roosevelt Island",
    "Soho",
    "Stuyvesant Town",
    "Sutton Place",
    "Times Square",
    "Theater District",
    "Tribeca",
    "Tudor City",
    "Turtle Bay",
    "Two Bridges",
    "Union Square",
    "Upper East Side",
    "Upper West Side",
    "Washington Heights",
    "Washington Square Park",
    "West Harlem",
    "West Village",
    "Yorkville",
]

In [None]:
# Yelp allows inly 500 API calls per day, so we rotate btwn 3 different keys for maximum data collection.
kKeyIndex = 0
keys = [
    "ZF5VOfPCUWtK2C4_ZMpMrO3FxyS6EGlN_aCjNPBTYZyHhmMZvi7sADCFioEuDUalKlL_83AGB1fWkICmFeHudLzmUhtUq589kgKpnfQbQoT2BMznqTLJ2cIX1RRAZXYx",
    "QOCKsANBYQUN4Fmrxh23mAl5Bjbi69gv3W7ChGNOmp98Q3124aytz9F2MzEPhmKOXa6EomrQAjLeGEZuvlrbsR5Q_KSnsST7Ona_K0_wafErqsrxsd68aCSe9j9IZXYx",
    "NO9vZwZGnE58R8YbQDEPC90SlZ2eok4O4aYkdIxH96vUZMeSCDCvIZYY7L3VxWVYiMITiaMIkOBPRdtOgkR52BwBexnpVDDmhcjWClFRgu8uByoBopPAP8stZUBIZXYx"  
]

In [None]:
url = 'https://api.yelp.com/v3/businesses/search'
headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + keys[kKeyIndex]
}

In [None]:
alias_to_content = {}

In [None]:
for neighborhood in neighborhoods:
    print("Fetching data for " + neighborhood)
    
    # Maximum results per API request.
    limit = 50
    location = neighborhood + ", Manhattan, NY"
    location = location.replace(" ", "+")
    
    # Get up to 1000 restaurants per neighborhood.
    for i in range(0, 1000, limit):        
        url_params = {
            "location": location,
            "term": "Restaurants",
            "limit": limit,
            "offset": i,
            "categories": "(restaurants, All)",
            "sort_by": "distance",
        }

        response = requests.get(url, headers=headers, params=url_params)
        
        # Max API calls gets a return status == 429!
        if response.status_code == 429: 
            print("Rotating key")
            kKeyIndex += 1
            headers["Authorization"] = "Bearer " + keys[kKeyIndex]
            response = requests.get(url, headers=headers, params=url_params)

        if response.status_code != 200:
            print(response.status_code)
            continue

        # If we already got all the businesses in a neighborhood.
        content = json.loads(response.content)
        if len(content["businesses"]) == 0:
            break

        for business in content["businesses"]:
            alias_to_content[business["alias"]] = business

In [None]:
# Write file.
file_path = "{}/restaurants.json".format(os.getcwd())
with open(file_path, "w") as fp:
    json.dump(alias_to_content, fp)

## Aggregating Yelp Reviews Data
No API for this. We call it with a sketchy endpoint.

In [231]:
# Read file.
file_path = "{}/restaurants.json".format(os.getcwd())
with open(file_path, "r") as json_file:
    alias_to_content = json.loads(json_file.read())

In [232]:
headers = {
    "Content-Type": "text/html; charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
}

In [233]:
for alias, content in alias_to_content.items():
    print("Fetching for " + alias)
    
    # If we already scraped reviews, do not do it again.
    # Scrapes take a long time, so this allows us to scrape over multiple runs.
    if "reviews" in content:
        continue
    
    # Gather 10 reviews per restaurant 5 times.
    for i in range(0, 50, 10):
        url = "https://www.yelp.com/biz/{}/props?start={}".format(alias, i)
        request = urllib.request.Request(url, headers=headers)
        
        # Sleep or else Yelp might block your IP.
        time.sleep(2)
        
        with urllib.request.urlopen(request) as response:
            if response.code != 200:
                print(response.code)
                continue
            
            try:
                reviews = json.loads(response.read())["bizDetailsPageProps"]["reviewFeedQueryProps"]["reviews"]
                content["reviews"] = content.get("reviews", [])
                
                for r in reviews:
                    content["reviews"].append({
                        "photoCount": r["user"]["photoCount"],
                        "reviewCount": r["user"]["reviewCount"],
                        "eliteYear": r["user"]["eliteYear"],
                        "localizedDate": r["localizedDate"],
                        "comment": r["comment"],
                        "rating": r["rating"],
                    })
                    
            except:
                continue

Fetching for le-pain-quotidien-new-york-137
Fetching for creperie-new-york-6
Fetching for dos-toros-taqueria-new-york-5
Fetching for umami-burger-new-york-17
Fetching for naya-brookfield-place-new-york-3
Fetching for sauce-pizzeria-new-york-8
Fetching for sams-crispy-chicken-brookfield-new-york
Fetching for hot-dog-cart-at-south-end-ave-new-york
Fetching for skinny-pizza-new-york
Fetching for tartinery-new-york-15
Fetching for chopt-creative-salad-co-new-york-40
Fetching for sams-crispy-chicken-new-york-4
Fetching for liberty-bistro-new-york-3
Fetching for ani-ramen-house-new-york
Fetching for bar-a-vin-new-york-2
Fetching for blue-ribbon-sushi-bar-hudson-eats-new-york
Fetching for chipotle-mexican-grill-new-york-73
Fetching for amazon-go-new-york-56
Fetching for black-seed-bagels-at-hudson-eats-new-york
Fetching for mighty-quinns-barbeque-new-york-4
Fetching for springbone-kitchen-new-york-7
Fetching for le-district-market-district-new-york
Fetching for l-appart-new-york
Fetching for 

UnicodeEncodeError: 'ascii' codec can't encode character '\xe9' in position 9: ordinal not in range(128)

In [230]:
# Write file.
file_path = "{}/restaurants.json".format(os.getcwd())
with open(file_path, 'w') as fp:
    json.dump(alias_to_content, fp)