In [8]:
# -*- coding: utf-8 -*-
"""
Yelp API v2.0 code sample.
This program demonstrates the capability of the Yelp API version 2.0
by using the Search API to query for businesses by a search term and location,
and the Business API to query additional information about the top result
from the search query.
Please refer to http://www.yelp.com/developers/documentation for the API documentation.
This program requires the Python oauth2 library, which you can install via:
`pip install -r requirements.txt`.
Sample usage of the program:
`python sample.py --term="bars" --location="San Francisco, CA"`
This program only works with python 2.7, not 3
"""

import argparse
import json
import pprint
import sys
import urllib
import urllib2
import oauth2
import numpy as np
from bs4 import BeautifulSoup
import pprint as pp
import lxml
import pymysql


# Global Variables
API_HOST = 'api.yelp.com'
DEFAULT_TERM = 'physicians'
DEFAULT_LOCATION = 'San Francisco, CA'
SEARCH_LIMIT = 20
SEARCH_PATH = '/v2/search/'
BUSINESS_PATH = '/v2/business/'

# OAuth credential placeholders that must be filled in by users.
CONSUMER_KEY = 'sN1u58mMljgThGKGwXS8tw'
CONSUMER_SECRET = 'NT7JYxw2VJ3q0INTbZEgyS7SQ0I'
TOKEN = 'a7V4mk5UGIuECu_MMrQxALvbsUSQsHrX'
TOKEN_SECRET = 'H5r3RQRjOSPL7IgnuSBfvSZxoJU'

In [4]:
def request(host, path, url_params=None):
    """Prepares OAuth authentication and sends the request to the API.
    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    Raises:
        urllib2.HTTPError: An error occurs from the HTTP request.
    """
    url_params = url_params or {}
    url = 'http://{0}{1}?'.format(host, urllib.quote(path.encode('utf8')))

    consumer = oauth2.Consumer(CONSUMER_KEY, CONSUMER_SECRET)
    oauth_request = oauth2.Request(method="GET", url=url, parameters=url_params)

    oauth_request.update(
        {
            'oauth_nonce': oauth2.generate_nonce(),
            'oauth_timestamp': oauth2.generate_timestamp(),
            'oauth_token': TOKEN,
            'oauth_consumer_key': CONSUMER_KEY
        }
    )
    token = oauth2.Token(TOKEN, TOKEN_SECRET)
    oauth_request.sign_request(oauth2.SignatureMethod_HMAC_SHA1(), consumer, token)
    signed_url = oauth_request.to_url()
    
    print (u'Querying {0} ...'.format(url))

    conn = urllib2.urlopen(signed_url, None)
    try:
        response = json.loads(conn.read())
    finally:
        conn.close()

    return response

In [5]:
def search(term, location, limit, offset):
    """Query the Search API by a search term and location.
    Args:
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
    Returns:
        dict: The JSON response from the request.
    """
    
    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': SEARCH_LIMIT,
        'offset': offset
    }
    return request(API_HOST, SEARCH_PATH, url_params=url_params)

In [6]:
def get_business(business_id):
    """Query the Business API by a business ID.
    Args:
        business_id (str): The ID of the business to query.
    Returns:
        dict: The JSON response from the request.
    """
    business_path = BUSINESS_PATH + business_id

    return request(API_HOST, business_path)

In [9]:
def query_api(term, location, limit, offset):
    """Queries the API by the input values from the user.
    Args:
        term (str): The search term to query.
        location (str): The location of the business to query.
    """
    response = search(term, location, limit, offset)
#     pprint.pprint(response, indent=2)
    return response

In [22]:
def get_business_from_API(response):
    businesses = response.get('businesses')

    if not businesses:
        print u'No businesses for {0} in {1} found.'.format(term, location)
        return
    return businesses

In [48]:
def get_html(business_id):
    
    yelp_url = 'http://www.yelp.com/biz/'+business_id
    response = urllib2.urlopen(yelp_url)
    html = response.read()
    return html

In [49]:
def save_myhtml(html_filename, html):
    # Save the html file

    with open(html_filename, 'wb') as f:
        f.write(html)

In [50]:
def format_rating(a_rating):
    # accepts one rating and returns a float btw 0.0 and 5.0
    r = str(a_rating)
    r = r.lstrip('<meta content="')
    r = float(r[0:3])
    return r

In [51]:
def scrape_reviews(html_filename):
    # open the html file/object with beautiful soup
    
    print(html_filename)
    soup = BeautifulSoup(open(html_filename))
    #tag = soup.b# not relevant here

    # Extract first 40 reviews per business (40 are listed on the page)
    ptext = soup.find_all('p', {'itemprop': 'description'})
    reviews = [ptext[i].get_text() for i in range(0,len(ptext))]
    
    # Extract star-rating and clean/align with the reviews
    star_text = soup.find_all('meta', {'itemprop':'ratingValue'})
    rating = [format_rating(star) for star in star_text]
    overall_rating = rating[0]
    rating_align = rating[1:-1]# for some unknown reason, there is an extra review that doesn't match up
    #print(rating_align)

    if len(rating_align)!=len(reviews):
        print('Length of ratings and reviews for this doc on this page do not match.')

    # Here I checked user-by-user that the star-ratings and reviews match up (at least on this page)
    # for i in range(0,len(reviews)):
    #     print(rating_clean[i],':',reviews[i],'\n')
    return reviews, rating_align

In [52]:
def insert_yelp_business(yelp_id, stars, reviews, city_id):
    con = False
    try:
        con = pymysql.connect(host='localhost', port=3307, user='root', passwd='', db='yelpdata')
        with con:
            cur = con.cursor()
            #print(type(yelp_id), type(stars), type(reviews)
            myvalues = "'" + yelp_id+"'," + str(stars) + "," + str(reviews) + "," + "1"
            sql = 'INSERT INTO business(yelp_id,stars,reviews,city_id) VALUES('+myvalues+')'
            cur.execute(sql)
            print sql
    except pymysql.Error, e:
        print "Error %d: %s" % (e.args[0],e.args[1])
        sys.exit(1)
    finally:
        if con:
            con.close()
        

In [55]:
def main():
#     parser = argparse.ArgumentParser()

#     parser.add_argument('-q', '--term', dest='term', default=DEFAULT_TERM, type=str, help='Search term (default: %(default)s)')
#     parser.add_argument('-l', '--location', dest='location', default=DEFAULT_LOCATION, type=str, help='Search location (default: %(default)s)')

#     input_values = parser.parse_args()#This throws an error in Jupyter because there is no input to parse

    try:
        #query_api(input_values.term, input_values.location)
        response = query_api(term=DEFAULT_TERM, location=DEFAULT_LOCATION, limit=20, offset=0)
    except urllib2.HTTPError as error:
        sys.exit('Encountered HTTP error {0}. Abort program.'.format(error.code))
    
    businesses = get_business_from_API(response)
    total_nbusinesses = response.get('total')#number of businesses
    print(total_nbusinesses)
    
    for i in range(0,20):# loop over 20 businesses on each YELP API query
        business_id = businesses[i]['id']
        bus_rating = businesses[i]['rating']
        bus_nreviews = businesses[i]['review_count']#for each business
        bus_categories = businesses[i]['categories']#e.g., Family Practice

        print u'Result for business "{0}" found:'.format(business_id)
        
        # Add business info to MySQL table: business (others are city and review)
        #insert_yelp_business(business_id, bus_rating, bus_nreviews, 1)
    
        html = get_html(business_id)
        html_filename = business_id +'.html'
    
        # Save html file or use an existing one
#         save_myhtml(html_filename, html)
        reviews,ratings = scrape_reviews(html_filename)
    
        #temp_url = 'total-care-plus-san-francisco.html'
#         temp_url = 'dan-kalshan-md-san-francisco-2.html'
#         reviews,ratings = scrape_reviews(temp_url)
    print(ratings)
    print(reviews[3])

    
if __name__ == '__main__':
   main()

Querying http://api.yelp.com/v2/search/? ...
2671
Result for business "dan-kalshan-md-san-francisco-2" found:
dan-kalshan-md-san-francisco-2.html
Length of ratings and reviews for this doc on this page do not match.
Result for business "dolhun-clinic-san-francisco-3" found:
dolhun-clinic-san-francisco-3.html


IOError: [Errno 2] No such file or directory: u'dolhun-clinic-san-francisco-3.html'