In [8]:
# -*- coding: utf-8 -*-
"""
Yelp API v2.0 code sample.
This program demonstrates the capability of the Yelp API version 2.0
by using the Search API to query for businesses by a search term and location,
and the Business API to query additional information about the top result
from the search query.
Please refer to http://www.yelp.com/developers/documentation for the API documentation.
This program requires the Python oauth2 library, which you can install via:
`pip install -r requirements.txt`.
Sample usage of the program:
`python sample.py --term="bars" --location="San Francisco, CA"`
This program only works with Python 2.7, not Python 3.
"""

import argparse
import json
import os
import pprint
import pprint as pp
import sys
import urllib
import urllib2

import lxml
import numpy as np
import oauth2
import pymysql
from bs4 import BeautifulSoup


# Global Variables
API_HOST = 'api.yelp.com'
DEFAULT_TERM = 'physicians'
DEFAULT_LOCATION = 'San Francisco, CA'
SEARCH_LIMIT = 20
SEARCH_PATH = '/v2/search/'
BUSINESS_PATH = '/v2/business/'

# OAuth credentials. Each value can be supplied through the environment
# variable named below; the literal is only a fallback so existing runs keep
# working.
# NOTE(review): the fallback literals look like real credentials rather than
# placeholders -- confirm, revoke them, and drop the fallbacks once the
# environment variables are set wherever this notebook runs.
CONSUMER_KEY = os.environ.get('YELP_CONSUMER_KEY', 'sN1u58mMljgThGKGwXS8tw')
CONSUMER_SECRET = os.environ.get('YELP_CONSUMER_SECRET', 'NT7JYxw2VJ3q0INTbZEgyS7SQ0I')
TOKEN = os.environ.get('YELP_TOKEN', 'a7V4mk5UGIuECu_MMrQxALvbsUSQsHrX')
TOKEN_SECRET = os.environ.get('YELP_TOKEN_SECRET', 'H5r3RQRjOSPL7IgnuSBfvSZxoJU')

In [4]:
def request(host, path, url_params=None):
    """Send an OAuth-signed GET request to the API and decode the JSON reply.

    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    Raises:
        urllib2.HTTPError: An error occurs from the HTTP request.
    """
    if not url_params:
        url_params = {}

    # The path is UTF-8 encoded and percent-quoted before being put in the URL.
    quoted_path = urllib.quote(path.encode('utf8'))
    url = 'http://{0}{1}?'.format(host, quoted_path)

    consumer = oauth2.Consumer(CONSUMER_KEY, CONSUMER_SECRET)
    signed_request = oauth2.Request(method="GET", url=url, parameters=url_params)

    # OAuth fields are added next to the caller's query parameters so that the
    # signature computed below covers both.
    oauth_fields = {
        'oauth_nonce': oauth2.generate_nonce(),
        'oauth_timestamp': oauth2.generate_timestamp(),
        'oauth_token': TOKEN,
        'oauth_consumer_key': CONSUMER_KEY
    }
    signed_request.update(oauth_fields)

    token = oauth2.Token(TOKEN, TOKEN_SECRET)
    signature_method = oauth2.SignatureMethod_HMAC_SHA1()
    signed_request.sign_request(signature_method, consumer, token)
    signed_url = signed_request.to_url()

    # Only the unsigned URL is printed, so the OAuth fields stay out of the output.
    print (u'Querying {0} ...'.format(url))

    connection = urllib2.urlopen(signed_url, None)
    try:
        payload = connection.read()
        response = json.loads(payload)
    finally:
        # Close the connection even if reading or decoding fails.
        connection.close()

    return response

In [5]:
def search(term, location, limit, offset):
    """Query the Search API by a search term and location.

    Args:
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
        limit (int): Maximum number of businesses to request. ``None`` falls
            back to the module-level ``SEARCH_LIMIT``.
        offset (int): Offset sent to the API as the ``offset`` parameter.
    Returns:
        dict: The JSON response from the request.
    """
    # The caller's limit used to be ignored in favour of SEARCH_LIMIT; honour
    # it, and keep SEARCH_LIMIT only as the default when no limit is given.
    if limit is None:
        limit = SEARCH_LIMIT

    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': limit,
        'offset': offset
    }
    return request(API_HOST, SEARCH_PATH, url_params=url_params)

In [6]:
def get_business(business_id):
    """Fetch one business from the Business API.

    Args:
        business_id (str): The ID of the business to query.
    Returns:
        dict: The JSON response from the request.
    """
    # The endpoint path is the business path prefix followed by the ID.
    return request(API_HOST, BUSINESS_PATH + business_id)

In [10]:
def query_api(term, location, limit, offset):
    """Run a search with the given values and hand back the raw response.

    Args:
        term (str): The search term to query.
        location (str): The location of the business to query.
        limit: Passed through unchanged to ``search``.
        offset: Passed through unchanged to ``search``.
    Returns:
        Whatever ``search`` returns for these arguments.
    """
    return search(term, location, limit, offset)

In [11]:
def get_business_from_API(response, term=None, location=None):
    """Extract the list of businesses from a Search API response.

    Args:
        response (dict): Parsed search response; its ``'businesses'`` entry is
            read with ``.get``.
        term (str): Optional search term, used only in the "nothing found"
            message.
        location (str): Optional search location, used only in the "nothing
            found" message.
    Returns:
        The value stored under ``'businesses'``, or ``None`` when that entry
        is missing or empty.
    """
    businesses = response.get('businesses')

    if not businesses:
        # The original message formatted `term` and `location`, which were not
        # defined in this function and raised NameError. They are now optional
        # arguments; a generic message is printed when they are not supplied.
        if term is not None and location is not None:
            print(u'No businesses for {0} in {1} found.'.format(term, location))
        else:
            print(u'No businesses found.')
        return None
    return businesses

In [12]:
def get_html(business_id):
    """Download the Yelp business page for ``business_id``.

    Args:
        business_id (str): Business ID appended to ``http://www.yelp.com/biz/``.
    Returns:
        The raw body returned by ``response.read()``.
    """
    yelp_url = 'http://www.yelp.com/biz/'+business_id
    response = urllib2.urlopen(yelp_url)
    try:
        return response.read()
    finally:
        # Always release the connection, matching what request() does; the
        # response was previously never closed.
        response.close()

In [28]:
def save_myhtml(html_filename, html):
    """Write ``html`` to ``html_filename`` in binary mode.

    The file is opened with mode ``'wb'``, so an existing file of the same
    name is overwritten.
    """
    out_file = open(html_filename, 'wb')
    try:
        out_file.write(html)
    finally:
        # Close the file whether or not the write succeeded.
        out_file.close()

In [26]:
def insert_yelp_business(yelp_id, stars, reviews, city_id):
    """Insert one row into the ``business`` table of the ``yelpdata`` database.

    Args:
        yelp_id (str): Value for the ``yelp_id`` column.
        stars: Value for the ``stars`` column.
        reviews: Value for the ``reviews`` column.
        city_id: Value for the ``city_id`` column.

    On a ``pymysql.Error`` the error is printed and the process exits with
    status 1 (``sys.exit(1)``).
    """
    con = None
    try:
        con = pymysql.connect(host='localhost', port=3307, user='root', passwd='', db='yelpdata')
        cur = con.cursor()
        try:
            # Let the driver quote the values instead of concatenating them
            # into the statement: this removes the SQL-injection hole and
            # keeps IDs containing quotes from breaking the query.
            sql = 'INSERT INTO business(yelp_id,stars,reviews,city_id) VALUES(%s,%s,%s,%s)'
            # city_id is now really stored; it used to be ignored in favour
            # of a hard-coded 1.
            params = (yelp_id, stars, reviews, city_id)
            cur.execute(sql, params)
        finally:
            cur.close()
        # Commit explicitly rather than relying on `with con:`, whose
        # behaviour is not the same across PyMySQL versions.
        con.commit()
        print('{0} {1!r}'.format(sql, params))
    except pymysql.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
    finally:
        if con is not None:
            con.close()
        

In [27]:
def format_html(string,int):
    """Pull a rating or a date out of the text form of a ``<meta>`` tag.

    Args:
        string: Object whose ``str()`` contains ``content="<value>"``, e.g.
            ``[<meta content="5.0" itemprop="ratingValue"/>]``. Text without
            a ``content="`` attribute is handled as before, by stripping the
            leading tag characters.
        int: ``0`` to return the rating as a float, ``1`` to return the date
            as a string. (The name shadows the builtin; it is kept so that
            existing keyword callers keep working.)
    Returns:
        float for mode 0, str for mode 1. For any other mode a message is
        printed and the partially processed string is returned.
    Raises:
        ValueError: In mode 0 when the extracted text is not a number.
    """
    s = str(string)

    # Locate the content attribute explicitly. str.lstrip() removes a *set of
    # characters*, not a prefix, so on its own it only works when `content` is
    # the first attribute of the tag.
    marker = 'content="'
    start = s.find(marker)
    if start != -1:
        s = s[start + len(marker):]
    else:
        s = s.lstrip('[<meta content="')

    stop = s.find('"')  # closing quote of the value; -1 when there is none
    if int==0:
        # Without a closing quote, fall back to the first three characters
        # (ratings such as "4.5"), as the original did.
        s = float(s[0:stop] if stop != -1 else s[0:3])
    elif int==1:
        # Date lengths could vary, so cut at the closing quote. With no quote
        # the whole string is kept instead of losing its last character.
        if stop != -1:
            s = s[0:stop]
    else:
        print('Wrong integer given to format_html. Must be 0 for rating or 1 for date: {0}'.format(int))
    return s

In [39]:
def scrape_reviews(html_filename):
    """Extract review ratings and review texts from a saved Yelp page.

    Args:
        html_filename (str): Path of the HTML file to parse.
    Returns:
        tuple: ``([rstars], [reviews])`` -- each list is wrapped in an outer
        one-element list, as existing callers receive it. ``rstars`` holds
        the ratings produced by ``format_html(..., 0)`` and ``reviews`` the
        text of every ``<p itemprop="description">`` element.
    """
    # Parse inside a `with` block so the file handle is closed; it was
    # previously left open.
    with open(html_filename) as html_file:
        soup = BeautifulSoup(html_file, "lxml")

    ptext = soup.find_all('p', {'itemprop': 'description'})
    reviews = [pt.get_text() for pt in ptext]

    # Examples of what the html looks like:
    # <meta itemprop="ratingValue" content="5.0">
    # <meta itemprop="datePublished" content="2013-03-27">
    # The rating/date tags are looked up under each review paragraph's parent.
    rstars = [format_html(pt.parent.find_all('meta', {'itemprop':'ratingValue'}), 0) for pt in ptext]
    # rdates is computed (and kept aligned below) but is not part of the
    # return value.
    rdates = [format_html(pt.parent.find_all('meta', {'itemprop':'datePublished'}), 1) for pt in ptext]

    # Ensure that ratings and reviews are vectors of the same length
    if len(rstars) != len(reviews):
        # This branch used to reference an undefined `yelp_url` and to index
        # the lists with a tuple (`rstars[0,rmin]`); report the file name and
        # slice instead.
        print('Length of rstars and reviews for {0} on this page do not match.'.format(html_filename))
        print('rstars len: {0} reviews len: {1}'.format(len(rstars), len(reviews)))
        rmin = min(len(rstars), len(reviews))
        rstars = rstars[0:rmin]
        reviews = reviews[0:rmin]
        rdates = rdates[0:rmin]

    return [rstars], [reviews]

In [40]:
def main():
    """Search Yelp with the module defaults and scrape the first result's page.

    Queries the Search API for ``DEFAULT_TERM`` in ``DEFAULT_LOCATION``,
    prints the reported total, then downloads the first business's Yelp page,
    saves it to ``temp.html`` and runs ``scrape_reviews`` on that file.
    Exits the process with a message on ``urllib2.HTTPError``.
    """
    # Command-line parsing is not used here: parse_args() throws an error in
    # Jupyter because there is no input to parse, so the defaults are queried.
    try:
        response = query_api(term=DEFAULT_TERM, location=DEFAULT_LOCATION, limit=20, offset=0)
    except urllib2.HTTPError as error:
        sys.exit('Encountered HTTP error {0}. Abort program.'.format(error.code))

    businesses = get_business_from_API(response)
    total_nbusinesses = response.get('total')  # number of businesses
    print(total_nbusinesses)

    # get_business_from_API returns None when nothing was found; indexing into
    # it below would raise TypeError, so stop here instead.
    if not businesses:
        return

    # Only the first business of the result page is processed for now.
    for business in businesses[:1]:
        business_id = business['id']

        print(u'Result for business "{0}" found:'.format(business_id))

        # Adding the business to the MySQL `business` table is currently disabled:
        # insert_yelp_business(business_id, business['rating'], business['review_count'], 1)

        html = get_html(business_id)

        # Save a temporary html file, because soup breaks if you pass it whole html obj
        html_filename = 'temp.html'
        save_myhtml(html_filename, html)

        # The scraped values are not used any further yet.
        rstars, reviews = scrape_reviews(html_filename)

    
# Entry point: call main() when this code runs as the top-level module. The
# cell output recorded below this cell shows that running it in the notebook
# executes main().
if __name__ == '__main__':
   main()

Querying http://api.yelp.com/v2/search/? ...
3090
Result for business "dan-kalshan-md-san-francisco-2" found:
(5.0, '2015-06-29', ':', u'Adding some more information. \xa0Another great thing about him is that he was willing to experiment with me for my fibromyalgia. \xa0I brought in a book by a specialist and he was willing to try the protocol. \xa0Many doctors would not do that. \xa0He won\'t do anything crazy but will consider something reasonable. \xa0He also made himself available when I was having an emergency severe tooth/earache. He was at an offsite seminar, but his receptionist asked him to call me and he did, prescribed something and I think he even checked in with me later on (or I called back). Either way, I got the treatment I needed.For those who think it takes a while to get ahold of him or his receptionist--please keep in mind it is one doctor and one receptionist. As you can see from all of the 5 star reviews, he spends time with his patients. Be a little "patient" you