# Scrape Restaurant Reviews from Yelp Using Business Name

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time

In [51]:
#get the comment, comment_count and date by descending order
#if the comment_count is greater than 100, we take the first 100 comment
#if the comment_count is less than 100, we take as many comment as we can 
#if the number of reviews written by one reviewer is less than 10, we consider the review as unvalued
def get_new_date(alias,N=100):
    url = 'https://www.yelp.com/biz/'+alias+'?sort_by=date_desc'
    response = requests.get(url)
    if response.status_code != 200:
        print("==200")
        return None,None,None
    response_page = BeautifulSoup(response.content,'lxml')
    review_abbre = response_page.find_all('script',type='application/ld+json')
    try:
        review_text= json.loads(review_abbre[-1].get_text())
        date = review_text.get('review')[0].get('datePublished')
    except:
        print("1st try goes wrong")
        return None,None,None
    #date = review_text.get('review')[0].get('datePublished')
    count_list,author_list,review_list,rate_list = [],[],[],[]
    
    while len(count_list)< N and response.status_code == 200:

        total_author = response_page.find_all('div',{'class',"review review--with-sidebar"})
        for all_review in total_author:
            count = all_review.find('li',{'class':'review-count'}).find('b').get_text()
            if int(count)>=10:
                try:
                    count_list.append(count)
                    author_list.append(all_review.find('a',{'class','user-display-name'}).get_text())
                    review_list.append(all_review.find('p',{'lang':'en'}).get_text())
                    rate_list.append(float(all_review.find('img',class_='offscreen').get('alt')[:3]))
                except:
                    print("2nd try goes wrong")
                    return None, None, None
        try:
            url = response_page.find_all('a',{'class':'u-decoration-none next pagination-links_anchor'})[0].get('href')
        except:
            print("3rd try goes wrong")
            break
        response = requests.get(url)
        response_page = BeautifulSoup(response.content,'lxml')
    
    reviews=' '.join(review_list)
    avg_rate = sum(rate_list)/len(rate_list)
    
    return date,reviews,avg_rate

In [36]:
#get the oldest comment by ascending order
def get_old_date(alias):
    url = 'https://www.yelp.com/biz/'+alias+'?sort_by=date_asc'
    response = requests.get(url)
    if response.status_code != 200:
        return None
    contents = BeautifulSoup(response.content,'lxml')
    review_abbre = contents.find_all('script',type='application/ld+json')
    try:
        review_text= json.loads(review_abbre[-1].get_text())
        date = review_text.get('review')[0].get('datePublished')
    except:
        return None
    #date = review_text.get('review')[0].get('datePublished')
    return date

In [None]:
#loop through the original data and add new columns
df=pd.read_pickle('italian_551.pkl')
N=0
while N<=551: 
    business_new = df.iloc[N:N+20]
    business_new['old_date'] = business_new['alias'].apply(get_old_date)
    #try to not active the reCaptcha
    time.sleep(10)
    business_new['new_date_review_rate'] = business_new['alias'].apply(get_new_date)
    business_new['new_date']=business_new['new_date_review_rate'].apply(lambda x:x[0])
    business_new['review']=business_new['new_date_review_rate'].apply(lambda x:x[1])
    business_new['avg_rate']=business_new['new_date_review_rate'].apply(lambda x:x[2])
    business_new.to_pickle("./review/italian_{}.pkl".format(N))
    time.sleep(10)
    print(N)
    N+=20

In [44]:
#combine all the segmental dataframe to one
df=pd.read_pickle('./review/italian_0.pkl')
for x in range(20,560,20):
    new=pd.read_pickle('./review/italian_{}.pkl'.format(x))
    df=pd.concat([df,new])
df.to_pickle('italian_review_551.pkl')    

In [None]:
df

In [90]:
df['new_date_review_rate']

9       (2019-04-06, I met a friend at this restaurant...
10      (2019-04-09, Awesome food, delicious wines, ex...
15      (2019-04-15, Let me start off by saying that t...
24      (2019-04-13, This place was amazing!! Neapolit...
27      (2019-04-15, I was under the impressions based...
35      (2019-04-14, I visited this location yesterday...
42      (2019-04-16, Fumo is one of those classic lunc...
51      (2019-04-06, My wife and I had a wonderful mea...
61      (2019-02-22, In years of yore, Italians had th...
65      (2019-04-12, Walked in here for two slices and...
71      (2019-04-11, Small local restaurant.   For som...
73      (2019-04-14, I've eaten here twice and am very...
74      (2019-04-01, Always a great experience here. T...
83      (2019-02-19, (Based on a single Summer 2015 vi...
95      (2019-04-09, This fish ....caught me.  I highl...
96      (2019-04-16, Went with my friend and it was de...
120     (2019-04-10, The first time I was there, I had...
125     (2019-

In [2]:
df = pd.read_pickle('italian_review_551.pkl') 
df

Unnamed: 0,alias,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,...,delivery,restaurant_reservation,dist_group,italian,label,old_date,new_date_review_rate,new_date,review,avg_rate
9,babbalucci-new-york,[italian],"{'latitude': 40.80894, 'longitude': -73.94496}",(646) 918-6572,531.822111,gyWc5qJRvu26LVQIShdR4Q,https://s3-media1.fl.yelpcdn.com/bphoto/3MOwPG...,False,"{'address1': '331 Lenox Ave', 'address2': '', ...",Babbalucci,...,True,False,0-1000m,True,1,2015-07-04,"(2019-04-06, I met a friend at this restaurant...",2019-04-06,I met a friend at this restaurant on a first d...,3.920792
10,pisticci-new-york,"[italian, breakfast_brunch]","{'latitude': 40.8141624, 'longitude': -73.960288}",(212) 932-3500,1140.188943,wZkZmjZEJDraLJgAalnHvA,https://s3-media1.fl.yelpcdn.com/bphoto/ERoh_M...,False,"{'address1': '125 La Salle St', 'address2': ''...",Pisticci,...,True,True,1000-2000m,True,1,2006-01-24,"(2019-04-09, Awesome food, delicious wines, ex...",2019-04-09,"Awesome food, delicious wines, excellent servi...",4.210526
15,vinatería-new-york-4,"[italian, spanish, seafood]","{'latitude': 40.8064392068585, 'longitude': -7...",(212) 662-8462,999.363229,MH08_pIRKsUSwfkVYxUE7w,https://s3-media2.fl.yelpcdn.com/bphoto/ZuDNNI...,False,"{'address1': '2211 Frederick Douglass Blvd', '...",VINATERÍA,...,False,False,0-1000m,True,1,2013-04-17,"(2019-04-15, Let me start off by saying that t...",2019-04-15,Let me start off by saying that the food was g...,3.465347
24,sottocasa-pizzeria-harlem-new-york,"[pizza, italian]","{'latitude': 40.805587, 'longitude': -73.947547}",(646) 928-4870,888.573617,NPnWsPxOpG91GHx74S27Uw,https://s3-media1.fl.yelpcdn.com/bphoto/jttl7I...,False,"{'address1': '227 Lenox Ave', 'address2': '', ...",Sottocasa Pizzeria - Harlem,...,False,False,0-1000m,True,1,2016-04-14,"(2019-04-13, This place was amazing!! Neapolit...",2019-04-13,This place was amazing!! Neapolitan pizza tend...,4.544643
27,grazie-new-york-3,[italian],"{'latitude': 40.77948, 'longitude': -73.9601}",(212) 717-4407,3951.457366,_3QRn51W3IoqBzc9fYet4Q,https://s3-media1.fl.yelpcdn.com/bphoto/IIbqHx...,False,"{'address1': '26 E 84th St', 'address2': '', '...",Grazie,...,True,True,3000-4000m,True,1,2006-01-13,"(2019-04-15, I was under the impressions based...",2019-04-15,I was under the impressions based on reviews h...,4.000000
35,lido-new-york,"[italian, breakfast_brunch, cocktailbars]","{'latitude': 40.8049797781916, 'longitude': -7...",(646) 490-8575,1170.550549,J9xVQScnr0lYWl61_mLXMA,https://s3-media4.fl.yelpcdn.com/bphoto/nlJAMT...,False,"{'address1': '2168 Frederick Douglass Blvd', '...",Lido,...,True,False,1000-2000m,True,1,2011-02-14,"(2019-04-14, I visited this location yesterday...",2019-04-14,I visited this location yesterday for brunch a...,4.190909
42,fumo-pizza-bar-pasta-new-york-3,"[pizza, italian, bars]","{'latitude': 40.821442, 'longitude': -73.9506357}",(646) 692-6675,937.057594,V6e4UBjFzdrcUgKbJTrZpA,https://s3-media3.fl.yelpcdn.com/bphoto/P0peGT...,False,"{'address1': '1600 Amsterdam Ave', 'address2':...",Fumo Pizza-Bar-Pasta,...,True,False,0-1000m,True,4,2016-03-05,"(2019-04-16, Fumo is one of those classic lunc...",2019-04-16,Fumo is one of those classic lunch spots for m...,4.196429
51,max-soha-new-york,[italian],"{'latitude': 40.811302, 'longitude': -73.958183}",(212) 531-2221,993.502933,vZ5-JXlJS75k8wmPNS5U5w,https://s3-media1.fl.yelpcdn.com/bphoto/BNvCZH...,False,"{'address1': '1274 Amsterdam Ave', 'address2':...",Max Soha,...,True,False,0-1000m,True,1,2005-11-09,"(2019-04-06, My wife and I had a wonderful mea...",2019-04-06,My wife and I had a wonderful meal at Max Soha...,4.208696
61,raos-new-york,[italian],"{'latitude': 40.79392, 'longitude': -73.93427}",(212) 722-6709,2416.414975,zo4JHfD2nWPUiqgsgDOaHQ,https://s3-media2.fl.yelpcdn.com/bphoto/3orQwP...,False,"{'address1': '455 E 114th St', 'address2': Non...",Rao's,...,False,False,2000-3000m,True,1,2006-05-12,"(2019-02-22, In years of yore, Italians had th...",2019-02-22,"In years of yore, Italians had this part of to...",4.491071
65,patsys-pizzeria-new-york-7,"[pizza, italian]","{'latitude': 40.79713, 'longitude': -73.93481}",(212) 534-9783,2079.944811,Kh0-HmlgmVUkRxR55_Vn1g,https://s3-media1.fl.yelpcdn.com/bphoto/hx0TGZ...,False,"{'address1': '2287 1st Ave', 'address2': '', '...",Patsy's Pizzeria,...,False,False,2000-3000m,True,1,2006-03-26,"(2019-04-12, Walked in here for two slices and...",2019-04-12,Walked in here for two slices and asked for Th...,4.109091
