# Scrape Restaurant Reviews from Yelp Using Business Name

In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time

In [51]:
#get the review texts, each reviewer's review count and the latest review date, with reviews sorted by date descending
#reviews are collected page by page until N (default 100) have been gathered
#if the business has fewer reviews than that, we take as many as are available
#reviews written by a reviewer who has fewer than 10 reviews in total are ignored
def get_new_date(alias,N=100):
    """Scrape the most recent reviews of a Yelp business.

    The business page is requested with ``sort_by=date_desc`` and the "next"
    pagination link is followed until ``N`` qualifying reviews have been
    collected or no further page can be fetched. A review qualifies when its
    author's review count is at least 10.

    Parameters
    ----------
    alias : str
        Business alias appended to ``https://www.yelp.com/biz/``.
    N : int, optional
        Maximum number of reviews to keep (default 100).

    Returns
    -------
    tuple
        ``(date, reviews, avg_rate)``:

        * ``date`` -- ``datePublished`` of the first review in the page's
          JSON-LD data.
        * ``reviews`` -- the kept review texts joined with single spaces.
        * ``avg_rate`` -- mean rating of the kept reviews, or ``None`` when
          no review qualified.

        ``(None, None, None)`` is returned when the first request fails or
        does not answer with status 200, or when the page cannot be parsed.
    """
    failure = (None, None, None)
    # Exceptions raised by a page whose markup differs from what is expected.
    parse_errors = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    url = 'https://www.yelp.com/biz/'+alias+'?sort_by=date_desc'
    try:
        response = requests.get(url)
    except requests.RequestException as err:
        print("request failed:", err)
        return failure
    if response.status_code != 200:
        print("status code != 200:", response.status_code)
        return failure

    response_page = BeautifulSoup(response.content,'lxml')
    review_abbre = response_page.find_all('script',type='application/ld+json')
    try:
        review_text = json.loads(review_abbre[-1].get_text())
        date = review_text.get('review')[0].get('datePublished')
    except parse_errors:
        print("1st try goes wrong")
        return failure

    review_list,rate_list = [],[]

    while len(review_list) < N and response.status_code == 200:
        total_author = response_page.find_all('div',{'class':"review review--with-sidebar"})
        for all_review in total_author:
            try:
                count = int(all_review.find('li',{'class':'review-count'}).find('b').get_text())
                if count < 10:
                    # Reviewer has written too few reviews; skip this one.
                    continue
                text = all_review.find('p',{'lang':'en'}).get_text()
                # The alt text starts with the rating, e.g. "4.0 star rating".
                rate = float(all_review.find('img',class_='offscreen').get('alt')[:3])
            except parse_errors:
                print("2nd try goes wrong")
                return failure
            # Append only after every field parsed so both lists stay aligned.
            review_list.append(text)
            rate_list.append(rate)

        next_links = response_page.find_all('a',{'class':'u-decoration-none next pagination-links_anchor'})
        if not next_links or not next_links[0].get('href'):
            # No further page: keep what has been collected so far.
            print("3rd try goes wrong")
            break
        try:
            response = requests.get(next_links[0].get('href'))
        except requests.RequestException as err:
            # Best effort: a failed follow-up page ends pagination instead of
            # discarding the reviews already gathered.
            print("request failed:", err)
            break
        response_page = BeautifulSoup(response.content,'lxml')

    # The last page may push the total past N; keep only the first N.
    review_list = review_list[:N]
    rate_list = rate_list[:N]

    reviews = ' '.join(review_list)
    # Avoid ZeroDivisionError when no review qualified.
    avg_rate = sum(rate_list)/len(rate_list) if rate_list else None

    return date,reviews,avg_rate

In [36]:
#get the date of the oldest review (reviews requested in ascending date order)
def get_old_date(alias):
    """Return the date of the first review listed in ascending date order.

    The business page is requested with ``sort_by=date_asc`` and the
    ``datePublished`` of the first review in the page's last JSON-LD
    ``<script>`` block is returned (presumably the oldest review -- this
    relies on the JSON-LD data following the requested sort order).

    Parameters
    ----------
    alias : str
        Business alias appended to ``https://www.yelp.com/biz/``.

    Returns
    -------
    str or None
        The ``datePublished`` value, or ``None`` when the request fails, the
        response status is not 200, or the page cannot be parsed.
    """
    url = 'https://www.yelp.com/biz/'+alias+'?sort_by=date_asc'
    try:
        response = requests.get(url)
    except requests.RequestException:
        # Treat a network failure like any other unavailable page.
        return None
    if response.status_code != 200:
        return None

    contents = BeautifulSoup(response.content,'lxml')
    review_abbre = contents.find_all('script',type='application/ld+json')
    try:
        review_text = json.loads(review_abbre[-1].get_text())
        return review_text.get('review')[0].get('datePublished')
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Missing script block, invalid JSON, or no 'review' entries.
        return None

In [None]:
#loop through the original data in batches of 20 rows, add the scraped
#columns to each batch and save every batch to its own pickle file
import os

df=pd.read_pickle('italian_551.pkl')
#make sure the output folder exists before any scraping work is done
os.makedirs('./review', exist_ok=True)
N=0
while N<len(df):
    #copy the slice so the new columns are written to an independent frame
    #instead of a view of df (avoids SettingWithCopyWarning)
    business_new = df.iloc[N:N+20].copy()
    business_new['old_date'] = business_new['alias'].apply(get_old_date)
    #pause between request bursts to try not to trigger the reCaptcha
    time.sleep(10)
    #each value is the (date, reviews, avg_rate) tuple returned by get_new_date
    new_date_review_rate = business_new['alias'].apply(get_new_date)
    business_new['new_date_review_rate'] = new_date_review_rate
    business_new['new_date']=new_date_review_rate.apply(lambda x:x[0])
    business_new['review']=new_date_review_rate.apply(lambda x:x[1])
    business_new['avg_rate']=new_date_review_rate.apply(lambda x:x[2])
    business_new.to_pickle("./review/italian_{}.pkl".format(N))
    time.sleep(10)
    #progress indicator: offset of the batch that was just saved
    print(N)
    N+=20

In [44]:
#combine all the per-batch dataframes (offsets 0, 20, ..., 540) into one
#read every batch first and concatenate once: calling pd.concat inside the
#loop would copy the growing dataframe again on every iteration
batch_frames = [pd.read_pickle('./review/italian_{}.pkl'.format(x)) for x in range(0,560,20)]
df=pd.concat(batch_frames)
df.to_pickle('italian_review_551.pkl')

In [None]:
#show the current contents of df as the cell output
df

In [90]:
#show the 'new_date_review_rate' column of df as the cell output
df['new_date_review_rate']

9       (2019-04-06, I met a friend at this restaurant...
10      (2019-04-09, Awesome food, delicious wines, ex...
15      (2019-04-15, Let me start off by saying that t...
24      (2019-04-13, This place was amazing!! Neapolit...
27      (2019-04-15, I was under the impressions based...
35      (2019-04-14, I visited this location yesterday...
42      (2019-04-16, Fumo is one of those classic lunc...
51      (2019-04-06, My wife and I had a wonderful mea...
61      (2019-02-22, In years of yore, Italians had th...
65      (2019-04-12, Walked in here for two slices and...
71      (2019-04-11, Small local restaurant.   For som...
73      (2019-04-14, I've eaten here twice and am very...
74      (2019-04-01, Always a great experience here. T...
83      (2019-02-19, (Based on a single Summer 2015 vi...
95      (2019-04-09, This fish ....caught me.  I highl...
96      (2019-04-16, Went with my friend and it was de...
120     (2019-04-10, The first time I was there, I had...
125     (2019-