In [1]:
from lxml import html  
import json
import requests
#from exceptions import ValueError
from time import sleep
from urllib.parse import quote, unquote
import re, urllib
import argparse
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def GetParser(url):
    response = requests.get(url).text
    parser = html.fromstring(response)
    return parser

In [3]:
def BusinessInfoScrapper(parser):
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category  = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")
    raw_neighborhood = parser.xpath("//div[@class='map-box-address u-space-l4']/span[@class='neighborhood-str-list']//text()")
    working_hours = []
    
    for hours in hours_table:
        raw_day = hours.xpath(".//th//text()")
        raw_timing = hours.xpath("./td//text()")
        day = ''.join(raw_day).strip()
        timing = ''.join(raw_timing).strip()
        working_hours.append({day:timing})
    info = []
    for details in details_table:
        raw_description_key = details.xpath('.//dt//text()')
        raw_description_value = details.xpath('.//dd//text()')
        description_key = ''.join(raw_description_key).strip()
        description_value = ''.join(raw_description_value).strip()
        info.append({description_key:description_value})

    ratings_histogram = [] 
    for ratings in rating_histogram:
        raw_rating_key = ratings.xpath(".//th//text()")
        raw_rating_value = ratings.xpath(".//td[@class='histogram_count']//text()")
        rating_key = ''.join(raw_rating_key).strip()
        rating_value = ''.join(raw_rating_value).strip()
        ratings_histogram.append({int(rating_key[0]):int(rating_value)})

    name = ''.join(raw_name).strip()
    phone = ''.join(raw_phone).strip()
    address = ' '.join(' '.join(raw_address).split())
    health_rating = ''.join(raw_health_rating).strip()
    price_range = ''.join(raw_price_range).strip()
    claimed_status = ''.join(raw_claimed).strip()
    reviews = int(''.join(raw_reviews).strip().replace(' reviews',''))
    category = ','.join(raw_category)
    cleaned_ratings = ''.join(raw_ratings).strip()

    if raw_wbsite_link:
        #pass
        decoded_raw_website_link = urllib.parse.unquote(raw_wbsite_link[0])
        website = re.findall("biz_redir\?url=(.*)&website_link",decoded_raw_website_link)[0]
    else:
        website = ''

    if raw_map_link:
        decoded_map_url =  urllib.parse.unquote(raw_map_link[0])
        map_coordinates = re.findall("center=([+-]?\d+.\d+,[+-]?\d+\.\d+)",decoded_map_url)[0].split(',')
        latitude = float(map_coordinates[0])
        longitude = float(map_coordinates[1])
    else:
        latitude = ''
        longitude = ''

    if raw_ratings:
        ratings = float(re.findall("\d+[.,]?\d+",cleaned_ratings)[0])
    else:
        ratings = 0

    if raw_neighborhood:
        neighborhood = ''.join(raw_neighborhood).strip()
    else:
        neighborhood = ''

    data={'working_hours':working_hours,
        'info':info,
        'ratings_histogram':ratings_histogram,
        'name':name,
        'phone':phone,
        'ratings':ratings,
        'address':address,
        'health_rating':health_rating,
        'price_range':price_range,
        'claimed_status':claimed_status,
        'reviews':reviews.replace(' reviews',''),
        'category':category,
        'website':website,
        'latitude':latitude,
        'longitude':longitude,
        'neighborhood': neighborhood,  
        'url':url
         }
    return data

In [4]:
def GetAllReivews(parser):
    '''Given the parsed first webpage of a restaurant on yelp, return all reviews of that restaurants'''
    review_dict = {'date': [], 'star': [], 'text': []}
    
    review_dates = parser.xpath("//div[@class='review-content']//span[@class='rating-qualifier']")
    for d in review_dates:
        date = ''.join(d.xpath(".//text()")).strip().split('\n')[0]
        review_dict['date'].append(date)    

    review_stars = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
    for s in review_stars:
        star = float(''.join(s.xpath(".//@title")).strip().replace(' star rating',''))
        review_dict['star'].append(star)
        
    review_texts = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/p")
    for t in review_texts:
        text = ' '.join(t.xpath(".//text()"))
        review_dict['text'].append(text)
    
    review = pd.DataFrame(review_dict)
    review['date'] =  pd.to_datetime(review['date'])
    
    review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")     
    review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages_section] if item != '' ]

    if 'Next' not in review_pages:
        return review
    else:
        nextpage = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')[0]
        nextparser = GetParser(nextpage)
        return review.append(GetAllReivews(nextparser), ignore_index=True)

In [5]:
url = 'https://www.yelp.com/biz/medi-by-bice-chicago'
#response = requests.get(url).text
parser = GetParser(url)
#BusinessInfoScrapper(parser)

In [6]:
BusinessInfoScrapper(parser)

{'address': '158 E Ontario St Chicago, IL 60611 b/t Fairbanks Ct & Mc Clurg Ct Near North Side, River East',
 'category': 'Italian',
 'claimed_status': 'Claimed',
 'health_rating': '',
 'info': [{'Takes Reservations': 'Yes'},
  {'Delivery': 'No'},
  {'Take-out': 'Yes'},
  {'Accepts Credit Cards': 'Yes'},
  {'Parking': 'Valet'},
  {'Good for Kids': 'No'},
  {'Good for Groups': 'Yes'},
  {'Attire': 'Dressy'},
  {'Alcohol': 'Full Bar'},
  {'Outdoor Seating': 'Yes'}],
 'latitude': '41.893624',
 'longitude': '-87.623224',
 'name': 'Medi By Bice',
 'neighborhood': 'Near North Side, River East',
 'phone': '(312) 664-1474',
 'price_range': '$31-60',
 'ratings': 3.5,
 'ratings_histogram': [{5: 19}, {4: 17}, {3: 15}, {2: 8}, {1: 9}],
 'reviews': '68',
 'url': 'https://www.yelp.com/biz/medi-by-bice-chicago',
 'website': 'http://www.medibybice.bicegroup.com',
 'working_hours': []}

In [37]:
d1 = {'a':'1', 'b':'2'}

In [39]:
d2 = {k:[v] for k, v in d1.items()}
d2

{'a': ['1'], 'b': ['2']}

In [40]:
test = pd.DataFrame(d2)

In [41]:
test

Unnamed: 0,a,b
0,1,2


In [42]:
test.to_csv('test'+'.csv', index=False)

In [7]:
reviews = GetAllReivews(parser).sort_values(by='date', ascending=False).reset_index(drop=True)
#reviews.sort_values(by='date', ascending=False).reset_index(drop=True)
reviews

Unnamed: 0,date,star,text
0,2010-12-27,1.0,I just got the word this place is CLOSED. We ...
1,2010-12-01,1.0,We saw this restaurant near our hotel and sinc...
2,2010-11-12,4.0,Went there for lunch and enjoyed their daily s...
3,2010-10-11,5.0,"Not sure what it is, but I think this place ha..."
4,2010-07-06,1.0,I had the worst dining experience of my life o...
5,2010-07-04,2.0,The food is definitely overpriced and bland. ...
6,2010-07-02,2.0,There is something about Italian chain restaur...
7,2010-06-16,2.0,"I feel like I went on a bad ""match date"" the o..."
8,2010-06-07,4.0,Just went back for a third time! Every time my...
9,2010-06-06,5.0,I love this place. The food is dependably grea...


In [63]:
reviews.reset_index(drop=True)

Unnamed: 0,date,star,text
0,2017-07-17,5.0,We are sad to hear of 42 grams closing. We had...
1,2017-05-16,5.0,A gold standard in fine dining. Jake and Alex...
2,2017-04-25,5.0,Wow... this is an experience. Probably top ...
3,2017-03-17,5.0,What an excellent dining experience! One of th...
4,2017-03-09,5.0,Of all the fine dining experiences I have had ...
5,2017-01-05,5.0,"Overall , amazing. Food worth all the praise p..."
6,2016-12-03,5.0,Fantastic. Extremely personable between the ch...
7,2016-11-22,5.0,Absurd. You will not forget eating here. One...
8,2016-11-12,5.0,42 grams provides a unique dining experience t...
9,2016-11-11,4.0,went there last year for our anniversary last....


In [50]:
def GetOnePageReivews(parser):
    import pandas as pd
    review_dict = {'date': [], 'star': [], 'text': []}
    
    review_dates = parser.xpath("//div[@class='review-content']//span[@class='rating-qualifier']")
    for d in review_dates:
        date = ''.join(d.xpath(".//text()")).strip().split('\n')[0]
        review_dict['date'].append(date)    

    review_stars = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
    for s in review_stars:
        star = float(''.join(s.xpath(".//@title")).strip().replace(' star rating',''))
        review_dict['star'].append(star)
        
    review_texts = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/p")
    for t in review_texts:
        text = ' '.join(t.xpath(".//text()"))
        review_dict['text'].append(text)
    
    review = pd.DataFrame(review_dict)
    review['date'] =  pd.to_datetime(review['date'])
    return review

In [56]:
OnePage = GetOnePageReivews(parser)
OnePage.sort_values(by='date', ascending=False)

Unnamed: 0,date,star,text
0,2017-07-17,5.0,We are sad to hear of 42 grams closing. We had...
1,2017-05-16,5.0,A gold standard in fine dining. Jake and Alex...
8,2017-04-25,5.0,Wow... this is an experience. Probably top ...
15,2017-03-17,5.0,What an excellent dining experience! One of th...
2,2017-03-09,5.0,Of all the fine dining experiences I have had ...
18,2017-01-05,5.0,"Overall , amazing. Food worth all the praise p..."
5,2016-11-12,5.0,42 grams provides a unique dining experience t...
7,2016-11-10,3.0,"Service was great, and the stories/description..."
3,2016-11-09,5.0,14 April 2016 - We were very fortunate to get ...
19,2016-11-06,5.0,Celebrating our 6th anniversary with my hero's...


In [49]:
' '.join(OnePage.iloc[0,2])

"We are sad to hear of 42 grams closing. We had dinner here November 2016 for Joe's birthday and it was one of the best fine dining experiences we've had. Set up as incredibly intimate setting with chef Jake Bickelhaupt, his sous chef, and Alexa Welsh, dinner consisted of us and 3 other couples that we got to know, engage and imbibe with during the delicious meal. Sitting at the counter, watching the food being prepared, and being able to chat with Chef Jake and Alexa was a rare treat. Hearing the backstory and inspiration for the dishes made each bite of the creative, inspired meal even more enjoyable. The BYOB \xa0was wonderful with recommendations for the meal suggested on the website. We ended up trying and sharing our beer, wine, and cider with the other couples there with us and and had such a good time we even ended up heading out to drinks afterwards with our new friends. We wish best of luck to Jake and Alexa and hope this closure is temporary!"

In [163]:
review = GetReivews(parser)
review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages_section] if item != '' ]
print(review_pages)
print(review)

while 'Next' in review_pages:
    #nextpage = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')
    nextlink = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')[0]
    print(nextlink)
    parser = GetParser(nextlink)
    review = review.append(GetReivews(parser),ignore_index=True)
    print(review)
    review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
    review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages_section] if item != '' ]
    print(review_pages)

    
#     return review
# else:
#     nextpage = parser.xpath('//div[@class="arrange arrange--baseline"]/div[@class="arrange_unit"]/a/@href')[0]
#     nextparser = GetParser(nextpage)
#     return review.append(GetReivews_reCurse(nextparser))

['Page1of4', '1', '2', '3', '4', 'Next']
          date             star  \
0    7/17/2017  5.0 star rating   
1    5/16/2017  5.0 star rating   
2     3/9/2017  5.0 star rating   
3    11/9/2016  5.0 star rating   
4    10/8/2016  5.0 star rating   
5   11/12/2016  5.0 star rating   
6    5/30/2016  5.0 star rating   
7   11/10/2016  3.0 star rating   
8    4/25/2017  5.0 star rating   
9    9/13/2016  5.0 star rating   
10    1/2/2016  5.0 star rating   
11    9/5/2016  5.0 star rating   
12   3/21/2016  5.0 star rating   
13   11/5/2016  5.0 star rating   
14   8/11/2016  5.0 star rating   
15   3/17/2017  5.0 star rating   
16   12/3/2015  4.0 star rating   
17  12/15/2015  4.0 star rating   
18    1/5/2017  5.0 star rating   
19   11/6/2016  5.0 star rating   

                                                 text  
0   [We are sad to hear of 42 grams closing. We ha...  
1   [A gold standard in fine dining. , Jake and Al...  
2   [Of all the fine dining experiences I have had...  

          date             star  \
0    7/17/2017  5.0 star rating   
1    5/16/2017  5.0 star rating   
2     3/9/2017  5.0 star rating   
3    11/9/2016  5.0 star rating   
4    10/8/2016  5.0 star rating   
5   11/12/2016  5.0 star rating   
6    5/30/2016  5.0 star rating   
7   11/10/2016  3.0 star rating   
8    4/25/2017  5.0 star rating   
9    9/13/2016  5.0 star rating   
10    1/2/2016  5.0 star rating   
11    9/5/2016  5.0 star rating   
12   3/21/2016  5.0 star rating   
13   11/5/2016  5.0 star rating   
14   8/11/2016  5.0 star rating   
15   3/17/2017  5.0 star rating   
16   12/3/2015  4.0 star rating   
17  12/15/2015  4.0 star rating   
18    1/5/2017  5.0 star rating   
19   11/6/2016  5.0 star rating   
20   7/22/2016  5.0 star rating   
21  11/11/2016  4.0 star rating   
22   12/3/2016  5.0 star rating   
23  11/22/2016  5.0 star rating   
24   4/26/2016  3.0 star rating   
25   11/6/2016  1.0 star rating   
26   11/3/2016  1.0 star rating   
27   7/30/2015  5.0 

In [138]:
review

Unnamed: 0,date,star,text
0,7/17/2017,5.0 star rating,[We are sad to hear of 42 grams closing. We ha...
1,5/16/2017,5.0 star rating,"[A gold standard in fine dining. , Jake and Al..."
2,3/9/2017,5.0 star rating,[Of all the fine dining experiences I have had...
3,11/9/2016,5.0 star rating,"[14 April 2016 -, We were very fortunate to ge..."
4,10/8/2016,5.0 star rating,[***The very personal and intimate 42 grams is...
5,11/12/2016,5.0 star rating,[42 grams provides a unique dining experience ...
6,5/30/2016,5.0 star rating,[This was the best meal have had in Chicago in...
7,11/10/2016,3.0 star rating,"[Service was great, and the stories/descriptio..."
8,4/25/2017,5.0 star rating,[Wow... this is an experience. Probably top...
9,9/13/2016,5.0 star rating,[Wow. I made a reservation 3 months in advance...


In [135]:
a = 1
while a < 5:
    a = a+1
    print(a)

2
3
4
5


In [127]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [129]:
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df2

Unnamed: 0,A,B
0,5,6
1,7,8


In [130]:
df.append(df2, i)

Unnamed: 0,A,B
0,1,2
1,3,4
0,5,6
1,7,8


In [131]:
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [11]:
reviews = GetReivews_reCurse(parser)

In [12]:
reviews

Unnamed: 0,date,star,text
0,7/17/2017,5.0 star rating,[We are sad to hear of 42 grams closing. We ha...
1,5/16/2017,5.0 star rating,"[A gold standard in fine dining. , Jake and Al..."
2,3/9/2017,5.0 star rating,[Of all the fine dining experiences I have had...
3,11/9/2016,5.0 star rating,"[14 April 2016 -, We were very fortunate to ge..."
4,10/8/2016,5.0 star rating,[***The very personal and intimate 42 grams is...
5,11/12/2016,5.0 star rating,[42 grams provides a unique dining experience ...
6,5/30/2016,5.0 star rating,[This was the best meal have had in Chicago in...
7,11/10/2016,3.0 star rating,"[Service was great, and the stories/descriptio..."
8,4/25/2017,5.0 star rating,[Wow... this is an experience. Probably top...
9,9/13/2016,5.0 star rating,[Wow. I made a reservation 3 months in advance...


In [117]:
review_pages = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
review_pages

['\n                    ',
 '\n        Page 1 of 4\n    ',
 '\n\n                        ',
 '\n        ',
 '\n\n                    ',
 '\n                        ',
 '1',
 '\n                    ',
 '\n                    ',
 '\n                        ',
 '\n                            2\n                        ',
 '\n                    ',
 '\n                    ',
 '\n                        ',
 '\n                            3\n                        ',
 '\n                    ',
 '\n                    ',
 '\n                        ',
 '\n                            4\n                        ',
 '\n                    ',
 '\n\n                ',
 '\n                    ',
 '\n                        ',
 'Next',
 '\n                        ',
 '\n    ',
 '\n        ',
 '\n    ',
 '\n',
 '\n                    ',
 '\n                ',
 '\n        ',
 '\n    ',
 '\n\n            ']

In [86]:
review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages] if item != '' ]

In [87]:
review_pages

['Page1of4', '1', '2', '3', '4', 'Next']

In [141]:
if 'Next' in review_pages:
    nextpage = parser.xpath('//div[@class="arrange arrange--baseline"]/div[@class="arrange_unit"]/a/@href')[0]

In [142]:
review_pages

['Page1of4', '1', '2', '3', '4', 'Next']

In [143]:
'Next' in review_pages

True

In [159]:
parser = GetParser('https://www.yelp.com/biz/42-grams-chicago?start=40')

In [160]:
nextpage = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href') #.extract()
#nextpage = parser.xpath('//div[@class="arrange arrange--baseline"]/div[@class="arrange_unit"]/a[@class="u-decoration-none next pagination-links_anchor"]/@href') #.extract()
nextpage

['https://www.yelp.com/biz/42-grams-chicago?start=60']

In [118]:
p2 = GetParser(nextpage)

In [120]:
GetReivews(p2)

Unnamed: 0,date,star,text
0,7/22/2016,5.0 star rating,[Loved Alexa and Jake. The setup is very comfo...
1,11/11/2016,4.0 star rating,[went there last year for our anniversary last...
2,12/3/2016,5.0 star rating,[Fantastic. Extremely personable between the c...
3,11/22/2016,5.0 star rating,[Absurd. You will not forget eating here. On...
4,4/26/2016,3.0 star rating,[Dined here on our recent trip to Chicago last...
5,11/6/2016,1.0 star rating,[I can't believe this place has 2 stars. Alexi...
6,11/3/2016,1.0 star rating,"[we booked months in advance, of course they i..."
7,7/30/2015,5.0 star rating,[Doesn't get much better than this. 13 course...
8,11/21/2014,1.0 star rating,[I decided this was where I wanted to celebrat...
9,7/27/2014,5.0 star rating,[An intimate and adventurous experience. All ...


In [70]:
zaidis = 'https://www.yelp.com/biz/zaidis-naperville'
response = requests.get(zaidis).text
parser = html.fromstring(response)
review_pages = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
#review_pages =[item for item in [e.replace('\n','').replace(' ','') for e in review_pages] if item != '' ]
review_pages

['\n                    ', '\n        Page 1 of 1\n    ', '\n\n            ']

In [34]:
''.join(review_pages).strip()

'Page1of41234Next'

In [17]:
for element in review_pages:
    element.replace('\n','')

In [20]:
review_pages[0].replace('\n','')

'                    '

In [15]:
''.join(review_pages).strip()

'Page 1 of 4\n    \n\n                        \n        \n\n                    \n                        1\n                    \n                    \n                        \n                            2\n                        \n                    \n                    \n                        \n                            3\n                        \n                    \n                    \n                        \n                            4\n                        \n                    \n\n                \n                    \n                        Next'

In [93]:
review_ratings = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
                              
                              

In [94]:
for r in review_ratings:
    rating = ''.join(r.xpath(".//@title")).strip()
    review_star.append(rating)
review_star

['5.0 star rating',
 '4.0 star rating',
 '3.0 star rating',
 '4.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '4.0 star rating',
 '5.0 star rating',
 '3.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '4.0 star rating',
 '5.0 star rating',
 '5.0 star rating',
 '4.0 star rating']

In [80]:
rdates[11].split('\n')[0]

'7/19/2017'