## Collecting Yelp Reviews

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import re
import time



In [84]:
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
start_page = 420
#url="https://www.yelp.ca/biz/meet-on-main-vancouver?start="+str(start_page)+"&sort_by=date_desc"
url = "https://www.yelp.ca/biz/meet-on-main-vancouver?"
response = requests.get(url, headers=headers)

In [85]:
content = response.content
soup = BeautifulSoup(content,"lxml")

In [86]:
reviews = json.loads(soup.find('script', type='application/ld+json').string)['review']
reviews[0]

{'author': 'Tyson L.',
 'datePublished': '2021-06-26',
 'reviewRating': {'ratingValue': 5},
 'description': 'Rarely will you find MeeT empty -- an arbitrary indicator of popularity if you care about that -- as the vegan food is of high quality with solid portions. \n\nAfter multiple visits, some with Vegans, others with the &quot;oh but you need animal protein to survive cohort&quot;, 90% of guests would recommend the restaurant for its all-inclusive, varying menu.\n\nMy test for a good restaurant is whether I could replicate the dishes and/or be willing to create the dishes, and I could not with the same flavors.\n\nPersonally a big fan of the Macro bowl or the Meet burger, but most friends are content with the selection of poutines and the Mighty Mac.\n\nGood quality food is good quality food, and whether you&apos;re vegan or not you will be satisfied'}

In [55]:
sample_rev = reviews[0]['description']
sample_rev

'Rarely will you find MeeT empty -- an arbitrary indicator of popularity if you care about that -- as the vegan food is of high quality with solid portions. \n\nAfter multiple visits, some with Vegans, others with the &quot;oh but you need animal protein to survive cohort&quot;, 90% of guests would recommend the restaurant for its all-inclusive, varying menu.\n\nMy test for a good restaurant is whether I could replicate the dishes and/or be willing to create the dishes, and I could not with the same flavors.\n\nPersonally a big fan of the Macro bowl or the Meet burger, but most friends are content with the selection of poutines and the Mighty Mac.\n\nGood quality food is good quality food, and whether you&apos;re vegan or not you will be satisfied'

In [20]:
regex = r'&.{4};'
unwanted = set()
punc = '''!()-[]{};:'"\,<>./?@#$^&*_~'''

In [None]:
for i in re.finditer(regex, sample_rev):
    unwanted.add(i.group())

In [2]:
def clean_sent(text):
    cleaned = []
    for word in text.split():
        if not word.isalpha():
            for unwanted_string in unwanted:
                if unwanted_string in word:
                    word = word.replace(unwanted_string, "")
            word = word.strip(punc)
            cleaned.append(word)
        else:
            cleaned.append(word)
    
    return " ".join(cleaned)

In [58]:
clean_sent(sample_rev)

'Rarely will you find MeeT empty  an arbitrary indicator of popularity if you care about that  as the vegan food is of high quality with solid portions After multiple visits some with Vegans others with the oh but you need animal protein to survive cohort 90% of guests would recommend the restaurant for its all-inclusive varying menu My test for a good restaurant is whether I could replicate the dishes and/or be willing to create the dishes and I could not with the same flavors Personally a big fan of the Macro bowl or the Meet burger but most friends are content with the selection of poutines and the Mighty Mac Good quality food is good quality food and whether youre vegan or not you will be satisfied'

In [3]:
def get_rating(rating_dict):
    return rating_dict['ratingValue']

In [60]:
yelp_df = pd.DataFrame.from_dict(reviews)
yelp_df['reviewRating'] = yelp_df['reviewRating'].apply(get_rating)
yelp_df['description'] = yelp_df['description'].apply(clean_sent)
yelp_df

Unnamed: 0,author,datePublished,reviewRating,description
0,Tyson L.,2021-06-26,5,Rarely will you find MeeT empty an arbitrary ...
1,Melissa E.,2021-10-09,4,Im not a vegan but I love trying innovative ve...
2,Matt B.,2021-07-30,4,Im much more sold on this place nowadays I ord...
3,Jill N.,2021-03-18,5,Went here for St Patties Day Our server Derek ...
4,Maling S.,2020-09-17,5,Sept 17 and went with my brother for vegan foo...
5,Dana S.,2021-02-18,5,Ive ordered takeout from the Yaletown location...
6,Michelle H.,2020-05-08,4,I havent had this is YEARS I used to be SO obs...
7,Jasmine Y.,2021-09-15,5,Ordered takeout online to the Main Street loca...
8,Aaron W.,2021-05-24,1,Saw 3 patio tables empty and barely any line u...
9,Jennifer B.,2020-03-12,5,Love the MeeT restaurants in Vancouver Ive eat...


In [141]:
yelp_df.to_csv('yelp_revs.csv', index=False)

### Collecting reviews from multiple restaurants

Get the links from different restaurants. Here, I searched for Vegan restaurants in Vancouver.

In [4]:
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
start_page = 420
url = "https://www.yelp.ca/search?find_desc=Vegan%20Restaurants&find_loc=Vancouver%2C%20BC"
response = requests.get(url, headers=headers)

In [5]:
content = response.content
soup = BeautifulSoup(content,"lxml")

In [6]:
resto_links = []
for a in soup.find_all('a', {"class": "css-1f2a2s6"}):
    if a['href'][1:4] == 'biz':
        resto_links.append((a['name'], a['href']))
        
resto_links

[('MILA Plant-Based', '/biz/mila-plant-based-vancouver?osq=Vegan+Restaurants'),
 ('Lotus Seed Vegan', '/biz/lotus-seed-vegan-vancouver?osq=Vegan+Restaurants'),
 ('MeeT in Gastown', '/biz/meet-in-gastown-vancouver?osq=Vegan+Restaurants'),
 ('Vegan Cave Cafe', '/biz/vegan-cave-cafe-vancouver?osq=Vegan+Restaurants'),
 ('BeetBox', '/biz/beetbox-vancouver?osq=Vegan+Restaurants'),
 ('Nuba in Gastown', '/biz/nuba-in-gastown-vancouver-2?osq=Vegan+Restaurants'),
 ('Tama Organic Life',
  '/biz/tama-organic-life-vancouver?osq=Vegan+Restaurants'),
 ('Do Chay', '/biz/do-chay-vancouver-2?osq=Vegan+Restaurants'),
 ('CHAU Veggie Express',
  '/biz/chau-veggie-express-vancouver-2?osq=Vegan+Restaurants'),
 ('Buddha-Full', '/biz/buddha-full-north-vancouver?osq=Vegan+Restaurants')]

Now we do the same thing as Part 1 but we will get the reviews from each of the links instead of just one link.

In [7]:
resto_revs = []
for name, link in resto_links:
    url = 'https://www.yelp.ca' + na
    response = requests.get(url, headers=headers)
    content = response.content
    soup = BeautifulSoup(content, "lxml")
    reviews = json.loads(soup.find('script', type='application/ld+json').string)['review']
    for rev in reviews:
        rev['name'] = name
        
    resto_revs.extend(reviews)

NameError: name 'resto' is not defined

In [117]:
all_reviews = ""
for rev in resto_revs:
    all_reviews = all_reviews + rev['description'] + " "

In [128]:
regex = r'(&.{4};|&.{4})'
unwanted = set()
punc = '''!()-[]{};:'"\,<>./?@#$^&*_~'''

In [129]:
for i in re.finditer(regex, all_reviews):
    unwanted.add(i.group())
    
unwanted

{'&apos;', '&quot;'}

In [130]:
yelp_df = pd.DataFrame.from_dict(resto_revs)
yelp_df['reviewRating'] = yelp_df['reviewRating'].apply(get_rating)
yelp_df['description'] = yelp_df['description'].apply(clean_sent)
yelp_df

Unnamed: 0,author,datePublished,reviewRating,description,name
0,Mateo D.,2020-07-07,5,Amazing service by Shefali Jess and Jaycobe Th...,MILA Plant-Based
1,Nicole B.,2021-01-01,3,Went to the North van location yesterday and f...,MILA Plant-Based
2,Paula F.,2019-08-14,4,Wonderful little vegan vegetarian eatery next ...,MILA Plant-Based
3,Josephine W.,2020-03-08,4,I dropped in to Buddhafull during the Hot Choc...,MILA Plant-Based
4,Meredith D.,2021-03-01,4,Have been in here a couple times and hadnt rev...,MILA Plant-Based
...,...,...,...,...,...
195,Emily D.,2018-01-21,1,This is my first visit Very sad first impressi...,Tama Organic Life
196,Sangeetha M.,2018-02-14,3,Tbh I went to take pictures on the swing The d...,Tama Organic Life
197,Thayssa S.,2019-06-11,5,Very healthy food and tasty I love their açai ...,Tama Organic Life
198,Simon W.,2017-05-08,2,The place has got an eat-pray-love vibe to it ...,Tama Organic Life


In [131]:
yelp_df.to_csv('yelp_revs.csv', index=False)

## Selenium