In [12]:
import csv
import datetime as dt
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [13]:
"""
This function is to parse out content using a regular expression pattern from a string.
<param>regex_pattern</param>
<param>string</param>
"""

def parse_definition(regex_pattern, string):
    """Return the first capture group of ``regex_pattern`` found in ``string``.

    The search runs with ``re.MULTILINE`` and ``re.DOTALL`` enabled, so ``.``
    also matches newlines and ``^``/``$`` anchor at every line.

    <param>regex_pattern</param> pattern text; it must define a group 1.
    <param>string</param> the text to search.

    Returns the text captured by group 1 of the first match, or the literal
    string "None" (not the ``None`` object) when nothing matches.
    """
    found = re.search(regex_pattern, string, flags=re.MULTILINE | re.DOTALL)
    # Callers compare against / convert the "None" string, so keep the sentinel.
    return found.group(1) if found else "None"

In [14]:
# Example store: the Yelp business page that is requested first and whose
# reviews are scraped by the cells below.
url = "https://www.yelp.com/biz/kempt-mens-hair-irvine-2?osq=kempt"

In [15]:
# Column-oriented store for the scraped reviews: every key starts with its own
# empty list and the scraping loop appends one value per review to each list.
my_dict = {
    column: []
    for column in (
        "user",
        "body_review",
        "rating",
        "possible_rating",
        "location",
        "date_review",
        "useful",
        "id_review",
        "source",
    )
}

In [16]:
#Grabbing HTML content
# Fetch the first page of the listing. A timeout keeps the cell from hanging
# forever, and raise_for_status() stops here on an HTTP error instead of
# parsing an error/captcha page as if it were the business page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.text, 'html.parser')

# Number of reviews assumed per listing page; used only to estimate page count.
reviews_per_page = 20

# parse_definition returns the string "None" when the pattern is absent, which
# would otherwise surface as an opaque int("None") conversion error.
review_count_text = parse_definition(r'\{"reviewCount": (\d+),', response.text)
if review_count_text == "None":
    raise ValueError(
        f"Could not find a reviewCount in the page at {url}; "
        "the page layout may have changed or the request was blocked."
    )
total_reviews = int(review_count_text)

# Ceiling division: int(total / per_page + 1) reported one page too many when
# the total was an exact multiple of reviews_per_page (e.g. 40 -> 3).
# At least one page is always scanned, matching the previous behavior for 0.
max_pages_to_scrape = max(1, -(-total_reviews // reviews_per_page))

yelp_reviews = soup.find_all("div", attrs={'class': 'review review--with-sidebar'})

title_tag = soup.find("h1", attrs={'class':'biz-page-title'})
if title_tag is None:
    raise ValueError(f"Could not find the business title (h1.biz-page-title) in the page at {url}.")
store_name = title_tag.text.strip()

# 1-based counter of the review page currently held in `soup`.
review_pages = 1

In [18]:
# Report the review count read from the page and the derived number of pages.
print(
    f"Total Reviews: {total_reviews}",
    f"Expected Review Pages: {max_pages_to_scrape}",
    sep="\n",
)

Total Reviews: 70
Expected Review Pages: 4


In [19]:
# Walk the review pages one at a time, appending one entry per review to every
# list in my_dict. Review ids already collected (including those from an
# earlier run of this cell) are skipped so no review is stored twice.
seen_review_ids = set(my_dict["id_review"])

while review_pages <= max_pages_to_scrape:
    print(f"Review Page: {review_pages}")
    yelp_reviews = soup.find_all("div", attrs={'class': 'review review--with-sidebar'})
    #looping through each review to get the content
    for review in yelp_reviews:
        # Several fields are pulled with regexes from the review's raw HTML.
        review_html = str(review)

        user = review.find('a', attrs={'id':'dropdown_user-name'}).text
        body_review = review.find("p", attrs={'lang':'en'}).text
        # Raw strings: the same patterns as before, without invalid "\d"/"\s"
        # escape sequences inside ordinary string literals.
        rating = parse_definition(r'i-stars i-stars--regular-[^"]+" title="([\d\.]+)[^"]+"', review_html)
        location = review.select(".user-location > b")[0].text
        date_review = parse_definition(r'<span class="rating-qualifier">\s*([\d\/]+)[^<]+<', review_html)
        useful = parse_definition(r'Useful<\/span>\s*<span class="count">(\d+)<', review_html)
        id_review = parse_definition(r'data-review-id="([^"]+)"', review_html)

        # Skip reviews that were already stored (the original `pass` fell
        # through and appended the duplicate anyway). "None" is the sentinel
        # parse_definition returns for a missing id, so it is never used as a
        # deduplication key.
        if id_review != "None":
            if id_review in seen_review_ids:
                continue
            seen_review_ids.add(id_review)

        my_dict["user"].append(user)
        my_dict["body_review"].append(body_review)
        # float() raises ValueError if no rating was found (rating == "None").
        my_dict["rating"].append(float(rating))
        my_dict["possible_rating"].append(5.0)
        my_dict["location"].append(location)
        my_dict["date_review"].append(date_review)
        my_dict["useful"].append(useful)
        my_dict["id_review"].append(id_review)
        my_dict["source"].append("yelp")

    review_pages += 1
    if review_pages > max_pages_to_scrape:
        # Every expected page has been processed; do not fetch another one.
        break

    next_link = soup.find("a", class_="u-decoration-none next pagination-links_anchor")
    next_page_url = next_link.get("href") if next_link is not None else None
    if not next_page_url:
        # No "next" link on this page: this was the last page of reviews.
        break

    # Sleep for 3 seconds to avoid blocks
    time.sleep(3)
    try:
        response = requests.get(next_page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # Catch only network/HTTP failures and report them, instead of a bare
        # except that silently hid every error (including KeyboardInterrupt).
        print(f"Stopping early: could not fetch review page {review_pages}: {error}")
        break
    soup = bs(response.text, 'html.parser')

Review Page: 1
Review Page: 2
Review Page: 3
Review Page: 4


In [20]:
# Sanity check on the scrape: the loop appends one user per stored review, so
# the number of users collected should equal the review count read from the page.
users = my_dict["user"]
if len(users) == total_reviews:
    print(f"Scrape completed! Length of users, {len(users)}, matches with the total reviews, {total_reviews}.")
else:
    print(f"Please check crawler. Length of users, {len(users)}, don't match total reviews, {total_reviews}.")

Scrape completed! Length of users, 70, matches with the total reviews, 70.


In [23]:
# Turn the column lists into a table (one row per review) and preview the
# first five rows as the cell output.
yelp_reviews_df = pd.DataFrame(data=my_dict)
yelp_reviews_df.head(n=5)

Unnamed: 0,body_review,date_review,id_review,location,possible_rating,rating,source,useful,user
0,"I've been waiting for my 3rd cut to post this,...",5/30/2018,VQhZMCY6b7jKyr-HDbrRaQ,"Irvine, CA",5.0,5.0,yelp,,Brian L.
1,I have been to KEMPT twice and both experience...,5/2/2018,jaL8RSQ0SsxagLiNq0G1Pg,"Irvine, CA",5.0,4.0,yelp,,Daniel E.
2,"I went on yelp, messaged 3 different salons an...",4/27/2018,O2hNoxxyvwPONcWoWkBp0w,"Santa Ana, CA",5.0,5.0,yelp,,jesse n.
3,Found this place for my boyfriend and let me t...,6/14/2018,7b-FVnYllrQGBbivJlP4UQ,"Irvine, CA",5.0,5.0,yelp,,Olivia C.
4,Well i feel really satisfied with my new hair ...,6/15/2018,8xg0pEpGQU1L8sNH2s6LCg,"Westminster, CA",5.0,5.0,yelp,,José A.


In [22]:
# Saving into a csv file
# Create the output folder first: to_csv does not create missing directories,
# so the export failed on a fresh checkout without a "reviews" folder.
output_dir = Path("reviews")
output_dir.mkdir(parents=True, exist_ok=True)

# Spaces become underscores as before; "/" is replaced as well so a store name
# containing a slash cannot be interpreted as a sub-directory.
csv_name = store_name.replace(' ', '_').replace('/', '_')

# The DataFrame index is still written as the first, unnamed column.
yelp_reviews_df.to_csv(output_dir / f"{csv_name}.csv", encoding='utf-8')