In [1]:
import glob
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Folder that holds one sub-folder per restaurant; ReviewsParser globs this
# path for the folders and then reads the *.html review pages inside each.
HTML_PATH = "./html/"

# Parse reviews

## Define class
This class parses the saved HTML files of reviews and extracts the following information:
- id: unique id of restaurant on Tripadvisor
- reviewDate: date of the review
- reviewRating: rating of the review
- title: title of the review
- content: content of the review (text)

In [3]:
class ReviewsParser():
    """Collect review fields from saved Tripadvisor review pages into a DataFrame.

    Every sub-folder of ``html_path`` is treated as one restaurant: the first
    run of digits in the folder name becomes the restaurant id, and every
    ``*.html`` file inside the folder is scanned for review blocks.

    Call ``parseRestaurants()`` and read the result from ``self.df``, whose
    columns are ``id``, ``reviewDate``, ``reviewRating``, ``title`` and
    ``content``. A field that cannot be found in a review is stored as ``None``.
    """

    def __init__(self, html_path=None):
        """Locate the restaurant folders; no HTML is read yet.

        Args:
            html_path: Directory holding one sub-folder per restaurant.
                Defaults to the notebook-level ``HTML_PATH`` constant.
        """
        if html_path is None:
            # Looked up at call time so the notebook constant stays the default.
            html_path = HTML_PATH
        self.html_path = html_path

        pattern = os.path.join(glob.escape(html_path), "*")
        # glob returns entries in arbitrary order; sorting makes the row order
        # of the resulting DataFrame reproducible. Plain files are not
        # restaurant folders, so only directories are kept.
        self.folders = sorted(
            path for path in glob.glob(pattern) if os.path.isdir(path)
        )
        self.restaurantId = ""
        self.df = None  # filled by makeDataframe()
        self._resetResults()

    def _resetResults(self):
        """Start with empty, equally long column lists."""
        self.restaurantId_list = []
        self.reviewDate_list = []
        self.rating_list = []
        self.title_list = []
        self.content_list = []

    def parseRestaurants(self):
        """Parse every restaurant folder and build ``self.df``.

        Results of an earlier call are discarded first, so re-running the cell
        does not duplicate rows. Folders whose name contains no digits are
        skipped because no restaurant id can be derived from them.
        """
        self._resetResults()
        total = len(self.folders)
        for i, folder in enumerate(self.folders):
            # Search the folder name only: digits in a parent directory must
            # not be mistaken for the restaurant id.
            id_match = re.search(r'\d+', os.path.basename(folder))
            if id_match is not None:
                self.restaurantId = id_match.group()
                page_pattern = os.path.join(glob.escape(folder), "*.html")
                for file in sorted(glob.glob(page_pattern)):
                    self.parseReviews(file)

            print(f"\rDone: {i+1}/{total}", end="", flush=True)

        self.makeDataframe()

    def parseReviews(self, file):
        """Extract all reviews of one saved page and append them to the lists.

        Args:
            file: Path of a UTF-8 encoded HTML file.
        """
        # The context manager closes the handle; BeautifulSoup has consumed
        # the markup by the time the constructor returns.
        with open(file, encoding="utf-8") as handle:
            soup = BeautifulSoup(handle, "html.parser")
        reviews = soup.find_all("div", class_="ui_column is-9")

        # Compute every column before touching the shared lists, so an error
        # in one getter cannot leave the lists with different lengths.
        dates = [self.getDate(review) for review in reviews]
        ratings = [self.getRating(review) for review in reviews]
        titles = [self.getTitle(review) for review in reviews]
        contents = [self.getContent(review) for review in reviews]

        self.restaurantId_list += [self.restaurantId] * len(reviews)
        self.reviewDate_list += dates
        self.rating_list += ratings
        self.title_list += titles
        self.content_list += contents

    def _tagText(self, review, name, class_):
        """Return the stripped text of the first matching tag, or None if absent."""
        tag = review.find(name, class_=class_)
        if tag is None:
            return None
        return tag.get_text(strip=True)

    # date
    def getDate(self, review):
        """Return the review date as a ``datetime.date``, or None when missing.

        The text after the first colon is parsed as full month name plus year
        (``'%B %Y'``), so the day of the returned date is always 1.

        Raises:
            ValueError: If a date is present but does not match ``'%B %Y'``.
                Note that ``%B`` depends on the current locale.
        """
        text = self._tagText(review, "div", "prw_rup prw_reviews_stay_date_hsx")
        if text is None:
            return None
        parts = text.split(":")
        if len(parts) == 1:
            return None
        # strptime does not tolerate leading/trailing blanks around the value.
        value = parts[1].strip()
        if not value:
            return None
        return datetime.strptime(value, '%B %Y').date()

    # rating
    def getRating(self, review):
        """Return the rating as an int, or None when it cannot be determined.

        The rating is read from the ``bubble_<number>`` CSS class of the rating
        span and integer-divided by 10 (e.g. ``bubble_50`` gives 5).
        """
        rating_tag = review.find("span", class_="ui_bubble_rating")
        if rating_tag is None:
            return None
        # Find the class by its prefix instead of assuming it sits at index 1.
        for css_class in rating_tag.get("class") or []:
            score = re.fullmatch(r"bubble_(\d+)", css_class)
            if score is not None:
                return int(score.group(1)) // 10
        return None

    # title
    def getTitle(self, review):
        """Return the stripped review title, or None when the tag is missing."""
        return self._tagText(review, "span", "noQuotes")

    # content
    def getContent(self, review):
        """Return the stripped review text, or None when the tag is missing."""
        return self._tagText(review, "p", "partial_entry")

    def makeDataframe(self):
        """Assemble the collected column lists into ``self.df``."""
        self.df = pd.DataFrame({
            "id": self.restaurantId_list,
            "reviewDate": self.reviewDate_list,
            "reviewRating": self.rating_list,
            "title": self.title_list,
            "content": self.content_list
        })

## Run the parser

In [4]:
# Build the parser; the constructor only lists the restaurant folders, no HTML is read yet.
p = ReviewsParser()

In [5]:
# Parse the saved pages of every restaurant folder; prints a running
# "Done: i/total" counter and leaves the result in p.df.
p.parseRestaurants()

Done: 293/293

In [8]:
# Save the parsed reviews as a UTF-8 CSV; index=False omits the DataFrame row index.
p.df.to_csv("review_data.csv", encoding='utf-8', index=False)