In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import pandas as pd
import time 
import re
import os

In [2]:
# Site root; relative restaurant links are appended to it to build full URLs.
BASE_URL = "https://www.tripadvisor.com"
# Browser-like User-Agent sent with the `requests` search queries.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Path to the ChromeDriver executable handed to Selenium when the browser is started.
CHROME_DRIVER = "./chromedriver-win64/chromedriver.exe"

# Make restaurants list

## class: RestaurantListGetter

This class collects the list of restaurants that satisfy all of the following conditions:
- located in Zurich
- offers dinner
- offers Swiss dishes

In [1]:
class RestaurantListGetter():
    """Collect rank, name and relative link of restaurants from TripAdvisor search pages.

    Each call to `addInfo` downloads one page of search results and appends what
    it finds to `rank_list`, `name_list` and `html_list`. `makeDataFrame` fetches
    several pages in a row and assembles the three lists into `self.df`.
    """

    def __init__(self):
        # The three lists are index-aligned: entry i of each describes the same restaurant.
        self.rank_list = []
        self.name_list = []
        self.html_list = []

    @staticmethod
    def _parseRankAndName(text):
        """Split link text such as "12.Some Name" into `(rank, name)`.

        Text that does not start with "<digits>." is treated as unranked and
        returned as `(0, text)`. This also covers names that merely contain a
        dot (e.g. "St. Gallen Stube"), which would otherwise fail in `int()`.
        Surrounding whitespace is removed from the name.
        """
        match = re.fullmatch(r"\s*(\d+)\s*\.(.*)", text, re.DOTALL)
        if match:
            return int(match.group(1)), match.group(2).strip()
        return 0, text.strip()

    def addInfo(self, offset):
        """Fetch one page of search results and append its restaurants to the lists.

        Args:
            offset: value of the "o" query parameter selecting the result page
                (`makeDataFrame` passes "a0", "a30", ...).

        A failed request (network error, timeout or HTTP error status) is
        reported with `print` and the page is skipped; nothing is appended.
        """
        # set url and params
        # NOTE(review): the ids below are presumed to encode the Zurich / dinner /
        # Swiss-cuisine filters described in the notebook text — confirm.
        url = "https://www.tripadvisor.com/RestaurantSearch"
        params = {
            "Action": "PAGE",
            "ajax": "1",
            "availSearchEnabled": "true",
            "sortOrder": "relevance",
            "geo": "188113",
            "itags": "10591,16556",
            "cat": "10628",
            "zfp": "10599",
            "eaterydate": "2023_10_01",
            "date": "2023-10-02",
            "time": "20:00:00",
            "people": "2",
            "o": offset
        }

        # request url; return early on failure so `r` is never used unbound
        try:
            r = requests.get(url, params=params, headers=HEADERS, timeout=10)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            return

        # add info: one entry per result link, appended to all three lists together
        soup = BeautifulSoup(r.content, "html.parser")
        for a_tag in soup.find_all("a", class_="Lwqic Cj b"):
            rank, name = self._parseRankAndName(a_tag.get_text(strip=True))
            self.html_list.append(a_tag.get("href"))
            self.rank_list.append(rank)
            self.name_list.append(name)

    def makeDataFrame(self, n_pages=10, page_size=30, delay=1):
        """Fetch `n_pages` result pages and store the collected rows in `self.df`.

        Args:
            n_pages: number of result pages to request.
            page_size: step between consecutive page offsets.
            delay: seconds to sleep after each request, to go easy on the server.

        The defaults reproduce the original fixed behaviour (10 pages, offsets
        in steps of 30, one-second pause). Results are appended to whatever the
        lists already contain.
        """
        for i in range(n_pages):
            offset = "a" + str(page_size * i)
            self.addInfo(offset)
            time.sleep(delay)

        self.df = pd.DataFrame({
            "rank": self.rank_list,
            "name": self.name_list,
            "html": self.html_list
        })
        

## make restaurants list

In [236]:
# Download the search result pages and collect them into rl.df (performs network requests).
rl = RestaurantListGetter()
rl.makeDataFrame()

In [241]:
# Persist the restaurant table so the scraping cells below can reload it from disk.
rl.df.to_csv("./restaurantList.csv", index=False)

# Get html files

## class: HtmlGetter

This class is for getting html files of each restaurant using selenium library.  
Note:
- Make sure that the web driver (ChromeDriver) executable exists at the path given by `CHROME_DRIVER`, which is defined at the top of this notebook.
- HTML files are saved in the "html" folder, in one sub-folder per restaurant ID.

In [27]:
class HtmlGetter():
    """Save the review section of every review page of one restaurant as HTML.

    Starting from `url`, each page is loaded with the given Selenium `driver`,
    the review list is extracted and written to "./html/<restaurantId>/", and
    the "next" link is followed until none is left.
    """

    def __init__(self, url, driver):
        """
        Args:
            url: absolute URL of the restaurant's first review page.
            driver: Selenium WebDriver used to load the pages.
        """
        self.url = url
        self.driver = driver
        self.restaurantId = self.getRestaurantId()
        # part of the URL after BASE_URL; used as the file name of the saved page
        self.relativeUrl = self.url.replace(BASE_URL, "")
        self.hasNextPage = True
        # parsed HTML of the most recently loaded page (None until a page is loaded)
        self.soup = None

    def getRestaurantId(self):
        """Return the digits of the "-d<digits>-" part of `self.url`, or None if absent."""
        match = re.search(r'-d(\d+)-', self.url)
        return match.group(1) if match else None

    def saveAllHtml(self):
        """Save the current page and every following page until there is no next page."""
        # saveEachHtml() advances self.url / self.hasNextPage itself, so no
        # additional checkNextPage() call is needed here.
        while self.hasNextPage:
            self.saveEachHtml()

    def saveEachHtml(self):
        """Load `self.url`, save its review section to disk and advance to the next page.

        When the review container is not found in the page, nothing is written
        for this page and a message is printed instead of failing.
        """
        # access url
        self.driver.get(self.url)
        self.driver.refresh()

        # wait to avoid overload for server
        time.sleep(1)

        # click "More" to show full text
        # (find_elements(By..., ...) replaces the removed find_elements_by_* helpers)
        more_links = self.driver.find_elements(By.CSS_SELECTOR, 'span.taLnk.ulBlueLinks')
        if len(more_links) > 0:
            more_links[0].click()

        # wait to avoid overload for server
        time.sleep(1)

        # extract review part
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        reviews = self.soup.find(id="taplc_location_reviews_list_resp_rr_resp_0")

        if reviews is None:
            print(f"No review section found: {self.url}")
        else:
            # delete img link
            for img_tag in reviews.find_all("img"):
                img_tag.decompose()

            # save html; the leading "/" of the relative URL is dropped so the
            # file name is a plain path component below dir_path
            dir_path = f"./html/{self.restaurantId}"
            os.makedirs(dir_path, exist_ok=True)
            filename = os.path.join(dir_path, self.relativeUrl.lstrip("/"))
            with open(filename, "w", encoding="utf_8_sig") as f:
                f.write(reviews.prettify())

        # check if next page exists
        self.checkNextPage()

    def checkNextPage(self):
        """Point `self.url` at the next review page, or clear `self.hasNextPage`.

        The next page is taken from the "next" button of the last parsed page.
        A missing page, a missing button, or a button whose href is absent or
        empty all mean there is no further page.
        """
        next_button = None
        if self.soup is not None:
            next_button = self.soup.find("a", class_="nav next ui_button primary")

        href = next_button.get('href') if next_button is not None else None
        if not href:
            self.hasNextPage = False
        else:
            self.relativeUrl = href
            self.url = BASE_URL + self.relativeUrl

## load restaurants list

In [28]:
# reload the saved restaurant table and pull out each restaurant's relative link
rl = pd.read_csv("./restaurantList.csv")
html_list = rl["html"].tolist()

## save html files

In [29]:
# set web driver: headless Chrome, started from the ChromeDriver binary at CHROME_DRIVER
# NOTE(review): recent Selenium 4 releases reportedly no longer accept `executable_path`
# and expect a Service object instead — confirm against the installed Selenium version.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options, executable_path=CHROME_DRIVER)

In [36]:
# iterate over restaurants
total = len(html_list)  # loop-invariant, so computed once before the loop
for i, html in enumerate(html_list):
    # A restaurant saved without a link is expected to come back from the CSV as
    # NaN (a float) rather than a string; skip it instead of failing on BASE_URL + html.
    if isinstance(html, str):
        # set url
        url = BASE_URL + html

        # scraping all review pages
        hg = HtmlGetter(url, driver)
        hg.saveAllHtml()

    # display progress
    print(f"\rDone: {i+1}/{total}", end="", flush=True)

Done: 11/11

In [37]:
# quit web driver: ends the browser session started in the "set web driver" cell
driver.quit()