In [322]:
import os
import time
import requests
try:
    import pyarrow
except:
    %pip install pyarrow
    import pyarrow
try:
    import bs4
    from bs4 import BeautifulSoup
except:
    %pip install beautifulsoup4
    import bs4
    from bs4 import BeautifulSoup
try:
    import pandas as pd
except:
    %pip install pandas
    import pandas as pd

from urllib.parse import urlparse

import re

Collecting pyarrow
  Downloading pyarrow-18.1.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-18.1.0-cp311-cp311-win_amd64.whl (25.1 MB)
   ---------------------------------------- 0.0/25.1 MB ? eta -:--:--
   ------------------------------- -------- 19.9/25.1 MB 104.6 MB/s eta 0:00:01
   ---------------------------------------- 25.1/25.1 MB 99.4 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-18.1.0
Note: you may need to restart the kernel to use updated packages.


In [179]:
from typing import TypedDict

class Restaurant(TypedDict):
    """Record describing one restaurant page, with the keys built by ``page_parser``.

    Mapping annotations use ``dict[K, V]``; the previous ``dict[K:V]`` spelling
    built a generic parameterised by a ``slice`` object, which is not a
    key/value type pair.
    """

    source_page: str                  # href of the page's <link rel="canonical">
    name: str                         # text of the <h1> in the detail header
    claimed: bool                     # True when the header reads 'Page attribuée'
    review_number: int                # review count shown next to the #REVIEWS anchor
    overall_rating: float             # decimal comma converted to a dot before parsing
    ranking: dict[str, int]           # {"rank": ..., "over": ...}
    geographic_location: str          # last word of the ranking sentence
    price_range: str
    address: str
    phone_number: str
    opening_hours: list[str]
    detailed_rating: dict[str, int]   # label -> number, read from the "Filtrer les avis" block
    details: dict[str, list[str]]     # section title -> entries split on capital letters
    travelers_choice: str             # '' when the overview tab has no such element
    photos: dict[str, str]            # <img> alt text -> src
    reviews: list["Reviews"]          # filled in from parse_reviews

class Reviews(TypedDict):
    """Record describing a single review card, with the keys built by ``parse_reviews``.

    ``visit_date`` and ``review_date`` are ``dict[str, str]``; the previous
    ``dict[str:str]`` spelling parameterised ``dict`` with a ``slice`` object
    instead of a key/value type pair.
    """

    n_contrib: int                # reviewer's contribution count; parse_reviews uses a -999 sentinel when absent
    review_score: float           # decimal comma converted to a dot before parsing
    review_title: str
    review_body: str
    visit_date: dict[str, str]    # keys "month" and "year"
    visit_context: str            # last word of the line that follows the review title
    review_date: dict[str, str]   # keys "day", "month" and "year"


In [259]:
# Browser-like request headers sent with every page download.
headers = {
    "User-Agent": "Mozilla/5.0 AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
    # Alternative User-Agent kept for reference:
    # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    # The catch-all media range is "*/*"; the previous value ended with a bare
    # "/", which is not a valid media range.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "fr-FR,fr;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}

# Restaurant pages to download; restaurant_scraper takes the 5th dash-separated
# URL segment (e.g. "L_Argot") as the file name.
urls = [
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d7612326-Reviews-L_Argot-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d949361-Reviews-Le_Casse_Museau-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d14913909-Reviews-BLO_Restaurant-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d4059959-Reviews-Mama_Restaurant_Lyon-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d5539701-Reviews-L_Institut_Restaurant-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d1331945-Reviews-La_Gargotte-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d4993538-Reviews-Le_Boeuf_D_argent-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d1395616-Reviews-Restaurant_Le_Musee-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d12337867-Reviews-Aromatic_Restaurant-Lyon_Rhone_Auvergne_Rhone_Alpes.html",
]

In [308]:
#restaurant_scraper(urls=["https://www.tripadvisor.fr/Restaurant_Review-g187265-d4059959-Reviews-Mama_Restaurant_Lyon-Lyon_Rhone_Auvergne_Rhone_Alpes.html"], save_path="data", headers=headers)



In [328]:
from bs4 import BeautifulSoup
import bs4
from urllib.parse import urljoin

def restaurant_scraper(urls: list[str], save_path: str, headers: dict[str, str]):
    """Download each URL and save the HTML as ``<save_path>/<name>.html``.

    Args:
        urls: Page URLs. The file name is the 5th dash-separated segment of
            each URL, so every URL needs at least five such segments.
        save_path: Destination directory; created when missing.
        headers: HTTP headers sent with every request.

    Raises:
        IndexError: If a URL has fewer than five dash-separated segments.
            Names are computed up front, so this is raised before any download.
    """
    names = [url.split('-')[4] for url in urls]
    os.makedirs(save_path, exist_ok=True)
    for url, name in zip(urls, names):
        filename = os.path.join(save_path, f"{name}.html")
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
            # cannot encode every character the pages contain, such as the
            # narrow no-break space the parser strips from numbers.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(r.text)
        else:
            # Do not store an error page as if it were the restaurant page.
            print(f"Skipping {url}: HTTP status {r.status_code}.")
        # Throttle between requests, after the file handle has been released.
        time.sleep(5)
       
def parse_reviews(restaurant: dict) -> list[Reviews]:
    """Download every review page of a restaurant and parse its review cards.

    Review pages are addressed by inserting an ``or<offset>`` segment as the
    5th dash-separated part of ``restaurant["source_page"]`` (presumably
    TripAdvisor's pagination scheme; confirm against the live site). Offset 0
    is the source page itself.

    Requests use the module-level ``headers`` dict.

    Args:
        restaurant: Mapping with at least ``"source_page"`` (URL) and
            ``"review_number"`` (total review count).

    Returns:
        One dict per review card, in page order. ``n_contrib`` is ``-999``
        when the card shows no contribution count.

    Raises:
        AssertionError: If a page fails to download 10 times in a row.
    """
    REVIEWS_PER_PAGE = 15
    SLEEP_TIME = 10
    MAX_FAILURES = 10

    url_parts = restaurant["source_page"].split("-")
    n_reviews = int(restaurant["review_number"])
    # Ceiling division. The previous conditional expression evaluated to 0
    # pages whenever the count was an exact multiple of the page size.
    n_review_pages = (n_reviews + REVIEWS_PER_PAGE - 1) // REVIEWS_PER_PAGE
    reviews = []

    for n_page in range(1, n_review_pages + 1):
        # Page 1 starts at offset 0, page 2 at 15, and so on.
        offset = REVIEWS_PER_PAGE * (n_page - 1)
        if offset == 0:
            page_url = restaurant["source_page"]
        else:
            # Work on a copy so "orNN" segments do not pile up across pages.
            page_parts = list(url_parts)
            page_parts.insert(4, f"or{offset}")
            page_url = "-".join(page_parts)

        fail = 0
        r = requests.get(page_url, headers=headers)
        while r.status_code != 200:
            fail += 1
            # Give up before sleeping again, not after a final long wait.
            if fail >= MAX_FAILURES:
                raise AssertionError("Failed to recover the reviews 10 times. Please try again later.")
            print(f"request failed {fail} times.")
            time.sleep(SLEEP_TIME * fail)  # linear back-off
            r = requests.get(page_url, headers=headers)

        soup_r = BeautifulSoup(r.text).find_all("div", {"data-automation": "reviewCard"})
        print(f"parsing reviews page {n_page} out of {n_review_pages}.")
        for current_review in soup_r:
            contrib_tag = current_review.select_one("span.b")
            title_tag = current_review.find("div", {"data-test-target": "review-title"})
            # The element after the title holds the visit date and context words.
            visit_words = title_tag.find_next_sibling().text.split()
            review = {
                # Integer sentinel keeps the field type consistent with Reviews.
                "n_contrib": int(contrib_tag.text) if contrib_tag is not None else -999,
                "review_score": float(current_review.select_one('title').text.split()[0].replace(",", ".")),
                "review_title": title_tag.text,
                "review_body": current_review.select('div[data-test-target="review-body"] > span > div > div')[0].text,
                "visit_date": dict(zip(["month", "year"], visit_words[:2])),
                "visit_context": visit_words[-1],
                "review_date": dict(zip(["day", "month", "year"], current_review.find_all('div', recursive=False)[-1].text.split()[2:])),
            }
            reviews.append(review)
        time.sleep(SLEEP_TIME)
    return reviews

In [329]:
def is_URL(path: str) -> bool:
    """Return True when ``path`` parses as a URL with both a scheme and a network location.

    Local paths, including Windows paths such as ``c:\\dir\\file.html`` (a
    scheme but no network location), return False. Input that ``urlparse``
    rejects also returns False.
    """
    try:
        parsed_path = urlparse(path)
    except (ValueError, TypeError, AttributeError):
        # Only parsing failures mean "not a URL"; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return False
    return bool(parsed_path.scheme and parsed_path.netloc)

def get_page(url: str) -> str:
    """Download ``url`` with browser-like headers and return the response text.

    Non-200 responses are retried with a linearly growing wait (5 s, 10 s, ...).

    Args:
        url: Page to download.

    Returns:
        The body of the first response whose status code is 200.

    Raises:
        AssertionError: After 10 consecutive non-200 responses.
    """
    MAX_FAILURES = 10
    headers = {
        "User-Agent": "Mozilla/5.0 AppleWebKit/605.1.15 Version/17.4.1 Safari/605.1.15",
        # "*/*" is the catch-all media range; a bare "/" is not valid.
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Cache-Control": "max-age=0",
    }
    fail = 0
    while True:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r.text
        fail += 1
        if fail >= MAX_FAILURES:
            # The attribute is `status_code`; `r.status.code` raised
            # AttributeError while building this message.
            raise AssertionError(
                f"Failed to grab the page 10 times with error {r.status_code}. Please try again later."
            )
        print(f"Failed to grab the page {fail} times. Waiting {fail * 5}s before retrying.")
        time.sleep(5 * fail)

def get_file(file_path: str) -> str:
    """Return the text content of a saved HTML file.

    The file is decoded as UTF-8. If that fails, it is re-read with the
    platform default encoding, so files written by earlier runs that relied on
    the default still load.

    Args:
        file_path: Path to an existing regular file.

    Raises:
        AssertionError: If ``file_path`` does not exist or is not a file.
    """
    assert os.path.isfile(file_path), f"File {file_path} does not exist or is not a file."
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html = f.read()
    except UnicodeDecodeError:
        # Fallback for files saved with the locale encoding.
        with open(file_path, "r") as f:
            html = f.read()
    return html

def page_parser(file_path: str) -> Restaurant:
    """Parse a TripAdvisor restaurant page into a ``Restaurant`` record, reviews included.

    Args:
        file_path: Either a URL (downloaded with ``get_page``) or a path to a
            saved HTML file (read with ``get_file``).

    Returns:
        A dict with the restaurant fields plus ``"photos"`` and ``"reviews"``.
        ``"reviews"`` comes from ``parse_reviews``, which downloads the review
        pages even when ``file_path`` is a local file.

    NOTE(review): the CSS selectors below (``div.o``, ``div.Wf``,
    ``div.biGQs...``) look like generated class names and are presumably tied
    to one version of the site markup; confirm before reusing on fresh pages.
    """
    if is_URL(file_path):
        print("downloading page")
        html = get_page(file_path)
    else:
        print("recovering file from disk")
        html = get_file(file_path)

    soup = BeautifulSoup(html)
    restaurant_details = soup.find("div", {"data-test-target": "restaurant-detail-info"})

    # Photos: alt text -> image source. A page without the photo viewer
    # section gives an empty dict instead of failing on None.
    photos = soup.find("div", {"data-section-signature": "photo_viewer"})
    photos_dict = {}
    if photos is not None:
        for img in photos.find_all("img", alt=lambda x: x and x.strip()):
            photos_dict[img["alt"]] = img["src"]

    # Travelers' choice label; empty when the overview section or label is absent.
    overview_tabs = soup.find("div", {"data-automation": "OVERVIEW_TAB_ELEMENT"})
    if overview_tabs is not None:
        travelers_choice_tags = overview_tabs.select("div.biGQs._P.pZUbB.KxBGd > div")
    else:
        travelers_choice_tags = []

    # Label / number pairs from the "Filtrer les avis" block.
    reviews_ratings = soup.select('div[aria-label="Filtrer les avis"] > div > div')
    categories = [cat.select_one("div.o").text for cat in reviews_ratings]
    # U+202F (narrow no-break space) is removed so the numbers parse as ints.
    numbers = [tag.select_one("div:nth-child(3)").text.replace(u'\u202f', '') for tag in reviews_ratings]
    opening_days = soup.select("div.f.e > div.f.e.Q3 > div.f")

    # Lookups that several fields share.
    header_cells = restaurant_details.select("span > div")
    top_level_divs = restaurant_details.find_all("div", recursive=False)
    ranking_text = restaurant_details.find("b").parent.text

    restaurant = {
        "source_page": soup.find("link", {"rel": "canonical"})["href"],
        "name": restaurant_details.find("h1").text,
        "claimed": header_cells[0].text == 'Page attribuée',
        "review_number": int(soup.find("a", {"href": "#REVIEWS"}).select_one("div > span").text.replace(u"\u202f", "").split()[0]),
        "overall_rating": float(restaurant_details.find("title").text.split()[0].replace(",", ".")),
        # Second word is the rank; fourth word, with thousands separators
        # removed, is the total.
        "ranking": {"rank": int(ranking_text.split()[1]),
                    "over": int(ranking_text.replace(u"\u202f", "").split()[3])},
        "geographic_location": ranking_text.split()[-1],
        "price_range": top_level_divs[1].find_all("span", recursive=False)[2].text.split(",")[0],
        "address": header_cells[3].text.strip(),
        "phone_number": top_level_divs[2].find_all("span", recursive=False)[1].text,
        "opening_hours": [tag.text for line in opening_days for tag in line],
        "travelers_choice": travelers_choice_tags[0].text if travelers_choice_tags else '',

        "detailed_rating": {cat: int(num) for cat, num in zip(categories, numbers)},

        # The sibling text is a run of concatenated entries; it is split before
        # each capital letter and the leading fragment is dropped.
        "details": {tag.text: re.split(r'(?=[A-Z])', tag.find_next_sibling().text)[1:] for tag in soup.select("div > div.Wf")},
        "photos": photos_dict
    }
    reviews = parse_reviews(restaurant=restaurant)
    restaurant["reviews"] = reviews
    return restaurant
    

Load and parse a page directly from TripAdvisor

In [None]:
# Parse one restaurant from its live page, then store the record as a
# single-row parquet file under ./parsed/.
restaurant = page_parser(
    "https://www.tripadvisor.fr/Restaurant_Review-g187265-d7612326-Reviews-L_Argot-Lyon_Rhone_Auvergne_Rhone_Alpes.html"
)
df = pd.DataFrame.from_dict(restaurant, orient="index").T
os.makedirs("parsed", exist_ok=True)
df.to_parquet(f"./parsed/{restaurant['name']}.parquet")

Load the saved HTML files from the `PATH_TO_HTML` folder

In [327]:
# Folder holding the pages saved by restaurant_scraper.
PATH_TO_HTML = "data"

# Absolute paths of the regular files found directly inside PATH_TO_HTML.
files = [
    os.path.join(os.getcwd(), PATH_TO_HTML, entry)
    for entry in os.listdir(PATH_TO_HTML)
    if os.path.isfile(os.path.join(PATH_TO_HTML, entry))
]
files

['c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\Aromatic_Restaurant.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\BLO_Restaurant.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\La_Gargotte.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\Le_Boeuf_D_argent.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\Le_Casse_Museau.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\L_Argot.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\L_Institut_Restaurant.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\Mama_Restaurant_Lyon.html',
 'c:\\Users\\Joel\\Documents\\M2 Sise\\M2 SISE - NLP Text Mining\\Projet\\data\\Restaurant_Le_Musee.html']

In [313]:
# Parse the first saved page and write the record to
# "<restaurant name>.parquet" in the working directory.
restaurant = page_parser(files[0])
df = pd.DataFrame.from_dict(restaurant, orient="index").T
df.to_parquet(f"{restaurant['name']}.parquet")

recovering file from disk
