In [None]:
#pip install --upgrade selenium urllib3
#pip install --upgrade webdriver-manager

In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Gather information on vegetarian restaurants in Paris from the Yelp site (yelp.fr).

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import os
import re  # Added for regular expressions

def _element_text(driver, css_selector):
    """Return the stripped text of the first element matching ``css_selector``.

    Whatever ``driver.find_element`` raises when nothing matches is propagated
    to the caller, which decides how to handle a missing field.
    """
    return driver.find_element(By.CSS_SELECTOR, css_selector).text.strip()


def extract_restaurant_details(driver, link, page_load_delay=3):
    """
    Open a restaurant page and extract its details.

    Args:
        driver: Selenium WebDriver (or compatible object) used to load the page.
        link: URL of the restaurant page.
        page_load_delay: Seconds to sleep after ``driver.get`` before reading
            the page. Defaults to 3.

    Returns:
        dict with the keys "Name", "Rating", "Reviews_Count", "Category",
        "Address" (each a string, "N/A" when the value could not be read) and
        "Reviews" (a list of up to 5 review texts, empty when none were found).

    Ordinary errors are handled per field so one missing element does not
    discard the rest of the record. Only ``Exception`` subclasses are caught:
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so a long scraping run
    can still be interrupted.
    """
    # NOTE(review): the "y-css-*" / "*__09f24__*" class names look auto-generated
    # and may change when the site is redeployed -- verify before a new run.
    review_selector = "p.comment__09f24__D0cxf span.raw__09f24__T4Ezm"

    details = {
        "Name": "N/A",
        "Rating": "N/A",
        "Reviews_Count": "N/A",
        "Category": "N/A",
        "Address": "N/A",
        "Reviews": []
    }
    try:
        driver.get(link)
        time.sleep(page_load_delay)  # Give the page time to render

        # Name: failure is reported, the default "N/A" is kept
        try:
            details["Name"] = _element_text(driver, "h1.y-css-olzveb")
        except Exception as e:
            print(f"Error extracting name: {e}")

        # Rating: kept only when the text is a plain number such as "4" or "4.5"
        try:
            rating_text = _element_text(driver, "span.y-css-1jz061g")
            if re.fullmatch(r"\d+(\.\d+)?", rating_text):
                details["Rating"] = rating_text
        except Exception:
            details["Rating"] = "N/A"

        # Review count: drop parentheses, keep the first token if it is all digits
        try:
            count_text = _element_text(driver, "a.y-css-1ijjqcc")
            first_token = count_text.replace("(", "").replace(")", "").split()[0]
            if first_token.isdigit():
                details["Reviews_Count"] = first_token
        except Exception:
            # Also reached when the text is empty (split() yields no tokens)
            details["Reviews_Count"] = "N/A"

        # Category: failure is reported, the default "N/A" is kept
        try:
            details["Category"] = _element_text(driver, "span.y-css-1cafv3i")
        except Exception as e:
            print(f"Error extracting category: {e}")

        # Address: silently falls back to "N/A"
        try:
            details["Address"] = _element_text(driver, "p.y-css-jbomhy")
        except Exception:
            details["Address"] = "N/A"

        # Reviews: wait up to 10 s for them to appear, then keep at most 5.
        # The list is assigned only once fully built, so a failure part-way
        # through leaves "Reviews" empty rather than half-filled.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, review_selector))
            )
            review_elements = driver.find_elements(By.CSS_SELECTOR, review_selector)
            details["Reviews"] = [element.text.strip() for element in review_elements[:5]]
        except Exception:
            details["Reviews"] = []

    except Exception as e:
        print(f"Error extracting details for {link}: {e}")

    return details

def get_all_restaurant_links(
    driver,
    base_url="https://www.yelp.fr/search?find_desc=Restaurants&find_loc=Paris&cflt=vegetarian",
    page_delay=5,
):
    """Walk through every search-result page and collect restaurant links.

    Args:
        driver: Selenium WebDriver (or compatible object) used for navigation.
        base_url: Search URL to start from. Defaults to the yelp.fr search for
            vegetarian restaurants in Paris.
        page_delay: Seconds to sleep after clicking the "next" link, giving the
            following page time to load. Defaults to 5.

    Returns:
        list of unique link strings in the order they were first seen.

    Pagination stops when no "a.next-link" element is found, or when an
    ordinary error occurs on a page (the error is printed and the links
    gathered so far are returned). Only ``Exception`` subclasses are caught,
    so ``KeyboardInterrupt`` still aborts the crawl.
    """
    card_selector = "div[data-testid='serp-ia-card']"
    driver.get(base_url)

    links = []
    seen = set()  # Exact-match de-duplication so no page is scraped twice later
    page = 1
    while True:
        print(f"Scraping links on page {page}...")
        try:
            # Wait (up to 10 s) until at least one result card is present
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
            )

            for card in driver.find_elements(By.CSS_SELECTOR, card_selector):
                try:
                    href = card.find_element(By.CSS_SELECTOR, "a.y-css-1ijjqcc").get_attribute("href")
                except Exception:
                    continue  # Card without the expected anchor: skip it
                # get_attribute may return None when the anchor has no href
                link = (href or "").strip()
                if link and link not in seen:
                    seen.add(link)
                    links.append(link)

            # Move to the next page, if there is one
            next_button = driver.find_elements(By.CSS_SELECTOR, "a.next-link")
            if next_button:
                # Click through JavaScript rather than a native element click
                driver.execute_script("arguments[0].click();", next_button[0])
                time.sleep(page_delay)  # Wait after page navigation
                page += 1
            else:
                print("Last page reached.")
                break
        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return links


def save_to_json(data, filename):
    """Serialize ``data`` as indented UTF-8 JSON under the ``output/`` folder.

    The target directory is created when missing, non-ASCII characters are
    written as-is (not escaped), and a confirmation line is printed.
    """
    destination = "output/{}".format(filename)
    target_dir = os.path.dirname(destination)
    os.makedirs(target_dir, exist_ok=True)

    with open(destination, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print("Data saved to " + destination)

def main(chrome_driver_path=None):
    """Run the full scrape: collect restaurant links, then each restaurant's details.

    Args:
        chrome_driver_path: Optional path to a chromedriver executable. When
            omitted, the ``CHROMEDRIVER_PATH`` environment variable is used if
            set; otherwise ``Service()`` is created without a path and Selenium
            is left to locate the driver itself.

    Writes ``restaurant_links.json`` and ``restaurant_details.json`` through
    ``save_to_json``. The details file is written even if the detail loop is
    interrupted, so records already scraped are not lost. The browser is
    always closed on exit.
    """
    # No machine-specific absolute path is baked in; configure it from outside.
    driver_path = chrome_driver_path or os.environ.get("CHROMEDRIVER_PATH")
    service = Service(driver_path) if driver_path else Service()
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Step 1: Collect all restaurant links
        restaurant_links = get_all_restaurant_links(driver)
        save_to_json(restaurant_links, "restaurant_links.json")

        # Step 2: Visit each link to collect detailed information and reviews
        all_restaurant_details = []
        total_links = len(restaurant_links)

        try:
            for idx, link in enumerate(restaurant_links, start=1):
                print(f"Scraping details for restaurant {idx}/{total_links}: {link}")
                restaurant_details = extract_restaurant_details(driver, link)
                all_restaurant_details.append(restaurant_details)
        finally:
            # Save whatever was collected, including partial results after an
            # error or a manual interrupt
            save_to_json(all_restaurant_details, "restaurant_details.json")

    finally:
        driver.quit()


# Start the scrape only when this code runs as the main program (as it does in
# a notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()


Scraping links on page 1...
Scraping links on page 2...
Scraping links on page 3...
Scraping links on page 4...
Scraping links on page 5...
Scraping links on page 6...
Scraping links on page 7...
Scraping links on page 8...
Scraping links on page 9...
Scraping links on page 10...
Scraping links on page 11...
Scraping links on page 12...
Scraping links on page 13...
Scraping links on page 14...
Scraping links on page 15...
Last page reached.
Data saved to output/restaurant_links.json
Data saved to output/restaurant_details.json
