# Rate My Beer Scraper

In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [3]:
# Machine-specific locations of the ChromeDriver executable and the Chrome
# binary. Both names are reused when the headless driver is created later on.
driver_path = "/Users/ramzikattan/Downloads/chromedriver-mac-arm64/chromedriver"
chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

# Configure Chrome options
chrome_options = Options()
chrome_options.binary_location = chrome_path

# Set up the Chrome WebDriver service
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Fetch the rendered top-beers page. The browser is closed in `finally` so a
# failed navigation does not leave a Chrome process running.
url = "https://www.ratebeer.com/top-beers?time=all"
try:
    driver.get(url)
    time.sleep(5)  # Fixed pause to give the page time to render
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Collect every <a> carrying the "MuiTypography-root" class. `href=True`
# restricts the match to anchors that actually have an href, so the lookup
# below cannot raise KeyError.
# NOTE(review): this selector is broader than just beer links; the next cell
# narrows the list by position.
beer_links = soup.find_all('a', class_="MuiTypography-root", href=True)

# Extract the visible text and absolute URL for each anchor
beers = []
for beer in beer_links:
    name = beer.get_text(strip=True)  # Anchor text (may be empty)
    link = beer['href']  # Site-relative path taken from the anchor
    full_link = "https://www.ratebeer.com" + link  # Construct full URL
    beers.append({'name': name, 'link': full_link})

# Print the extracted beer names and their URLs
#for beer in beers:
 #   print(f"Beer: {beer['name']} - URL: {beer['link']}")

In [4]:
# Narrow the scraped anchors down to the beer entries.
# NOTE(review): positions 28..127 are presumably where the ranked beers sit
# among all matched anchors on the page — confirm if the page layout changes.
candidate_beers = beers[28:128]

# Drop entries whose anchor text was empty
cleaned_beers = [entry for entry in candidate_beers if entry['name']]

In [7]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

# Build a fresh set of Chrome options for the review scrape; this run is
# headless, reusing the browser binary path defined earlier (chrome_path).
chrome_options = Options()
chrome_options.binary_location = chrome_path

# Command-line switches passed to Chrome, in the order they are applied
for chrome_flag in (
    '--headless',               # Run without a visible browser window
    '--disable-gpu',            # Disable GPU acceleration
    '--no-sandbox',             # Added for execution in certain environments
    '--disable-dev-shm-usage',  # Avoid issues with shared memory
):
    chrome_options.add_argument(chrome_flag)

# Service wrapper around the ChromeDriver executable (driver_path)
service = Service(driver_path)

# Start the headless Chrome session used by the scraping functions below
driver = webdriver.Chrome(options=chrome_options, service=service)

def click_show_more_buttons():
    """Expand every visible "Show more" toggle on the page loaded in ``driver``.

    Uses the module-level ``driver``. Each matching element is clicked through
    JavaScript, followed by a one-second pause so the expanded text can render.
    This is best-effort: a Selenium failure on one button (for example an
    element that went stale after an earlier click re-rendered the page) skips
    that button and moves on. Non-Selenium errors, including
    ``KeyboardInterrupt``, are no longer swallowed.
    """
    # Local import keeps this function self-contained; the module is cached
    # after the first call, so repeated calls cost nothing measurable.
    from selenium.common.exceptions import WebDriverException

    # Matches the label <span> inside each "Show more" button
    show_more_xpath = '//span[contains(@class, "MuiButton-label") and text()="Show more"]'

    for button in driver.find_elements(By.XPATH, show_more_xpath):
        try:
            if not button.is_displayed():
                continue
            # Click via JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", button)
        except WebDriverException:
            # Covers stale elements, script errors, etc. — skip this button
            continue
        time.sleep(1)  # Wait for the content to expand

def scrape_reviews():
    """Collect the reviews visible on the page currently loaded in ``driver``.

    Returns a list of ``{'rating': ..., 'message': ...}`` dicts, one per review
    message element, in document order. Ratings are matched to messages purely
    by position; a message with no rating element at the same index gets
    ``None`` as its rating.
    """
    # Elements carrying both the "bRPQdN" and "MuiTypography-subtitle1" classes.
    # NOTE(review): "bRPQdN" looks like a generated class name — adjust if the
    # page's structure changes.
    rating_locator = '//*[contains(concat( " ", @class, " " ), concat( " ", "bRPQdN", " " )) and contains(concat( " ", @class, " " ), concat( " ", "MuiTypography-subtitle1", " " ))]'
    # Elements carrying both the "pre-wrap" and "MuiTypography-body1" classes
    message_locator = '//*[contains(concat( " ", @class, " " ), concat( " ", "pre-wrap", " " )) and contains(concat( " ", @class, " " ), concat( " ", "MuiTypography-body1", " " ))]'

    rating_elements = driver.find_elements(By.XPATH, rating_locator)
    message_elements = driver.find_elements(By.XPATH, message_locator)
    rating_count = len(rating_elements)

    # Pair each message with the rating at the same index, if there is one
    return [
        {
            'rating': rating_elements[position].text if position < rating_count else None,
            'message': message_element.text,
        }
        for position, message_element in enumerate(message_elements)
    ]

def scrape_beer_reviews(beer_name, url, review_limit=250):
    """Scrape up to ``review_limit`` reviews for one beer, following pagination.

    Args:
        beer_name: Label used only in the progress message.
        url: Address of the beer's page, opened in the module-level ``driver``.
        review_limit: Maximum number of reviews to return (default 250).

    Returns:
        A list of review dicts as produced by ``scrape_reviews()``, truncated
        to at most ``review_limit`` entries.

    Selenium failures while dismissing the cookie banner or paging are treated
    as "banner absent" / "no more pages". Other exceptions (including
    ``KeyboardInterrupt``) propagate instead of being silently swallowed.
    "Finished scraping ..." is printed only when pagination runs out, not when
    the review limit is what stops the loop.
    """
    # Local import keeps this function self-contained; the module is cached
    # after the first call.
    from selenium.common.exceptions import WebDriverException

    all_reviews = []

    driver.get(url)
    time.sleep(2)  # Let the page load fully

    # Handle the cookies banner by accepting it.
    # NOTE(review): once the banner has been accepted, later calls still wait
    # the full 10 s for it before giving up.
    try:
        accept_cookies_id = 'onetrust-accept-btn-handler'
        accept_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, accept_cookies_id))
        )
        accept_button.click()
        time.sleep(2)
    except WebDriverException:
        # TimeoutException (banner never appeared) and click failures both
        # land here; scraping continues regardless.
        print("No cookies banner found or failed to dismiss.")

    # Loop-invariant locator for the pagination control
    next_button_xpath = '//button[@aria-label="Next page" and contains(@class, "MuiIconButton-root")]'

    # Loop through pages and scrape reviews until the limit is reached
    while len(all_reviews) < review_limit:
        # Expand truncated reviews before reading their text
        click_show_more_buttons()

        # Scrape reviews on the current page
        all_reviews.extend(scrape_reviews())

        # Check if we've hit the review limit
        if len(all_reviews) >= review_limit:
            all_reviews = all_reviews[:review_limit]  # Truncate to exactly the review limit
            break

        # Find the "Next" button and move to the next page
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, next_button_xpath))
            )
            next_button.click()
            time.sleep(2)  # Wait for the next page to load
        except WebDriverException:
            # No clickable "Next" button within 10 s: treat as the last page
            print(f"Finished scraping {beer_name}.")
            break

    return all_reviews

# Seed frame that fixes the leading column order of the final table
df = pd.DataFrame(columns=['Beer Name', 'URL', 'Rating', 'Review'])

# One DataFrame per beer. They are concatenated once after the loop instead of
# re-copying the whole growing table on every iteration. If the loop fails
# part-way, the frames gathered so far remain available here.
beer_frames = []

try:
    # Iterate through each beer in the cleaned_beers list
    for beer in cleaned_beers:
        beer_name = beer['name']
        beer_url = beer['link']

        print(f"Scraping reviews for {beer_name} at {beer_url}...")

        # Scrape reviews for the current beer with a limit of 250 reviews
        reviews = scrape_beer_reviews(beer_name, beer_url, review_limit=250)

        # Tag this beer's reviews with its name and URL
        beer_df = pd.DataFrame(reviews)
        beer_df['Beer Name'] = beer_name
        beer_df['URL'] = beer_url
        beer_frames.append(beer_df)
finally:
    # Always release the browser, even if a beer raised mid-run
    driver.quit()

# NOTE(review): the review dicts use the keys 'rating' and 'message', so the
# seeded 'Rating' and 'Review' columns stay empty and the data lands in the
# lowercase columns. The schema is left as-is here in case later cells read
# the lowercase names — confirm and rename if they do not.
df = pd.concat([df, *beer_frames], ignore_index=True)

# Save the DataFrame to a CSV file
output_path = 'beer_reviews.csv'
df.to_csv(output_path, index=False, encoding='utf-8')

# Report the file that was actually written
print(f"Scraping completed. Data saved to '{output_path}'")


Scraping reviews for Toppling Goliath Kentucky Brunch🇺🇸Stout - Imperial Flavored / Pastry at https://www.ratebeer.com/beer/toppling-goliath-kentucky-brunch/166019/...
Finished scraping Toppling Goliath Kentucky Brunch🇺🇸Stout - Imperial Flavored / Pastry.
Scraping reviews for Närke Kaggen Stormaktsporter🇸🇪Stout - Imperial at https://www.ratebeer.com/beer/naerke-kaggen-stormaktsporter/58057/...
No cookies banner found or failed to dismiss.
Scraping reviews for Schramm's The Heart of Darkness🇺🇸Mead - Melomel / Fruited at https://www.ratebeer.com/beer/schramm-s-the-heart-of-darkness/231441/...
No cookies banner found or failed to dismiss.
Finished scraping Schramm's The Heart of Darkness🇺🇸Mead - Melomel / Fruited.
Scraping reviews for Westvleteren 12🇧🇪Quadrupel / Abt at https://www.ratebeer.com/beer/westvleteren-12/4934/...
No cookies banner found or failed to dismiss.
Scraping reviews for B. Nektar Ken Schramm Signature Series - The Heart of Darkness🇺🇸Mead - Melomel / Fruited at https://w