In [None]:
# Webscraping Rotten Tomatoes Critic Reviews (Python 3 + Selenium)
# Updated 2026 to work with RT's current JavaScript-rendered site.
#
# Requirements:
#   pip install selenium beautifulsoup4 pandas lxml webdriver-manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, ElementClickInterceptedException
)
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os

In [None]:
# ── Configuration ──────────────────────────────────────────────
# Change this slug to scrape a different movie.
# The slug is the part after /m/ in the RT URL, e.g.:
#   https://www.rottentomatoes.com/m/the_hurt_locker  →  "the_hurt_locker"

MOVIE_SLUG = "the_hurt_locker"

# Where to save the output CSV: the kernel's current working directory.
# Jupyter normally starts the kernel in the notebook's own folder, so this is
# usually the notebook's directory -- but it follows os.chdir() and the launch
# location. (`__file__` is generally not defined inside a notebook, so the
# notebook's path cannot be queried directly.)
OUTPUT_DIR = os.getcwd()

# True runs Chrome headless (no window). Set to False to run Chrome visibly,
# which is useful for debugging selectors.
HEADLESS = True

In [None]:
# ── Helper: create a Selenium Chrome driver ───────────────────

def make_driver(headless=True):
    """Build a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, Chrome is started with ``--headless=new``
            (no visible window). When false, a normal window is opened.

    Returns:
        A ``webdriver.Chrome`` instance. The chromedriver binary path is
        obtained from ``ChromeDriverManager().install()``.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    )

    # Flags are applied in this order; the headless flag (if any) goes first.
    flags = ["--headless=new"] if headless else []
    flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        user_agent_flag,
    ]

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

In [None]:
# ── Step 1: Load the page and discover the current HTML structure ──
# Run this cell first to see what the rendered page looks like.
# If RT changes their layout, inspect this output to update selectors below.

url = f"https://www.rottentomatoes.com/m/{MOVIE_SLUG}/reviews"
print(f"Loading: {url}")

driver = make_driver(headless=HEADLESS)
try:
    driver.get(url)
except Exception:
    # Navigation failed: close the browser so no orphaned Chrome is left behind.
    driver.quit()
    raise

# Class-name fragments that a review container might carry. The same list is
# used for the explicit wait and for the diagnostic probing below.
candidate_class_tags = ["review-row", "critic-review", "review_table_row", "review-card"]

# Wait for reviews to render (up to 15 seconds). An explicit wait returns as
# soon as a candidate element is present instead of always sleeping a fixed time.
wait_selector = ", ".join(
    [f"[class*='{fragment}']" for fragment in candidate_class_tags]
    + ["[data-qa='critic-review']"]
)
try:
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
    )
except TimeoutException:
    # Not fatal: this cell exists to inspect the page, so continue and show
    # whatever was rendered.
    print("Timed out after 15s waiting for a known review element; inspecting page as-is.")

# Save a snapshot of the rendered page source for inspection
soup = BeautifulSoup(driver.page_source, "lxml")

# Print a summary of what we find
review_rows = soup.find_all("div", class_="review-row")
print(f"Found {len(review_rows)} review rows with class 'review-row'")

# Try alternative selectors if the above didn't work
if not review_rows:
    # Look for any elements whose class contains a review-like fragment
    for tag in candidate_class_tags:
        found = soup.find_all(attrs={"class": lambda c: c and tag in c})
        if found:
            print(f"Found {len(found)} elements matching class containing '{tag}'")

    # Also list every data-qa attribute value present on the page, as a hint
    # for choosing new selectors.
    data_qa_elements = soup.find_all(attrs={"data-qa": True})
    qa_values = set(el["data-qa"] for el in data_qa_elements)
    print(f"\ndata-qa attributes found: {sorted(qa_values)}")

print("\n--- First 5000 chars of page body for inspection ---")
body = soup.find("body")
print(body.get_text()[:5000] if body else "No body found")

In [None]:
# ── Step 2: Click "Load More" to get all reviews ─────────────
# RT uses cursor-based pagination with a "Load More" button.
# This cell clicks it repeatedly until all reviews are loaded.

def _find_load_more_button(driver):
    """Return the first 'Load More' candidate element, or None if none exists."""
    # Only valid CSS belongs here. The Playwright-style "button:has-text(...)"
    # pseudo-class is not CSS: Selenium rejects it with an invalid-selector
    # error, which the NoSuchElementException handler below is not written for.
    # Text matching is handled by the fallback instead.
    for selector in (
        "button[data-qa='dlp-load-more-button']",
        "button.load-more-button",
        "rt-button[data-loadmore]",
    ):
        try:
            return driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            continue

    # Fallback: any <button> whose text mentions "Load More" or "Show More".
    for btn in driver.find_elements(By.TAG_NAME, "button"):
        label = btn.text.lower()  # read once: each .text is a WebDriver round trip
        if "load more" in label or "show more" in label:
            return btn
    return None


def load_all_reviews(driver, max_clicks=200, pause_range=(0.5, 2.0)):
    """Click the 'Load More' button until all reviews are loaded.

    Args:
        driver: Selenium WebDriver already showing the reviews page.
        max_clicks: Upper bound on the number of clicks, as a safety stop.
        pause_range: ``(min, max)`` seconds; a random pause drawn uniformly
            from this range follows every click.

    Returns:
        The number of clicks performed.

    This is best-effort: any exception raised while locating or clicking the
    button is printed and ends the loop; it is not re-raised.
    """
    clicks = 0
    while clicks < max_clicks:
        try:
            load_more = _find_load_more_button(driver)
            if load_more is None or not load_more.is_displayed():
                print(f"No more 'Load More' button found after {clicks} clicks. All reviews loaded.")
                break

            # Scroll to the button and click it
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
            time.sleep(0.3)
            try:
                load_more.click()
            except ElementClickInterceptedException:
                # Another element covers the button; click through JavaScript.
                driver.execute_script("arguments[0].click();", load_more)

            clicks += 1
            if clicks % 10 == 0:
                print(f"  Clicked 'Load More' {clicks} times...")

            time.sleep(random.uniform(*pause_range))

        except Exception as e:
            print(f"Stopped after {clicks} clicks: {e}")
            break
    else:
        # Loop ended without a break: the click cap was hit.
        print(f"Reached max_clicks={max_clicks}; more reviews may remain unloaded.")

    return clicks

total_clicks = load_all_reviews(driver)
print(f"Done. Clicked 'Load More' {total_clicks} times total.")

In [None]:
# ── Step 3: Parse all critic reviews from the rendered page ───

def _text_of(element):
    """Return the stripped text of a parsed element, or "" when it is None."""
    return element.get_text(strip=True) if element else ""


def _class_contains(fragment):
    """Build a class filter matching class values that contain `fragment` (case-insensitive)."""
    return lambda c: c and fragment in c.lower()


def _detect_sentiment(row):
    """Classify one review row as "Fresh", "Rotten", or "" when undetermined."""
    sent_el = row.find(attrs={"data-qa": lambda v: v and "tomatometer" in v.lower()})
    if sent_el:
        return "Fresh" if "fresh" in str(sent_el).lower() else "Rotten"

    # Fallback: scan the row's raw markup. Remove the site's own name first,
    # otherwise any URL or label containing "rottentomatoes" would make the
    # substring test below report "Rotten".
    markup = str(row).lower().replace("rottentomatoes", "").replace("rotten tomatoes", "")
    if "rotten" in markup:
        return "Rotten"
    if "fresh" in markup:
        return "Fresh"
    return ""


def _parse_review_row(row, review_id):
    """Extract one review dict from a 'review-row' element, trying several selectors per field."""
    critic_el = (
        row.find("a", class_=_class_contains("critic"))
        or row.find("a", attrs={"data-qa": "review-critic-link"})
        or row.find("rt-link", attrs={"slot": "criticName"})
    )
    pub_el = (
        row.find("a", class_=_class_contains("publication"))
        or row.find("em")
        or row.find("rt-link", attrs={"slot": "publicationName"})
    )
    date_el = (
        row.find("span", class_=_class_contains("date"))
        or row.find(attrs={"data-qa": "review-date"})
        or row.find("rt-text", attrs={"slot": "displayDate"})
    )
    text_el = (
        row.find("p", class_=_class_contains("review"))
        or row.find("div", class_=_class_contains("review"))
        or row.find(attrs={"data-qa": "review-text"})
        or row.find("rt-text", attrs={"slot": "reviewQuote"})
    )
    return {
        "Review ID": review_id,
        "Critic": _text_of(critic_el),
        "Publication": _text_of(pub_el),
        "Date": _text_of(date_el),
        "Review Text": _text_of(text_el),
        "Sentiment": _detect_sentiment(row),
    }


def _parse_positional_row(row, review_id):
    """Build a review dict from a row's text fragments by position (no sentiment)."""
    texts = [t.strip() for t in row.stripped_strings]
    return {
        "Review ID": review_id,
        "Critic": texts[0] if len(texts) > 0 else "",
        "Publication": texts[1] if len(texts) > 1 else "",
        "Date": texts[-1] if len(texts) > 2 else "",
        "Review Text": " ".join(texts[2:-1]) if len(texts) > 3 else "",
        "Sentiment": "",
    }


def parse_critic_reviews(driver):
    """
    Parse critic reviews from the fully-rendered page.

    Strategies are tried in order and the first one that finds any rows wins:
      1. ``div.review-row`` elements, with per-field selector fallbacks.
      2. Elements tagged ``data-qa="critic-review"`` (or, failing that, any
         ``data-qa`` value containing "review"), split positionally into
         critic / publication / text / date.

    Args:
        driver: Selenium WebDriver whose ``page_source`` holds the rendered page.

    Returns:
        A list of dicts with keys "Review ID", "Critic", "Publication",
        "Date", "Review Text" and "Sentiment". The list is empty when no
        strategy matched; in that case run Step 1 again with HEADLESS=False
        to inspect the page visually and update the selectors.
    """
    soup = BeautifulSoup(driver.page_source, "lxml")

    # ── Strategy 1: look for review-row divs ──
    rows = soup.find_all("div", class_="review-row")
    if rows:
        print(f"Strategy 1: Found {len(rows)} 'review-row' elements")
        return [_parse_review_row(row, i) for i, row in enumerate(rows, start=1)]

    # ── Strategy 2: look for elements with data-qa attributes ──
    rows = soup.find_all(attrs={"data-qa": "critic-review"})
    if not rows:
        rows = soup.find_all(attrs={"data-qa": lambda v: v and "review" in v})
    if rows:
        print(f"Strategy 2: Found {len(rows)} data-qa review elements")
        return [_parse_positional_row(row, i) for i, row in enumerate(rows, start=1)]

    # ── No strategy matched: report it and return an empty list ──
    print("WARNING: Could not find reviews with known selectors.")
    print("Run Step 1 with HEADLESS=False and inspect the page manually.")
    print("Then update the selectors in this function.")
    return []

reviews = parse_critic_reviews(driver)
print(f"\nTotal reviews parsed: {len(reviews)}")
if reviews:
    print(f"\nSample review:\n{reviews[0]}")

In [None]:
# ── Step 4: Save to CSV ───────────────────────────────────────

# Explicit column list: keeps the CSV schema stable and guarantees a header
# row even when no reviews were parsed (an empty list would otherwise produce
# a DataFrame, and a CSV, with no columns at all).
REVIEW_COLUMNS = ["Review ID", "Critic", "Publication", "Date", "Review Text", "Sentiment"]

df = pd.DataFrame(reviews, columns=REVIEW_COLUMNS)
if df.empty:
    print("WARNING: no reviews were parsed; the CSV will contain only the header row.")

csv_filename = f"{MOVIE_SLUG}_critic_reviews.csv"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
df.to_csv(csv_path, index=False, encoding="utf-8")
print(f"Saved {len(df)} reviews to: {csv_path}")
df.head(10)

In [None]:
# ── Step 5: Clean up ──────────────────────────────────────────

# End the WebDriver session created in Step 1. Run this cell last: Steps 2
# and 3 read from the live `driver`, so it must stay open until parsing is done.
driver.quit()
print("Browser closed.")