In [None]:
# Webscraping Rotten Tomatoes Critic Reviews (Python 3 + Selenium)
# Updated 2026 to work with RT's current JavaScript-rendered site.
#
# Strategy:
#   1. Use Selenium to load the reviews page and click "Load More"
#   2. Intercept the internal API (XHR/fetch) responses for clean JSON data
#   3. Fall back to HTML parsing if API interception doesn't work
#
# Requirements:
#   pip install selenium beautifulsoup4 pandas lxml webdriver-manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, ElementClickInterceptedException
)
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import random
import re
import os

In [None]:
# ── Configuration ──────────────────────────────────────────────
# Change this slug to scrape a different movie.
# The slug is the part after /m/ in the RT URL, e.g.:
#   https://www.rottentomatoes.com/m/the_hurt_locker  →  "the_hurt_locker"

MOVIE_SLUG = "the_hurt_locker"

# Where to save the output CSV: the kernel's current working directory,
# which is normally the folder the notebook was opened from.
OUTPUT_DIR = os.getcwd()

# True runs Chrome headless (no window). Set to False to watch the browser,
# which is useful for debugging selectors.
HEADLESS = True

In [None]:
# ── Helper: create a Selenium Chrome driver ───────────────────

def make_driver(headless=True, enable_network_logging=True):
    """Build a Chrome WebDriver configured for this scraper.

    Args:
        headless: When true, Chrome is launched with ``--headless=new``.
        enable_network_logging: When true, the ``goog:loggingPrefs``
            capability is set so the "performance" log is recorded at
            level "ALL" (the log the later cells read network events from).

    Returns:
        A ``webdriver.Chrome`` instance whose chromedriver binary is
        resolved through ``ChromeDriverManager().install()``.
    """
    # Headless flag (if any) goes first, followed by the fixed flag set.
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags.extend([
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ])

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    if enable_network_logging:
        # Ask chromedriver to record network activity in the performance log.
        chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(
        service=Service(chromedriver_path),
        options=chrome_options,
    )

In [None]:
# ── Step 1: Load the page and discover its structure ──────────
# This cell loads the reviews page, waits for JS rendering, and then
# inspects both the DOM structure and the network requests RT makes.
# Use this to debug selectors if the scraper breaks after an RT update.

url = f"https://www.rottentomatoes.com/m/{MOVIE_SLUG}/reviews"
print(f"Loading: {url}")

driver = make_driver(headless=HEADLESS)
driver.get(url)
time.sleep(5)  # wait for JS to render

banner = "=" * 60

# ── Network side: list requests whose URL looks review-related ──
print(banner)
print("NETWORK REQUESTS (looking for review-related API calls):")
print(banner)
logs = driver.get_log("performance")
api_urls = set()
for entry in logs:
    try:
        msg = json.loads(entry["message"])["message"]
        if msg["method"] != "Network.requestWillBeSent":
            continue
        req_url = msg["params"]["request"]["url"]
    except (KeyError, json.JSONDecodeError):
        # Entry is not shaped like a request event; ignore it.
        continue
    req_url_lower = req_url.lower()
    if any(kw in req_url_lower for kw in ["review", "napi", "critic"]):
        api_urls.add(req_url)
        print(f"  → {req_url}")
if len(api_urls) == 0:
    print("  (no review-related API calls detected yet)")

# ── DOM side: summarise what the rendered page contains ──
print("\n" + banner)
print("DOM INSPECTION:")
print(banner)
soup = BeautifulSoup(driver.page_source, "lxml")

# How many classic review-row containers are present?
review_rows = soup.find_all("div", class_="review-row")
print(f"  div.review-row: {len(review_rows)}")

# Distinct data-qa attribute values found anywhere in the page
data_qa_elements = soup.find_all(attrs={"data-qa": True})
qa_values = sorted({node["data-qa"] for node in data_qa_elements})
print(f"  data-qa attributes: {qa_values}")

# Tag names starting with "rt-" (RT's custom elements)
all_tags = {node.name for node in soup.find_all(True)}
rt_tags = sorted(name for name in all_tags if name and name.startswith("rt-"))
print(f"  Custom rt-* components: {rt_tags}")

# Every CSS class whose name mentions "review"
review_classes = {
    cls
    for el in soup.find_all(attrs={"class": True})
    for cls in el.get("class", [])
    if "review" in cls.lower()
}
print(f"  Classes containing 'review': {sorted(review_classes)}")

# Leading slice of the page's visible text, as a sanity check
print("\n--- First 3000 chars of visible text ---")
body = soup.find("body")
if body is not None:
    text = body.get_text(separator="\n", strip=True)
    print(text[:3000])

In [None]:
# ── Step 2: Click "Load More" and capture network responses ───
# RT loads reviews via internal API calls (XHR/fetch). Each time we
# click "Load More", new review data is fetched. We capture both
# the API JSON responses AND keep the final rendered HTML as fallback.

def extract_review_json_from_logs(driver):
    """Collect JSON bodies of review-related responses from the performance log.

    Reads the driver's pending "performance" log entries, keeps the
    ``Network.responseReceived`` events whose URL contains "review" or
    "/napi/", and fetches each body with the CDP command
    ``Network.getResponseBody``. Bodies flagged ``base64Encoded`` are decoded
    before being parsed as JSON.

    Args:
        driver: Object exposing ``get_log(name)`` and
            ``execute_cdp_cmd(cmd, params)`` (a Chrome WebDriver).

    Returns:
        A list of ``{"url": <response URL>, "data": <parsed JSON>}`` dicts in
        log order. Responses whose body cannot be fetched, decoded, or parsed
        as JSON are skipped (best effort).
    """
    import base64  # local import: only needed for base64-encoded CDP bodies

    reviews_from_api = []
    for entry in driver.get_log("performance"):
        # 1) Decode the log entry and keep only matching response events.
        try:
            msg = json.loads(entry["message"])["message"]
            if msg["method"] != "Network.responseReceived":
                continue
            resp_url = msg["params"]["response"]["url"]
            if not any(kw in resp_url.lower() for kw in ("review", "/napi/")):
                continue
            request_id = msg["params"]["requestId"]
        except (KeyError, TypeError, json.JSONDecodeError):
            # Malformed or unrelated entry.
            continue

        # 2) Ask Chrome for the body. Deliberately best-effort: any failure
        #    (for example the body is not available) just skips this response.
        try:
            body = driver.execute_cdp_cmd(
                "Network.getResponseBody",
                {"requestId": request_id},
            )
        except Exception:
            continue

        # 3) Decode and parse. HTML pages, images, etc. that matched the URL
        #    filter fail here and are skipped.
        try:
            raw = body.get("body", "{}")
            if body.get("base64Encoded"):
                raw = base64.b64decode(raw).decode("utf-8")
            data = json.loads(raw)
        except (AttributeError, TypeError, ValueError):
            continue

        reviews_from_api.append({"url": resp_url, "data": data})
    return reviews_from_api

def find_load_more_button(driver):
    """Locate the "Load More" control, trying several strategies in order.

    Strategies, first hit wins:
      1. A list of known CSS selectors (element must be displayed).
      2. An XPath match on any <button> whose text contains "load more",
         case-insensitively (element must be displayed).
      3. A scan of every <button>, matching "load more" or "show more" in
         its visible text (element must be displayed).
      4. A scan of every <rt-button> host whose shadow root contains a
         <button> with "load more" in its text; the host is returned.

    Args:
        driver: Selenium WebDriver positioned on the reviews page.

    Returns:
        The matching WebElement, or None when no strategy finds one.
    """
    # 1) Known CSS selectors
    for selector in (
        "button[data-qa='dlp-load-more-button']",
        "rt-button[data-loadmore]",
        "[data-qa='load-more-btn']",
        "button.load-more-button",
        "button.js-load-more-btn",
    ):
        try:
            btn = driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            continue
        if btn.is_displayed():
            return btn

    # 2) XPath: translate() lower-cases exactly the letters of "LOAD MORE"
    try:
        btn = driver.find_element(
            By.XPATH, "//button[contains(translate(., 'LOADMRE', 'loadmre'), 'load more')]"
        )
    except NoSuchElementException:
        btn = None
    if btn is not None and btn.is_displayed():
        return btn

    # 3) Brute force over every <button>
    for btn in driver.find_elements(By.TAG_NAME, "button"):
        txt = btn.text.strip().lower()
        if ("load more" in txt or "show more" in txt) and btn.is_displayed():
            return btn

    # 4) Shadow DOM of <rt-button> hosts. Failures are handled per host so
    #    one host without an inner <button> does not end the scan early.
    try:
        rt_buttons = driver.find_elements(By.CSS_SELECTOR, "rt-button")
    except Exception:
        rt_buttons = []
    for rt_btn in rt_buttons:
        try:
            shadow = driver.execute_script("return arguments[0].shadowRoot", rt_btn)
            if not shadow:
                continue
            inner_btn = shadow.find_element(By.CSS_SELECTOR, "button")
            if "load more" in inner_btn.text.strip().lower():
                return rt_btn  # click the outer rt-button
        except Exception:
            # Best effort: this host is unusable, try the next one.
            continue

    return None

def load_all_reviews(driver, max_clicks=200, pause_range=(1.0, 2.5)):
    """Click "Load More" repeatedly and collect the API responses it triggers.

    Args:
        driver: Selenium WebDriver positioned on the reviews page.
        max_clicks: Upper bound on the number of clicks performed.
        pause_range: ``(low, high)`` seconds; a random pause in this range
            follows every click so the page has time to fetch and render.

    Returns:
        A tuple ``(clicks, all_api_data)``: the number of clicks performed and
        the list of ``{"url", "data"}`` dicts gathered by
        ``extract_review_json_from_logs``.
    """
    all_api_data = []
    clicks = 0

    while clicks < max_clicks:
        load_more = find_load_more_button(driver)

        if load_more is None:
            print(f"No more 'Load More' button after {clicks} clicks. All reviews loaded.")
            break

        # Centre the button in the viewport, then click it.
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more)
        time.sleep(0.3)
        try:
            load_more.click()
        except ElementClickInterceptedException:
            # Something overlays the button; dispatch the click via JS instead.
            driver.execute_script("arguments[0].click();", load_more)

        clicks += 1
        if clicks % 10 == 0:
            print(f"  Clicked 'Load More' {clicks} times...")

        time.sleep(random.uniform(*pause_range))

        # Capture any API responses logged so far.
        all_api_data.extend(extract_review_json_from_logs(driver))
    else:
        # Loop ended because the click budget ran out, not because the
        # button disappeared.
        print(f"Stopped after reaching max_clicks={max_clicks}; more reviews may remain.")

    # Final drain: picks up responses logged after the last in-loop capture,
    # and covers the case where no click was needed at all.
    all_api_data.extend(extract_review_json_from_logs(driver))

    return clicks, all_api_data

total_clicks, captured_api_data = load_all_reviews(driver)
print(f"\nDone. Clicked 'Load More' {total_clicks} times.")
print(f"Captured {len(captured_api_data)} API responses containing review data.")

# Preview up to five captured endpoints and, for dict payloads, their keys
preview = captured_api_data[:5]
if preview:
    print("\nCaptured API endpoints:")
for item in preview:
    payload = item["data"]
    print(f"  → {item['url']}")
    if isinstance(payload, dict):
        print(f"    Keys: {list(payload)[:10]}")

In [None]:
# ── Step 3: Parse reviews (API JSON → HTML fallback) ──────────

def _name_field(record, flat_key, nested_key):
    """Return record[flat_key], else the name under record[nested_key], else ""."""
    flat = record.get(flat_key)
    if flat:
        return flat
    nested = record.get(nested_key)
    if isinstance(nested, dict):
        return nested.get("name") or ""
    # A plain string is used as-is; None / missing becomes "".
    return nested or ""


def parse_reviews_from_api(api_data_list):
    """
    Extract structured reviews from captured API JSON responses.

    The field names are guesses at what RT's internal API may return, so
    several alternatives are tried for each column.

    Args:
        api_data_list: Iterable of ``{"url": ..., "data": <parsed JSON>}``
            dicts. ``data`` may be a list of review dicts, or a dict holding
            such a list under "reviews", "items", "results", "data",
            "critics" or "rows", or one level deeper under "reviews",
            "items" or "results".

    Returns:
        A list of dicts with keys "Critic", "Publication", "Date",
        "Review Text", "Sentiment", "Original Score", "Review URL" and
        "Top Critic". Entries are de-duplicated on (critic, first 50 chars of
        the review text). Records with no critic, publication or review text
        are dropped, so unrelated JSON lists do not produce blank rows.
    """
    reviews = []
    seen = set()

    for item in api_data_list:
        data = item["data"]

        # The API response might have reviews at the top level or nested
        review_lists = []
        if isinstance(data, list):
            review_lists.append(data)
        elif isinstance(data, dict):
            # Check common keys where reviews might be stored
            for key in ["reviews", "items", "results", "data", "critics", "rows"]:
                if isinstance(data.get(key), list):
                    review_lists.append(data[key])
            # Also check one level down, e.g. data["x"]["reviews"]
            for val in data.values():
                if isinstance(val, dict):
                    for subkey in ["reviews", "items", "results"]:
                        if isinstance(val.get(subkey), list):
                            review_lists.append(val[subkey])

        for review_list in review_lists:
            for r in review_list:
                if not isinstance(r, dict):
                    continue

                # Flat "...Name" key first, then a nested {"name": ...}
                # object, then a plain string under the short key.
                critic = _name_field(r, "criticName", "critic")
                publication = _name_field(r, "publicationName", "publication")
                review_text = (
                    r.get("quote") or r.get("reviewText") or r.get("text")
                    or r.get("review") or ""
                )

                # Not review-shaped at all: skip instead of emitting a blank row.
                if not (critic or publication or review_text):
                    continue

                date = (
                    r.get("creationDate") or r.get("date") or r.get("reviewDate")
                    or r.get("publicationDate") or ""
                )
                sentiment = r.get("sentiment") or r.get("tomatometerState") or ""
                if not sentiment:
                    score_val = r.get("score") or r.get("rating") or r.get("originalScore")
                    if isinstance(score_val, str) and ("fresh" in score_val.lower()):
                        sentiment = "Fresh"
                    elif isinstance(score_val, str) and ("rotten" in score_val.lower()):
                        sentiment = "Rotten"
                    # Boolean flags, when present, take precedence over the
                    # score-derived guess.
                    if r.get("isFresh") or r.get("isPositive"):
                        sentiment = "Fresh"
                    elif r.get("isRotten") or r.get("isNegative"):
                        sentiment = "Rotten"

                original_score = r.get("originalScore") or r.get("score") or ""
                review_url = r.get("url") or r.get("reviewUrl") or r.get("link") or ""
                is_top_critic = r.get("isTopCritic", False)

                # De-duplicate by (critic, review_text[:50])
                dedup_key = (str(critic), str(review_text)[:50])
                if dedup_key in seen:
                    continue
                seen.add(dedup_key)

                reviews.append({
                    "Critic": str(critic).strip(),
                    "Publication": str(publication).strip(),
                    "Date": str(date).strip(),
                    "Review Text": str(review_text).strip(),
                    "Sentiment": str(sentiment).strip(),
                    "Original Score": str(original_score).strip(),
                    "Review URL": str(review_url).strip(),
                    "Top Critic": is_top_critic,
                })

    return reviews


def _first_text(candidates):
    """Return the stripped text of the first candidate element found, else ""."""
    for el in candidates:
        if el:
            return el.get_text(strip=True)
    return ""


def _sentiment_from_markup(row):
    """Guess a review's sentiment from keywords in the row's HTML markup."""
    # Drop the site name first: any "rottentomatoes" URL inside the row would
    # otherwise match the "rotten" keyword.
    markup = str(row).lower().replace("rottentomatoes", "")
    if "certified-fresh" in markup or "certified_fresh" in markup:
        return "Certified Fresh"
    if "rotten" in markup:
        return "Rotten"
    if "fresh" in markup:
        return "Fresh"
    return ""


def _review_from_lines(lines):
    """Map an ordered list of text fragments onto review fields by position."""
    return {
        "Critic": lines[0] if len(lines) > 0 else "",
        "Publication": lines[1] if len(lines) > 1 else "",
        "Date": lines[-1] if len(lines) > 2 else "",
        "Review Text": " ".join(lines[2:-1]) if len(lines) > 3 else "",
        "Sentiment": "",
        "Original Score": "",
        "Review URL": "",
        "Top Critic": False,
    }


def parse_reviews_from_html(driver):
    """
    Fallback: parse reviews directly from the rendered HTML/DOM.

    Three strategies are tried in order; the first one that finds any
    candidate elements produces the result:
      A. ``div.review-row`` containers, with per-field selector fallbacks.
      B. Any element whose ``data-qa`` value contains "review"; fields are
         assigned by the position of its text fragments.
      C. ``innerText`` of matching elements, read through JavaScript in the
         live browser; fields are assigned by line position.

    Args:
        driver: Selenium WebDriver with the reviews page loaded.

    Returns:
        A list of dicts with keys "Critic", "Publication", "Date",
        "Review Text", "Sentiment", "Original Score", "Review URL" and
        "Top Critic". Only Strategy A fills "Sentiment"; the last three
        columns are always left at their blank/False defaults here.
    """
    soup = BeautifulSoup(driver.page_source, "lxml")
    reviews = []

    # ── Strategy A: div.review-row ──
    rows = soup.find_all("div", class_="review-row")
    if rows:
        print(f"  HTML Strategy A: Found {len(rows)} 'review-row' elements")
        for row in rows:
            reviews.append({
                "Critic": _first_text([
                    row.find("a", attrs={"data-qa": "review-critic-link"}),
                    row.find("a", class_=lambda c: c and "critic" in c.lower()),
                    row.find("rt-link", attrs={"slot": "criticName"}),
                    row.find("a", href=lambda h: h and "/critic/" in h),
                ]),
                "Publication": _first_text([
                    row.find("a", class_=lambda c: c and "publication" in c.lower()),
                    row.find("rt-link", attrs={"slot": "publicationName"}),
                    row.find("em"),
                    row.find("a", href=lambda h: h and "/source/" in h),
                ]),
                "Date": _first_text([
                    row.find(attrs={"data-qa": "review-date"}),
                    row.find("rt-text", attrs={"slot": "displayDate"}),
                    row.find("span", class_=lambda c: c and "date" in c.lower()),
                ]),
                "Review Text": _first_text([
                    row.find(attrs={"data-qa": "review-text"}),
                    row.find("rt-text", attrs={"slot": "reviewQuote"}),
                    row.find("p", class_=lambda c: c and "review" in c.lower()),
                    row.find("div", class_=lambda c: c and "review-text" in c.lower()),
                ]),
                # NOTE(review): keyword match on the whole row markup; a quote
                # that itself contains "fresh"/"rotten" can still skew this.
                "Sentiment": _sentiment_from_markup(row),
                "Original Score": "",
                "Review URL": "",
                "Top Critic": False,
            })
        return reviews

    # ── Strategy B: data-qa based ──
    rows = soup.find_all(attrs={"data-qa": lambda v: v and "review" in v.lower()})
    if rows:
        print(f"  HTML Strategy B: Found {len(rows)} data-qa review elements")
        for row in rows:
            texts = [t.strip() for t in row.stripped_strings if t.strip()]
            reviews.append(_review_from_lines(texts))
        return reviews

    # ── Strategy C: read innerText of candidate elements via JavaScript ──
    try:
        js_reviews = driver.execute_script("""
            const rows = document.querySelectorAll(
                'div[class*="review"], [data-qa*="review"], .review-row'
            );
            return Array.from(rows).map(r => r.innerText);
        """)
        if js_reviews:
            print(f"  HTML Strategy C (JS): Found {len(js_reviews)} text blocks")
            for text in js_reviews:
                lines = [line.strip() for line in text.split('\n') if line.strip()]
                reviews.append(_review_from_lines(lines))
            return reviews
    except Exception as exc:
        # Still best effort, but say why the strategy was abandoned.
        print(f"  HTML Strategy C (JS) failed: {exc}")

    print("WARNING: No reviews found with any strategy.")
    print("Set HEADLESS = False, re-run Step 1, and inspect the page in the browser.")
    return reviews


# ── Try API data first, fall back to HTML ──
# Prefer the structured API payloads when any were captured
reviews = parse_reviews_from_api(captured_api_data) if captured_api_data else []
if captured_api_data:
    print(f"Parsed {len(reviews)} reviews from captured API responses.")

# Nothing usable from the API: scrape the rendered page instead
if len(reviews) == 0:
    print("API parsing yielded 0 reviews. Falling back to HTML parsing...")
    reviews = parse_reviews_from_html(driver)
    print(f"Parsed {len(reviews)} reviews from HTML.")

# Number the reviews 1..N in the order they were parsed
for review_id, review in enumerate(reviews, 1):
    review["Review ID"] = review_id

print(f"\nTotal reviews: {len(reviews)}")
if reviews:
    print("\nSample review:")
    first_review = reviews[0]
    for field in first_review:
        print(f"  {field}: {first_review[field]}")

In [None]:
# ── Step 4: Save to CSV ───────────────────────────────────────

# Preferred column order; columns missing from the data are simply skipped
col_order = ["Review ID", "Critic", "Publication", "Date", "Review Text",
             "Sentiment", "Original Score", "Top Critic", "Review URL"]

df = pd.DataFrame(reviews)
present_columns = [name for name in col_order if name in df.columns]
df = df[present_columns]

csv_filename = f"{MOVIE_SLUG}_critic_reviews.csv"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
df.to_csv(csv_path, index=False, encoding="utf-8")
print(f"Saved {len(df)} reviews to: {csv_path}")

# Per-column count of cells that are not blank once stringified and stripped
print("\nColumn summary:")
for col in df.columns:
    stripped_cells = df[col].astype(str).str.strip()
    non_empty = (stripped_cells != "").sum()
    print(f"  {col}: {non_empty}/{len(df)} non-empty")
df.head(10)

In [None]:
# ── Step 5: Clean up ──────────────────────────────────────────

# Shut down the Chrome session created in Step 1 (`driver = make_driver(...)`).
# Cells that use `driver` cannot be re-run after this without re-running Step 1.
driver.quit()
print("Browser closed.")