In [10]:
import time
import asyncio
import json
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout



In [8]:
import json
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
from dateutil import parser as dateparser

START_URL = "https://www.goodreads.com/book/show/32758901-all-systems-red/reviews"


# -----------------------------
# Helpers
# -----------------------------

def normalize_date(date_str):
    if not date_str:
        return None
    try:
        return dateparser.parse(date_str).date().isoformat()
    except:
        return date_str.strip()


def review_signature(r):
    return (
        (r["username"] or "").strip(),
        (r["date"] or "").strip(),
        r["text"][:200]
    )


async def expand_review_text(page):
    """
    Expand review-level 'Show more' links/buttons.
    """
    expanders = await page.query_selector_all(
        "article a:has-text('Show more'), article button:has-text('Show more')"
    )
    for el in expanders:
        try:
            await page.evaluate("(e) => e.click()", el)
            await page.wait_for_timeout(120)
        except:
            pass


async def extract_current_reviews(page):
    """
    Extract the CURRENT batch of reviews (‚âà30).
    Dates are pulled from subdued metadata spans (not <time>).
    """
    return await page.eval_on_selector_all(
        "article, div[data-testid='review']",
        r"""
        cards => cards.map(card => {
            const username =
                card.querySelector('a[href*="/user/show"]')?.innerText?.trim() ?? null;

            // Goodreads date is usually subdued metadata text
            let date = null;
            const spans = Array.from(card.querySelectorAll('span'))
                .map(s => s.innerText.trim())
                .filter(t =>
                    t.match(/\d{4}|ago|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/)
                );

            if (spans.length > 0) {
                date = spans[spans.length - 1];
            }

            return {
                username,
                date,
                text: card.innerText.trim()
            };
        })
        """
    )


async def find_page_level_load_more(page):
    """
    Find page-level 'Show more reviews' (NOT inside a review card).
    """
    buttons = await page.query_selector_all("button:has-text('Show more')")
    for btn in buttons:
        inside_review = await btn.evaluate(
            "el => el.closest('article, div[data-testid=\"review\"]') !== null"
        )
        if not inside_review:
            return btn
    return None


# -----------------------------
# Main scraper
# -----------------------------

async def scrape_goodreads_reviews(
    max_pages=20,
    manual_unlock=True
):
    all_reviews = []
    seen_reviews = set()
    seen_page_heads = set()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)

        # üîë SHORT viewport is REQUIRED
        context = await browser.new_context(
            viewport={"width": 1280, "height": 700}
        )
        page = await context.new_page()

        await page.goto(START_URL, wait_until="domcontentloaded")
        await page.wait_for_timeout(4000)

        # -----------------------------
        # Manual unlock (optional)
        # -----------------------------
        if manual_unlock:
            print("\nüö® MANUAL STEP (ONCE) üö®")
            print("‚Ä¢ Scroll normally")
            print("‚Ä¢ Click 'Show more reviews' ONCE")
            print("‚Ä¢ Confirm reviews change")
            input("üëâ Press Enter to continue automation...")

        # -----------------------------
        # Pagination loop (REPLACE model)
        # -----------------------------
        for page_num in range(max_pages):
            print(f"\n--- Review batch {page_num + 1} ---")

            # Expand visible review text FIRST
            await expand_review_text(page)

            current_reviews = await extract_current_reviews(page)
            if not current_reviews:
                print("No reviews found. Stopping.")
                break

            page_head = current_reviews[0]["text"][:200]
            if page_head in seen_page_heads:
                print("Reached repeated page. Done.")
                break
            seen_page_heads.add(page_head)

            new_count = 0
            for r in current_reviews:
                r["date"] = normalize_date(r["date"])
                sig = review_signature(r)
                if sig not in seen_reviews:
                    seen_reviews.add(sig)
                    all_reviews.append(r)
                    new_count += 1

            print(f"Collected {new_count} new reviews (total {len(all_reviews)})")

            # Find page-level pagination button
            load_more = await find_page_level_load_more(page)
            if not load_more:
                print("No 'Show more reviews' button found. Done.")
                break

            before_text = current_reviews[0]["text"]

            await load_more.scroll_into_view_if_needed()
            await page.wait_for_timeout(500)
            await page.evaluate("(b) => b.click()", load_more)

            # Wait for content replacement
            try:
                await page.wait_for_timeout(1500)
                after_reviews = await extract_current_reviews(page)
                if after_reviews and after_reviews[0]["text"] == before_text:
                    print("Page did not change. Stopping.")
                    break
            except PlaywrightTimeout:
                print("Timeout waiting for next page.")
                break

        await browser.close()

    return all_reviews




In [9]:
reviews = await scrape_goodreads_reviews(
    max_pages=5,
    manual_unlock=True
)

print(f"Final review count: {len(reviews)}")


üö® MANUAL STEP (ONCE) üö®
‚Ä¢ Scroll normally
‚Ä¢ Click 'Show more reviews' ONCE
‚Ä¢ Confirm reviews change


üëâ Press Enter to continue automation... 



--- Review batch 1 ---
Collected 31 new reviews (total 31)

--- Review batch 2 ---
Collected 30 new reviews (total 61)

--- Review batch 3 ---
Collected 30 new reviews (total 91)

--- Review batch 4 ---
Collected 30 new reviews (total 121)

--- Review batch 5 ---
Collected 30 new reviews (total 151)
Final review count: 151


In [10]:
reviews

[{'username': '',
  'date': '2017-08-29',
  'text': 'K.J. Charles\nAuthor\xa0\n65 books\n12.1k followers\nFollow\nRead\nAugust 29, 2017\nMagnificent. Tor.com continue to rock it with this fantastic tale of a self-hacked cyborg security unit with a gloriously bad attitude. It just wants to sit there downloading soaps but HEY HO SABOTAGE, PEOPLE TO SAVE, PEOPLE TO KILL.\n\nTerrific writing, lovely characterisation, very funny, and there will be more! Consider me glued.\nfun-fun-fun\n \nnovella\n \noh-god-where-is-the-next-book\n \n...more\n128 likes\n3 comments\nLike\nComment'},
 {'username': '',
  'date': 'read-in-2019',
  'text': "Melanie (meltotheany)\n1,195 reviews\n102k followers\nFollow\nAugust 23, 2019\nI really enjoyed this! And it had such a great discussion on what it means to be human and what we are all willing to do in the name of trust. I can't wait to continue on with this series! <3\n\nBlog | Instagram | Twitter | Tumblr | Youtube | Twitch\n\nBuddy read with Lea! ‚ù§\n\n‚

In [11]:
with open("dates_151_goodreads_reviews.json", "w", encoding="utf-8") as f:
    json.dump(reviews, f, ensure_ascii=False, indent=2)

In [25]:
len(reviews)

31

In [13]:
from collections import Counter


In [14]:
texts = [r["text"].strip() for r in reviews if r.get("text")]
text_counts = Counter(texts)

exact_dupes = {t: c for t, c in text_counts.items() if c > 1}

print(f"Exact text duplicates: {len(exact_dupes)}")

Exact text duplicates: 0
