In [78]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException

# --- CONFIGURATION ---
# Listing page that load_and_scroll_list() opens before it starts scrolling.
BASE_URL = "https://rollcall.com/factbase/"
# NOTE(review): not referenced by any code visible in this notebook;
# scrape_transcripts() writes to "trump_data.csv" instead. Confirm whether
# this constant is still needed.
CSV_FILENAME = "trump_transcripts.csv"


def setup_driver(headless=False):
    """Launch a Chrome WebDriver configured for scraping ad-heavy pages.

    Args:
        headless: When True, Chrome is started with ``--headless=new``.

    Returns:
        A ready-to-use Chrome WebDriver with the 'eager' page-load strategy,
        ad-network request blocking enabled via the DevTools protocol, and a
        20 second page-load timeout.

    Raises:
        Whatever Selenium raises if Chrome cannot be started or configured.
        If configuration fails after Chrome has started, the browser is shut
        down before the error is re-raised so no process is left behind.
    """
    options = webdriver.ChromeOptions()

    # 1. Basic ad/popup suppression.
    # "--disable-popup-blocking" is intentionally NOT passed: that switch turns
    # Chrome's built-in popup blocker OFF, which is the opposite of what a
    # scraper fighting popups wants.
    for flag in (
        "--disable-notifications",  # Blocks "Show Notifications" prompts
        "--disable-infobars",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    # 2. Aggressive page loading (don't wait for ad scripts).
    options.page_load_strategy = 'eager'

    if headless:
        options.add_argument("--headless=new")  # Newer headless mode is more stable

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.execute_cdp_cmd("Network.enable", {})

        # Set blocked URLs (basic pattern matching).
        # NOTE(review): "*ads*" matches any URL containing "ads" (e.g.
        # "uploads", "downloads"), so it may also block legitimate assets.
        driver.execute_cdp_cmd("Network.setBlockedURLs", {
            "urls": [
                "*.doubleclick.net",
                "*.googleadservices.com",
                "*.googlesyndication.com",
                "*.moatads.com",
                "*ads*",
            ]
        })
        driver.set_page_load_timeout(20)
    except BaseException:
        # Chrome is already running at this point; don't orphan the process.
        driver.quit()
        raise
    return driver


def load_and_scroll_list(driver, num_scrolls=3, scroll_pause=2, content_timeout=5):
    """Open the listing page (BASE_URL) and scroll to the bottom repeatedly.

    Args:
        driver: Selenium WebDriver used for navigation and scripting.
        num_scrolls: How many times to scroll to the bottom of the page.
        scroll_pause: Seconds to sleep after each scroll so newly loaded
            items can render.
        content_timeout: Seconds to wait for the first row-like ``div`` to be
            present before scrolling starts.

    Returns:
        None. The driver is left on the scrolled listing page.

    Load problems are reported with a printed warning and otherwise ignored
    (best effort): a page-load timeout stops the pending load with
    ``window.stop()``, and a missing row element does not abort scrolling.
    """
    print(f"Opening {BASE_URL}...")

    try:
        driver.get(BASE_URL)
    except TimeoutException:
        print("Page loading took too long! Stopping extra loading and continuing...")
        driver.execute_script("window.stop();")  # Force stop the browser loading spinner
    except Exception as e:
        print(f"Warning during load: {e}")

    # The page load above may have been cut short, so explicitly wait for the
    # content we actually need instead of trusting the load event.
    print("Verifying content loaded...")
    try:
        WebDriverWait(driver, content_timeout).until(
            EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'row')]"))
        )
        print('Initial content loaded successfully.')
    except Exception:
        print("Warning: Content might not be fully visible yet.")

    print('Starting to scroll...')
    for i in range(num_scrolls):
        # Dismiss any overlay first; it could otherwise sit on top of the list.
        close_popups(driver)
        print(f"[List Page] Scrolling down ({i + 1}/{num_scrolls})...")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)  # Wait for Vue.js to render new items

    print("[List Page] Finished scrolling.")


def get_transcript_links(driver):
    """Collect the unique hrefs of all 'View Transcript' anchors on the page.

    Args:
        driver: Selenium WebDriver currently showing the listing page.

    Returns:
        A list of URL strings in first-seen order with duplicates removed.
        Anchors without an href are ignored. If the element lookup itself
        fails, the error is printed and an empty list is returned.
    """
    print("Extracting links...")

    # A dict is used as an insertion-ordered set: O(1) duplicate checks while
    # keeping the order in which links appear on the page.
    unique_links = {}

    try:
        elements = driver.find_elements(By.CSS_SELECTOR, "a[title='View Transcript']")

        print(f"DEBUG: Found {len(elements)} raw elements. Processing...")

        for elem in elements:
            try:
                url = elem.get_attribute('href')
            except Exception:
                # Skip elements that cannot be read. Deliberately not a bare
                # `except:` so Ctrl-C (KeyboardInterrupt) still propagates.
                continue
            if url:
                unique_links.setdefault(url, None)

    except Exception as e:
        print(f"[Error] Could not extract links: {e}")

    links = list(unique_links)
    print(f"[List Page] Found {len(links)} unique transcript links.")
    return links

In [62]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path
import time

# 1. Setup Chrome options
chrome_options = Options()

# 2. Add the path to your downloaded .crx file
# Built from the current user's home directory instead of a hard-coded
# "/Users/<name>/..." prefix so the cell is not tied to one account.
# NOTE(review): Chrome's profile "Extensions" folder normally holds unpacked
# version directories, not .crx archives — confirm this file actually exists.
EXTENSION_CRX = (
    Path.home()
    / "Library" / "Application Support" / "Google" / "Chrome" / "Default"
    / "Extensions" / "ddkjiahejlhfcafbddmgiahcphecmpfh" / "2025.1207.2142_0.crx"
)
chrome_options.add_extension(str(EXTENSION_CRX))

# 3. Initialize the driver with options
driver = webdriver.Chrome(options=chrome_options)

# 4. Test it out
try:
    driver.get("https://www.speedtest.net") # A site usually heavy with ads
    time.sleep(10)
finally:
    # Close the browser even if navigation raises, so no Chrome process leaks.
    driver.quit()

In [63]:
from selenium import webdriver

# Experiment: block common ad hosts through the Chrome DevTools Protocol and
# load an ad-heavy page to observe the effect.
# The browser is left open for manual inspection; call driver.quit() when done.
driver = webdriver.Chrome()

# Network tracking must be enabled before URL blocking can be configured.
driver.execute_cdp_cmd("Network.enable", {})

# URL patterns to block (basic wildcard matching).
blocked_url_patterns = [
    "*.doubleclick.net",
    "*.googleadservices.com",
    "*.googlesyndication.com",
    "*.moatads.com",
    "*ads*",
]
driver.execute_cdp_cmd("Network.setBlockedURLs", {"urls": blocked_url_patterns})

driver.get("https://www.speedtest.net")

In [5]:
# driver = setup_driver()
# try:
#     load_and_scroll_list(driver)
#     links = get_transcript_links(driver)
#
#     print("\n--- RESULTS ---")
#     print(links)
# finally:
#     driver.quit()

In [77]:
import re


def _parse_event_date(title):
    """Return the trailing 'Month D, YYYY' date in *title*, or "Unknown" if absent."""
    date_match = re.search(r'([A-Z][a-z]+ \d{1,2}, \d{4})$', title)
    return date_match.group(1) if date_match else "Unknown"


def _extract_remark(row):
    """Convert one transcript row element into a time/speaker/remark dict.

    Returns None when the speaker or the remark text is empty. Lookup errors
    for the mandatory speaker/remark elements propagate to the caller; a
    missing timestamp falls back to "00:00:00".
    """
    # 1. Speaker name (in the <h2> tag)
    speaker = row.find_element(By.TAG_NAME, "h2").text.strip()

    # 2. The remark text (in the 'leading-loose' div)
    remark = row.find_element(By.XPATH, ".//div[contains(@class, 'leading-loose')]").text.strip()

    # 3. Timestamp (optional: helps sort chronologically inside the event)
    timestamp = "00:00:00"
    try:
        time_elem = row.find_element(By.XPATH, ".//span[contains(@class, 'text-gray-600')]")
        # Clean text like "00:12:30-00:12:45 (15 sec)" -> "00:12:30"
        timestamp = time_elem.text.split('-')[0].strip()
    except Exception:
        # Timestamp is optional; keep the default. Not a bare `except:` so
        # KeyboardInterrupt is never swallowed here.
        pass

    if not (speaker and remark):
        return None
    return {"time": timestamp, "speaker": speaker, "remark": remark}


def scrape_single_transcript(driver, url):
    """Scrape one transcript page into a single event record.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Transcript page URL. Its last path segment becomes the record's
            ``unique_id`` so two events on the same day never collide.

    Returns:
        ``[event]`` where ``event`` has the keys ``unique_id``, ``date``,
        ``event_name``, ``url`` and ``transcript`` (a list of dicts with
        ``time``, ``speaker`` and ``remark``), or ``[]`` when the page could
        not be loaded or scraping failed. ``date`` and ``event_name`` stay
        None if the page title cannot be read.

    The page is loaded exactly once. A page-load timeout is not treated as
    fatal: the pending load is stopped and whatever has rendered is scraped.
    """
    # Create a unique ID from the URL slug (e.g., 'trump-rally-ohio-dec-15')
    slug = url.rstrip('/').split('/')[-1]

    print(f"   -> Visiting: {url}")
    print(f"   -> Event ID: {slug}")

    try:
        driver.get(url)
    except TimeoutException:
        print("   [Warn] Page load timed out (renderer error). Stopping load and proceeding...")
        driver.execute_script("window.stop();") # Force stop the loading spinner
    except Exception as e:
        print(f"   [Error] Unexpected error during get(): {e}")
        return []

    event_data = {
        "unique_id": slug,         # PRIMARY KEY
        "date": None,
        "event_name": None,
        "url": url,
        "transcript": []
    }

    row_xpath = "//div[contains(@class, 'flex gap-4 py-2')]"

    try:
        wait = WebDriverWait(driver, 15)

        # --- A. METADATA EXTRACTION ---
        try:
            title_elem = wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
            raw_title = title_elem.text.strip()
            event_data["event_name"] = raw_title
            # The date is expected at the end of the title, e.g. "December 13, 2025".
            event_data["date"] = _parse_event_date(raw_title)
        except Exception as e:
            print(f"   [Warning] Metadata issue: {e}")

        # --- B. TRANSCRIPT EXTRACTION ---
        rows = driver.find_elements(By.XPATH, row_xpath)

        if not rows:
            # Nothing found: dismiss any popup and look once more.
            close_popups(driver)
            rows = driver.find_elements(By.XPATH, row_xpath)

        transcript_list = []
        for row in rows:
            try:
                entry = _extract_remark(row)
            except Exception:
                continue # Skip malformed rows
            if entry is not None:
                transcript_list.append(entry)

        event_data["transcript"] = transcript_list

    except Exception as e:
        print(f"   [Error] Failed to scrape {url}: {e}")
        return []

    return [event_data]

In [48]:
def close_popups(driver):
    """Try to dismiss one popup by clicking a common 'close' control.

    Each selector is tried in order; the first visible button that can be
    clicked is clicked, followed by a one second pause for the closing
    animation.

    Args:
        driver: Selenium WebDriver showing the page to clean up.

    Returns:
        True if a button was clicked, False if no candidate could be clicked.

    Lookup and click failures are ignored per element (best effort), so one
    stale or covered button does not stop the remaining candidates from being
    tried. KeyboardInterrupt is not swallowed.
    """
    print("   [Defense] Scanning for popups...")

    # Common selectors for ad close buttons
    popup_selectors = [
        "button[aria-label='Close']",
        "button.close",
        "div[class*='modal'] button",
        "svg[data-icon='times']",    # FontAwesome 'X' icon
        "//button[contains(text(), 'Close')]",
        "//button[contains(text(), 'No Thanks')]"
    ]

    for selector in popup_selectors:
        # XPath expressions in the list start with "//"; the rest are CSS.
        by_method = By.XPATH if selector.startswith("//") else By.CSS_SELECTOR

        try:
            buttons = driver.find_elements(by_method, selector)
        except Exception:
            continue

        for btn in buttons:
            try:
                if not btn.is_displayed():
                    continue
                btn.click()
            except Exception:
                # Stale or intercepted element: move on to the next candidate.
                continue

            print("   [Defense] Closed a popup!")
            time.sleep(1) # Wait for animation
            return True # Success

    return False

In [73]:
# Smoke test: scrape one known transcript and print how many remarks were parsed.
driver = setup_driver()
try:
    events = scrape_single_transcript(driver, "https://rollcall.com/factbase/trump/transcript/donald-trump-press-gaggle-kennedy-center-red-carpet-december-7-2025/")
    # scrape_single_transcript returns [] on failure, so guard the [0] lookup.
    print(len(events[0]['transcript']) if events else 0)
finally:
    # Always release the browser, even if the scrape raises.
    driver.quit()

   -> Visiting: https://rollcall.com/factbase/trump/transcript/donald-trump-press-gaggle-kennedy-center-red-carpet-december-7-2025/
   -> Event ID: donald-trump-press-gaggle-kennedy-center-red-carpet-december-7-2025
194


In [79]:
import pandas as pd
import json
import time
import random

def scrape_transcripts(output_file="trump_data.csv", num_scrolls=40, save_every=5, headless=False):
    """Scrape every transcript linked from the listing page into a CSV file.

    Workflow: start a browser, scroll the listing page, collect the transcript
    links, scrape each one, and save the collected events to ``output_file``
    (periodically as a checkpoint and once more at the end).

    Args:
        output_file: CSV path written by save_to_csv.
        num_scrolls: Number of scrolls performed on the listing page. Large
            values are heavy; lower this if the browser becomes unstable.
        save_every: Write a checkpoint after every ``save_every`` processed
            links. Values <= 0 disable checkpoints (the final save still runs).
        headless: Passed through to setup_driver.

    Returns:
        None. Results are written to ``output_file``; a summary is printed.

    A failure on a single link is logged and the link is counted as failed;
    the loop continues. Ctrl-C stops the loop and still saves what has been
    collected. The browser is always closed, even if the final save raises.
    """
    # Setup with the 'eager' strategy and timeout options configured in setup_driver
    driver = setup_driver(headless=headless)
    all_events = []
    failed_urls = []

    try:
        try:
            print("--- Step A: Loading & Scrolling ---")
            load_and_scroll_list(driver, num_scrolls=num_scrolls)

            print("--- Step B: Gathering Links ---")
            links = get_transcript_links(driver)
            print(f"--- Found {len(links)} links to process ---")

            for i, link in enumerate(links):
                print(f"\n[{i+1}/{len(links)}] Processing: {link}")

                try:
                    # 1. Scrape the single page. Wrapped in its own try so a
                    # failure here won't kill the entire loop.
                    new_data = scrape_single_transcript(driver, link)

                    # 2. Validation
                    if not new_data:
                        print("   [!!!] WARNING: No data extracted (Page might be video-only).")
                        failed_urls.append(link)
                    else:
                        # Check size just for logging
                        transcript_len = len(new_data[0].get('transcript', []))
                        print(f"   [OK] Success. Extracted {transcript_len} speech blocks.")

                        # 3. Add to main list
                        all_events.extend(new_data)

                    # 4. Incremental save (safety net)
                    if all_events and save_every > 0 and (i + 1) % save_every == 0:
                        print(f"   [SAVING] Checkpoint reached. Saving {len(all_events)} rows...")
                        save_to_csv(all_events, output_file)

                    # 5. Politeness sleep
                    # Random sleep reduces the chance of renderer hangs/blocks
                    time.sleep(random.uniform(1.5, 3.5))

                except Exception as e:
                    # If a page crashes (Timeout, ElementClickIntercepted, etc.), log and move on.
                    print(f"   [ERROR] Failed to process link: {e}")
                    failed_urls.append(link)
                    # Force stop loading in case a timed-out load is still pending.
                    try:
                        driver.execute_script("window.stop();")
                    except Exception:
                        pass

        except KeyboardInterrupt:
            print("\n[!] Script interrupted by user. Saving collected data...")

        finally:
            # Final report & save
            print("\n" + "="*30)
            print(f"FINISHED. Total Rows in DataFrame: {len(all_events)}")
            print(f"Total Failed URLs: {len(failed_urls)}")
            if failed_urls:
                print(f"Sample failed URL: {failed_urls[0]}")
            print("="*30)

            if all_events:
                save_to_csv(all_events, output_file)
                print(f"Final data saved to {output_file}")

    finally:
        # Outermost finally: the browser must be closed even if the final
        # save above raises.
        driver.quit()

def save_to_csv(data, filename):
    """Write event records to ``filename`` as CSV (no index column).

    ``data`` is turned into a DataFrame. When a 'transcript' column is
    present, each of its values is serialized with ``json.dumps`` so the
    nested list of remarks fits into a single CSV cell.
    """
    frame = pd.DataFrame(data)
    if 'transcript' in frame.columns:
        # Nested lists of dicts cannot be stored in a CSV cell as-is.
        encoded = [json.dumps(remarks) for remarks in frame['transcript']]
        frame = frame.assign(transcript=encoded)
    frame.to_csv(filename, index=False)

In [80]:
# Full run: starts Chrome, scrolls the listing page, scrapes every transcript
# link that was found, and writes the collected events to "trump_data.csv".
scrape_transcripts()

--- Step A: Loading & Scrolling ---
Opening https://rollcall.com/factbase/...
Verifying content loaded...
Initial content loaded successfully.
Starting to scroll...
   [Defense] Scanning for popups...
[List Page] Scrolling down (1/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (2/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (3/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (4/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (5/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (6/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (7/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (8/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (9/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (10/40)...
   [Defense] Scanning for popups...
[List Page] Scrolling down (11/40)...
   [Defense] Scanning for popu

In [82]:
# Reload the scraper's CSV output. The 'transcript' column holds JSON strings
# (save_to_csv serializes it with json.dumps); decode with json.loads before use.
df = pd.read_csv("trump_data.csv")

In [89]:
print(df.shape)

(1427, 5)


In [91]:
df.columns

Index(['unique_id', 'date', 'event_name', 'url', 'transcript'], dtype='object')

In [85]:
df.head()

Unnamed: 0,unique_id,date,event_name,url,transcript
0,donald-trump-remarks-mexican-border-medal-pres...,"December 15, 2025",Remarks: Donald Trump Presents Mexican Border ...,https://rollcall.com/factbase/trump/transcript...,"[{""time"": ""00:00:00"", ""speaker"": ""Donald Trump..."
1,donald-trump-speech-white-house-christmas-rece...,"December 14, 2025",Speech: Donald Trump Addresses a White House C...,https://rollcall.com/factbase/trump/transcript...,"[{""time"": ""00:00:00"", ""speaker"": ""Donald Trump..."
2,donald-trump-remarks-after-marine-one-arrival-...,"December 13, 2025",Remarks: Donald Trump Speaks to Reporters Afte...,https://rollcall.com/factbase/trump/transcript...,"[{""time"": ""00:00:00"", ""speaker"": ""Donald Trump..."
3,donald-trump-press-gaggle-before-marine-one-de...,"December 13, 2025",Press Gaggle: Donald Trump Speaks to Reporters...,https://rollcall.com/factbase/trump/transcript...,"[{""time"": ""00:00:00"", ""speaker"": ""Question"", ""..."
4,donald-trump-remarks-1980-olympic-hockey-team-...,"December 12, 2025",Remarks: Donald Trump Signs a Bill Honoring th...,https://rollcall.com/factbase/trump/transcript...,"[{""time"": ""00:00:00"", ""speaker"": ""Donald Trump..."
