In [1]:
import sys
from datetime import datetime
import os
import re
import glob

# Use absolute path to project root to avoid directory confusion
# NOTE(review): this is a machine-specific location; the notebook only runs
# as-is on a checkout that lives at exactly this path.
project_root = "/Users/jstenger/Documents/repos/kalshi-research"
# Put the project root at the front of sys.path so the `src.` imports below can
# be resolved from it (src_path further down is project_root/src).
# This insert is unconditional, so every re-run of the cell adds another copy
# of the same entry to sys.path.
sys.path.insert(0, project_root)

# Project-local modules. process_new_transcripts is imported here but is not
# called in the cells visible in this notebook.
from src.speech_scraper import TrumpSpeechScraper
from src.process_transcripts import process_all_transcripts, process_new_transcripts

# Add src to path for imports
# NOTE(review): this runs after the two `src.` imports above, so it can only
# affect imports executed later; if those modules need `src` on sys.path at
# import time, this has to move above them — confirm before reordering.
# Unlike the insert above, this append is guarded and is safe to re-run.
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Find the highest numbered transcript_urls file using absolute path
transcript_urls_dir = os.path.join(project_root, "data/transcript-urls")
transcript_urls_pattern = os.path.join(transcript_urls_dir, "transcript_urls*.txt")

existing_files = glob.glob(transcript_urls_pattern)

# Numeric suffixes of files named exactly like "transcript_urls23.txt". The glob
# above is looser than this regex, so names without a pure-digit suffix
# (e.g. "transcript_urls_old.txt") are skipped.
numbers = []
for file in existing_files:
    match = re.search(r'transcript_urls(\d+)\.txt$', os.path.basename(file))
    if match:
        numbers.append(int(match.group(1)))

# Bind both results up front so they exist on every path: an empty directory
# starts numbering at 1, and latest_date stays None unless a date is actually
# parsed below. Previously latest_date could be left undefined, which only
# surfaced as a NameError in whichever later cell used it.
next_num = 1
latest_date = None

if numbers:
    latest_num = max(numbers)
    next_num = latest_num + 1

    # Find the latest transcript_urls file
    latest_file = os.path.join(transcript_urls_dir, f"transcript_urls{latest_num}.txt")

    # Month names as they appear at the end of a transcript URL, mapped to
    # their month numbers.
    month_map = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12
    }

    # Read the first URL from the latest file and take its trailing
    # "<month>-<day>-<year>" as the most recent transcript date.
    # NOTE(review): this assumes the file lists the newest transcript first —
    # confirm against how the scraper writes the file.
    try:
        with open(latest_file, 'r', encoding='utf-8') as f:
            first_url = f.readline().strip()

        # A trailing slash would defeat the end-of-string anchor, so drop it
        # before matching.
        date_match = re.search(r'(\w+)-(\d+)-(\d+)$', first_url.rstrip('/'))
        if date_match:
            month_name, day, year = date_match.groups()
            month_num = month_map.get(month_name.lower())
            if month_num:
                # Raises ValueError/OverflowError for impossible dates such as
                # "september-31-2025"; reported below instead of being hidden.
                latest_date = datetime(int(year), month_num, int(day))
    except (OSError, ValueError, OverflowError) as exc:
        # Stay best-effort (the save path is still usable), but say what went
        # wrong instead of silently leaving the date unset.
        print(f"Warning: could not read a date from {latest_file}: {exc}")
    else:
        if latest_date is None:
            print(f"Warning: first line of {latest_file} does not end in a month-day-year date")
        else:
            print(f"Latest transcript date: {latest_date.strftime('%B %d, %Y')}")

# The next collection run writes to the first unused number.
save_path = os.path.join(transcript_urls_dir, f"transcript_urls{next_num}.txt")
print(f"Using save path: {save_path}")


# Scraper used by the collection and processing cells below. data_dir is
# derived from project_root so the data location is configured in one place
# instead of being a second copy of the absolute path; on this checkout it
# evaluates to the same "<project_root>/data" string as before.
scraper = TrumpSpeechScraper(
    url="https://rollcall.com/factbase/",
    save_path=save_path,
    data_dir=os.path.join(project_root, "data"),
)

Latest transcript date: September 14, 2025
Using save path: /Users/jstenger/Documents/repos/kalshi-research/data/transcript-urls/transcript_urls27.txt


In [10]:
from datetime import timedelta

# Days to step back from the latest known transcript date when choosing the
# collection cutoff.
# NOTE(review): presumably an overlap window so transcripts added around that
# date are not missed — confirm the intended size.
LOOKBACK_DAYS = 3

# latest_date comes from the setup cell, which can only assign it when the
# first line of the newest URL list ends in a parseable date. Look it up
# defensively so a missing or empty value fails here with an actionable
# message rather than a bare NameError/TypeError.
latest_known_date = globals().get("latest_date")
if latest_known_date is None:
    raise RuntimeError(
        "latest_date is not set: run the setup cell first and make sure the "
        "newest transcript_urls file starts with a URL ending in "
        "<month>-<day>-<year>."
    )

# Set target date to collect transcripts from
target_date = latest_known_date - timedelta(days=LOOKBACK_DAYS)

# Collect transcript URLs
urls = scraper.collect_transcript_urls(target_date=target_date)

print(f"Collection complete. Found {len(urls)} transcript URLs.")

Starting URL collection...
Will only collect transcripts after September 06, 2025
Setting up Chrome WebDriver...
Attempting to use ChromeDriverManager...
Initializing Chrome WebDriver (attempt 1/3)...
Loading URL: https://rollcall.com/factbase/
Page title: Factbase search - Roll Call
Waiting for content to load...
Content element not found: Message: 
Stacktrace:
0   chromedriver                        0x0000000104c7f674 cxxbridge1$str$ptr + 2882776
1   chromedriver                        0x0000000104c775a0 cxxbridge1$str$ptr + 2849796
2   chromedriver                        0x00000001047a10dc _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 73436
3   chromedriver                        0x00000001047e8474 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 365172
4   chromedriver                        0x000000010482966c _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 631916
5   chromedriver                        0x00000001047dc130

In [11]:
# Process the transcripts for the URL list stored at save_path, the
# transcript_urls file path computed in the setup cell.
# NOTE(review): save_path is whatever the setup cell last computed in this
# kernel. The recorded output of this cell loaded transcript_urls26.txt while
# the setup cell's recorded output shows transcript_urls27.txt, so the cells
# were run out of order at some point — re-run from the top before relying on it.
scraper.process_transcripts(url_path=save_path)

Loaded 22 URLs from /Users/jstenger/Documents/repos/kalshi-research/data/transcript-urls/transcript_urls26.txt


Processing transcripts:   5%|▍         | 1/22 [00:05<01:57,  5.60s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-09_no_transcript_-_garrett_haake_interviews_donald_trump_for_nbc_news.txt


Processing transcripts:  27%|██▋       | 6/22 [00:09<00:14,  1.13it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/donald trump vlog/2025-09-09_innovative_housing_showcase.txt


Processing transcripts:  41%|████      | 9/22 [00:12<00:12,  1.08it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/donald trump vlog/2025-09-09_law_and_order.txt


Processing transcripts:  59%|█████▉    | 13/22 [00:14<00:06,  1.36it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press briefing/2025-09-09_tom_homan_speaks_to_reporters_outside_the_white_house.txt


Processing transcripts:  64%|██████▎   | 14/22 [00:14<00:05,  1.51it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-07_donald_trump_speaks_to_reporters_before_marine_one_departure.txt


Processing transcripts:  82%|████████▏ | 18/22 [00:17<00:02,  1.89it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-06_lara_trump_interviews_jd_vance_for_her_my_view_show_on_fox_news.txt


Processing transcripts:  86%|████████▋ | 19/22 [00:18<00:01,  1.62it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-07_donald_trump_speaks_to_reporters_after_air_force_one_arrival.txt


Processing transcripts: 100%|██████████| 22/22 [00:23<00:00,  1.08s/it]


Finished processing. Collected 15 speeches.





In [5]:
# Run the transcript processing step over the project's data directory.
# data_prefix is derived from project_root (set in the setup cell) instead of
# repeating the absolute path, so the data location is configured in one
# place; on this checkout it evaluates to the same "<project_root>/data" string.
process_all_transcripts(data_prefix=os.path.join(project_root, "data"))


Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press briefing/2025-04-11_karoline_leavitt_holds_a_press_briefing_at_the_white_house.txt
Is press briefing: True
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'press briefing', '2025-04-11_karoline_leavitt_holds_a_press_briefing_at_the_white_house.txt']
Processed: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press briefing/2025-04-11_karoline_leavitt_holds_a_press_briefing_at_the_white_house.txt

Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press briefing/2025-08-06_tom_homan_speaks_to_reporters_outside_the_white_house.txt
Is press briefing: True
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'press briefing', '2025-08-06_tom_homan_speaks_to_reporters_outside_the_white_house.txt']
Processed: /Users/jstenger/Documents/re

In [7]:
import glob


def delete_empty_files(paths):
    """Delete every file in ``paths`` whose content is empty or only whitespace.

    Each file is read as UTF-8 text. A file is removed only after its handle
    has been closed, because removing a file that is still open fails on
    Windows. One "Deleted empty file: ..." line is printed per removal.

    Args:
        paths: Iterable of file paths to inspect.

    Returns:
        List of the paths that were deleted, in the order they were visited.

    Raises:
        OSError: If a file cannot be opened or removed.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    deleted = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            is_blank = not f.read().strip()
        if is_blank:  # If file is empty or only whitespace
            os.remove(path)
            print(f"Deleted empty file: {path}")
            deleted.append(path)
    return deleted


# Get all transcript files
transcript_files = glob.glob("/Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/*/*.txt")

# Check each file and delete if empty
deleted_files = delete_empty_files(transcript_files)


Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/press briefing/2025-08-06_tom_homan_speaks_to_reporters_outside_the_white_house.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/press briefing/2025-04-08_pam_bondi_speaks_to_reporters_outside_the_white_house.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/press briefing/2025-07-24_russell_vought_speaks_to_reporters_outside_the_white_house.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/press briefing/2025-05-01_tom_homan_speaks_to_reporters_outside_the_white_house.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/press briefing/2025-07-11_russell_vought_speaks_to_reporters_outside_the_white_house.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/