In [1]:
import sys
from datetime import datetime
import os
import re
import glob

# Use absolute path to project root to avoid directory confusion
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run on another machine without editing it.
project_root = "/Users/jstenger/Documents/repos/kalshi-research"
# Put the project root first on sys.path so the `src.` package imports below resolve.
sys.path.insert(0, project_root)

from src.speech_scraper import TrumpSpeechScraper
from src.process_transcripts import process_all_transcripts, process_new_transcripts, process_vance_transcripts

# Add src to path for imports
# This runs after the `src.*` imports above, so it only affects imports performed
# later (presumably bare-name imports of modules inside src/ -- TODO confirm it is
# still needed). The membership check keeps re-running this cell from appending
# the same entry repeatedly.
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Find the highest numbered transcript_urls file using absolute paths, so the
# notebook's current working directory does not matter.
transcript_urls_dir = os.path.join(project_root, "data/transcript-urls")
transcript_urls_pattern = os.path.join(transcript_urls_dir, "transcript_urls*.txt")

existing_files = glob.glob(transcript_urls_pattern)

# Keep N for every name of the exact form "transcript_urls<N>.txt". The glob can
# also match names without a numeric suffix; those are simply ignored.
numbers = []
for path in existing_files:
    match = re.search(r'transcript_urls(\d+)\.txt$', os.path.basename(path))
    if match:
        numbers.append(int(match.group(1)))

# Month names as they appear at the end of a transcript URL -> month number.
month_map = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12
}

# Always bind latest_date so later cells can test it instead of hitting a
# NameError when no file exists or the first URL cannot be parsed.
latest_date = None
next_num = 1

if numbers:
    latest_num = max(numbers)
    next_num = latest_num + 1

    # Find the latest transcript_urls file
    latest_file = os.path.join(transcript_urls_dir, f"transcript_urls{latest_num}.txt")

    # Read the first URL from the latest file to get the most recent date
    try:
        with open(latest_file, 'r', encoding='utf-8') as f:
            first_url = f.readline().strip()

        # Extract date from URL (format: month-day-year)
        date_match = re.search(r'(\w+)-(\d+)-(\d+)$', first_url)
        if date_match:
            month_name, day, year = date_match.groups()
            month_num = month_map.get(month_name.lower())
            if month_num:
                # datetime() raises ValueError for impossible dates (e.g. day 31
                # in a 30-day month); that is reported below, not swallowed.
                latest_date = datetime(int(year), month_num, int(day))
    except (OSError, ValueError, OverflowError) as exc:
        # Best effort: keep going, but say why the date is unavailable.
        print(f"Warning: could not read latest transcript date from {latest_file}: {exc}")

if latest_date is not None:
    print(f"Latest transcript date: {latest_date.strftime('%B %d, %Y')}")
else:
    print("Warning: latest transcript date could not be determined; "
          "latest_date is None. Set latest_date manually before collecting URLs.")

save_path = os.path.join(transcript_urls_dir, f"transcript_urls{next_num}.txt")
print(f"Using save path: {save_path}")


# Build the scraper. data_dir is derived from project_root instead of repeating
# the absolute project path as a second literal, so the two cannot drift apart.
# On this machine it resolves to the same directory as before.
scraper = TrumpSpeechScraper(
    url="https://rollcall.com/factbase/",
    save_path=save_path,
    data_dir=os.path.join(project_root, "data"),
)

Latest transcript date: September 22, 2025
Using save path: /Users/jstenger/Documents/repos/kalshi-research/data/transcript-urls/transcript_urls29.txt


In [2]:
from datetime import timedelta

# Start collecting a few days before the newest transcript already on record
# (presumably so items published around that date are not missed -- confirm).
lookback_window = timedelta(days=3)
target_date = latest_date - lookback_window

# Ask the scraper for transcript URLs; per its recorded log output it only
# collects transcripts dated after target_date.
urls = scraper.collect_transcript_urls(target_date=target_date)

print(f"Collection complete. Found {len(urls)} transcript URLs.")

Starting URL collection...
Will only collect transcripts after September 19, 2025
Setting up Chrome WebDriver...
Attempting to use ChromeDriverManager...
Initializing Chrome WebDriver (attempt 1/3)...
Initialization attempt 1 failed: Message: Can not connect to the Service /Users/jstenger/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver

Retrying initialization...
Initializing Chrome WebDriver (attempt 2/3)...
Initialization attempt 2 failed: Message: Can not connect to the Service /Users/jstenger/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver

Retrying initialization...
Initializing Chrome WebDriver (attempt 3/3)...
Initialization attempt 3 failed: Message: Can not connect to the Service /Users/jstenger/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver

Error collecting URLs: Failed to initialize WebDriver after multiple attempts


Service process refused to terminate gracefully with SIGTERM, escalating to SIGKILL.
Traceback (most recent call last):
  File "/Users/jstenger/Documents/repos/kalshi-research/venv/lib/python3.13/site-packages/selenium/webdriver/common/service.py", line 179, in _terminate_process
    self.process.wait(60)
    ~~~~~~~~~~~~~~~~~^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.3/Frameworks/Python.framework/Versions/3.13/lib/python3.13/subprocess.py", line 1277, in wait
    return self._wait(timeout=timeout)
           ~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.3/Frameworks/Python.framework/Versions/3.13/lib/python3.13/subprocess.py", line 2055, in _wait
    raise TimeoutExpired(self.args, timeout)
subprocess.TimeoutExpired: Command '['/Users/jstenger/.wdm/drivers/chromedriver/mac64/140.0.7339.207/chromedriver-mac-arm64/chromedriver', '--port=62574']' timed out after 60 seconds


Collection complete. Found 0 transcript URLs.


In [9]:
# Process the transcripts listed in the URL file at save_path. Judging by the
# recorded output, this loads the URL list from url_path and skips transcripts
# that already exist under data/raw-transcripts/.
# NOTE(review): the recorded output below loaded transcript_urls28.txt, while the
# first cell's recorded save_path was transcript_urls29.txt -- the output appears
# to come from a different kernel state. On a fresh Run All, save_path points at
# the newly numbered file, which presumably only exists if URL collection
# succeeded -- confirm.
scraper.process_transcripts(url_path=save_path)

Loaded 28 URLs from /Users/jstenger/Documents/repos/kalshi-research/data/transcript-urls/transcript_urls28.txt


Processing transcripts:   4%|▎         | 1/28 [00:06<02:48,  6.26s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-18_donald_trump_departs_windsor_castle_in_the_united_kingdom.txt


Processing transcripts:   7%|▋         | 2/28 [00:12<02:38,  6.09s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-18_donald_trump_and_keir_starmer_announce_a_joint_tech_investment_deal.txt


Processing transcripts:  11%|█         | 3/28 [00:13<01:39,  4.00s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-18_martha_maccallum_of_fox_news_interviews_donald_trump_in_the_uk.txt


Processing transcripts:  32%|███▏      | 9/28 [00:18<00:19,  1.03s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press conference/2025-09-18_donald_trump_and_keir_starmer_of_the_uk_hold_a_press_event.txt


Processing transcripts:  46%|████▋     | 13/28 [00:22<00:13,  1.10it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-18_donald_trump_speaks_to_reporters_on_board_air_force_one.txt


Processing transcripts:  64%|██████▍   | 18/28 [00:24<00:04,  2.22it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-17_donald_trump_speaks_briefly_at_a_chapel_in_windsor_castle_in_the_uk.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-17_donald_trump_tours_the_royal_collection_at_windsor_castle_in_the_uk.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-17_jesse_watters_interviews_jd_vance_in_howell_michigan.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-16_donald_trump_speaks_to_reporters_at_winfield_house_in_london.txt


Processing transcripts:  68%|██████▊   | 19/28 [00:24<00:03,  2.53it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-17_donald_trump_addresses_a_state_banquet_in_the_united_kingdom.txt


Processing transcripts:  71%|███████▏  | 20/28 [00:25<00:03,  2.34it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-17_jd_vance_discusses_tax_savings_and_spending_cuts_in_howell_michigan.txt


Processing transcripts:  75%|███████▌  | 21/28 [00:25<00:03,  2.11it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-16_donald_trump_speaks_to_reporters_on_board_air_force_one.txt


Processing transcripts:  82%|████████▏ | 23/28 [00:27<00:02,  1.70it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-16_donald_trump_speaks_to_reporters_before_marine_one_departure.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-13_no_transcript_-_peter_nicholas_of_nbc_news_interviews_donald_trump.txt


Processing transcripts:  86%|████████▌ | 24/28 [00:28<00:02,  1.53it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-15_donald_trump_signs_a_memorandum_to_deploy_troops_in_memphis_tennessee.txt


Processing transcripts:  96%|█████████▋| 27/28 [00:28<00:00,  3.07it/s]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-13_no_transcript_-_brooke_singman_of_fox_news_interviews_donald_trump.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/press gaggle/2025-09-14_donald_trump_speaks_to_reporters_before_air_force_one_departure.txt

Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/interview/2025-09-13_no_transcript_-_rob_schmitt_of_newsmax_interviews_donald_trump.txt


Processing transcripts: 100%|██████████| 28/28 [00:30<00:00,  1.08s/it]


Skipping existing transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/remarks/2025-09-15_jd_vance_hosts_the_charlie_kirk_podcast_with_administration_officials.txt

Finished processing. Collected 9 speeches.





In [10]:
# Run the project's transcript processing over the data directory. The recorded
# output shows it visiting files under raw-transcripts/ and reporting each as a
# "new transcript"; output location and speaker filtering use the function's
# defaults here (not visible in this notebook).
# NOTE(review): data_prefix repeats the absolute project path used in the first cell.
process_new_transcripts(data_prefix="/Users/jstenger/Documents/repos/kalshi-research/data")


Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/court transcript/2024-04-26_donald_trump_new_york_business_records_hush_money_trial.txt
Is press briefing: False
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'court transcript', '2024-04-26_donald_trump_new_york_business_records_hush_money_trial.txt']
Processed new transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/court transcript/2024-04-26_donald_trump_new_york_business_records_hush_money_trial.txt

Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/court transcript/2024-05-20_donald_trump_new_york_business_records_hush_money_trial.txt
Is press briefing: False
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'court transcript', '2024-05-20_donald_trump_new_york_business_records_hush_money_trial.txt']
Processed new tra

In [11]:
import glob
import os


def delete_empty_files(paths):
    """Delete every file in ``paths`` that is empty or contains only whitespace.

    Args:
        paths: Iterable of file paths to inspect; each is read as UTF-8 text.

    Returns:
        List of the paths that were deleted, in iteration order.

    Raises:
        OSError: If a file cannot be opened or removed.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    deleted = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            is_blank = not f.read().strip()  # empty or only whitespace
        # Remove only after the handle is closed: removing a file that is still
        # open raises an error on Windows.
        if is_blank:
            os.remove(path)
            print(f"Deleted empty file: {path}")
            deleted.append(path)
    return deleted


# Get all transcript files
transcript_files = glob.glob("/Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/*/*.txt")

# Check each file and delete if empty (assigned so the list is not echoed as cell output)
deleted_files = delete_empty_files(transcript_files)


Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/court transcript/2024-05-20_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/court transcript/2024-05-07_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/court transcript/2024-05-10_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/court transcript/2024-05-09_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-transcripts/court transcript/2024-05-13_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/processed-tr

### Vance

In [3]:
# Speaker-label patterns for J.D. Vance. They contain `\s+`, so they are written
# as regular expressions (assumes process_new_transcripts applies them with `re`
# -- confirm). The dots in "J.D." are escaped so they match literal periods;
# unescaped, each `.` would match any character.
vance_patterns = [r'J\.D\.\s+Vance\s*:\s*', r'JD\s+Vance\s*:\s*']
process_new_transcripts(
    data_prefix="/Users/jstenger/Documents/repos/kalshi-research/data",
    output_dir="vance-processed-transcripts",
    speaker_patterns=vance_patterns,
)


Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/prepared remarks/2017-04-24_donald_trump_records_an_address_on_holocaust_remembrance_day.txt
Is press briefing: False
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'prepared remarks', '2017-04-24_donald_trump_records_an_address_on_holocaust_remembrance_day.txt']
Processed new transcript: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/prepared remarks/2017-04-24_donald_trump_records_an_address_on_holocaust_remembrance_day.txt

Processing file: /Users/jstenger/Documents/repos/kalshi-research/data/raw-transcripts/prepared remarks/2016-08-31_donald_trump_on_immigration_in_phoenix_arizona.txt
Is press briefing: False
Path parts: ['', 'users', 'jstenger', 'documents', 'repos', 'kalshi-research', 'data', 'raw-transcripts', 'prepared remarks', '2016-08-31_donald_trump_on_immigration_in_phoenix_arizona.txt']
Processed new transc

In [4]:
import glob
import os

# Get all Vance transcript files
vance_transcript_files = glob.glob("/Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/*/*.txt")

# Check each file and delete if empty
for file in vance_transcript_files:
    with open(file, 'r', encoding='utf-8') as f:
        is_blank = not f.read().strip()  # empty or only whitespace
    # Delete after the with-block has closed the handle: removing a file that is
    # still open raises an error on Windows.
    if is_blank:
        os.remove(file)
        print(f"Deleted empty file: {file}")


Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/prepared remarks/2017-04-24_donald_trump_records_an_address_on_holocaust_remembrance_day.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/prepared remarks/2016-08-31_donald_trump_on_immigration_in_phoenix_arizona.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/court transcript/2024-04-26_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/court transcript/2024-05-20_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-research/data/vance-processed-transcripts/court transcript/2024-05-07_donald_trump_new_york_business_records_hush_money_trial.txt
Deleted empty file: /Users/jstenger/Documents/repos/kalshi-