# Trump Speech Analysis for 2025 SOTU Predictions

This notebook analyzes the frequency of specific phrases in Trump's speeches, with a focus on:
1. Previous State of the Union addresses
2. Recent speeches (since the January 20, 2025 inauguration)
3. Overall usage patterns across all speeches

In [None]:
import os
import re
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Plot styling: reset to matplotlib's defaults first, then apply seaborn's whitegrid theme
plt.style.use('default')
sns.set_theme(style='whitegrid')
# Default canvas size for any figure that does not pass its own figsize
plt.rcParams.update({'figure.figsize': [12, 6]})

In [None]:
# Phrases to search for: display name -> regular expression.
# Patterns are written in lowercase because count_phrases lowercases the
# transcript before matching. Word boundaries (\b) keep short tokens such as
# 'ai' or 'la' from matching inside longer words.
# Note: the broader patterns overlap the narrower ones, e.g. every
# "america first" also counts towards 'America'.
SEARCH_PHRASES = {
    'Immigration': r'\b(illegal\s+immigra(nt|tion|nts)|immigra(nt|tion|nts))\b',
    'America First': r'\bamerica\s+first\b',
    'America': r'\bamerica\b',
    'Border': r'\bborder(s)?\b',
    'DOGE': r'\b(doge|department\s+of\s+government\s+efficiency)\b',
    'AI': r'\b(ai|artificial\s+intelligence)\b',
    'Ceasefire': r'\bceasefire(s)?\b',
    'Middle Class': r'\bmiddle\s+class\b',
    'God': r'\bgod\b',
    'Elon': r'\b(elon|elon\s+musk)\b',
    'Drill Baby Drill': r'\bdrill\s+baby\s+drill\b',
    'Biden': r'\bbiden\b',
    'Make America Healthy Again': r'\bmake\s+america\s+healthy\s+again\b',
    'LA': r'\b(la|los\s+angeles)\b',
    'January 6': r'\b(january\s+6(th)?|6th\s+of\s+january)\b',
    'TikTok': r'\btiktok\b',
    'Crypto': r'\b(crypto|bitcoin)\b',
    'MAGA': r'\b(maga|make\s+america\s+great\s+again)\b',
    'Trans': r'\btrans\b',
    'Kamala': r'\bkamala\b',
    'Rigged': r'\b(rig(ged)?|rigging)\b',
    'Mandate': r'\bmandate(s|d)?\b',
    'Carnage': r'\bcarnage\b'
}  # was closed with ']', which is a SyntaxError for a dict literal

# Inauguration date: transcripts dated on or after this day count as "post-inauguration"
INAUGURATION_DATE = datetime(2025, 1, 20)

In [None]:
def count_phrases(text, phrases=None):
    """Count case-insensitive regex matches of each phrase pattern in ``text``.

    Args:
        text: Transcript text to search.
        phrases: Mapping of display name -> regex pattern written for
            lowercase input. When omitted, the module-level ``SEARCH_PHRASES``
            is used; it is looked up at call time so that re-running the cell
            that defines it takes effect without re-defining this function.

    Returns:
        dict mapping each phrase name to its number of non-overlapping matches.
    """
    if phrases is None:
        phrases = SEARCH_PHRASES
    # Lowercase once up front rather than once per pattern
    lowered = text.lower()
    return {
        name: len(re.findall(pattern, lowered))
        for name, pattern in phrases.items()
    }

def get_date_from_filename(filename):
    """Parse the leading ``YYYY-MM-DD`` stamp of a transcript filename.

    Everything before the first underscore is treated as the date. A
    ``ValueError`` from ``strptime`` propagates when that prefix is not a
    valid ``%Y-%m-%d`` date.
    """
    date_prefix, _, _ = filename.partition('_')
    return datetime.strptime(date_prefix, '%Y-%m-%d')

def read_transcript(filepath):
    """Load the file at ``filepath`` and return its entire contents decoded as UTF-8."""
    with open(filepath, encoding='utf-8') as transcript_file:
        contents = transcript_file.read()
    return contents

def process_directory(directory):
    """Walk ``directory`` recursively and count phrases in every ``.txt`` transcript.

    Filenames are expected to start with ``YYYY-MM-DD_``. Any file that fails
    to parse or read is reported on stdout and skipped, so one bad transcript
    does not abort the whole run.

    Args:
        directory: Root folder to search (sub-folders included).

    Returns:
        pandas.DataFrame with one row per successfully processed transcript.
        Columns are ``date``, ``file``, ``category`` (name of the folder that
        contains the file), ``text_length`` (whitespace-separated word count),
        followed by one count column per search phrase. The columns are present
        even when no transcript was processed, so callers can still sort and
        filter on them.
    """
    # Derive the phrase columns from count_phrases itself so that an empty
    # result has exactly the same schema as a populated one.
    columns = ['date', 'file', 'category', 'text_length', *count_phrases('')]
    results = []

    for root, _, files in os.walk(directory):
        category = os.path.basename(root)
        for file in files:
            if not file.endswith('.txt'):
                continue
            filepath = os.path.join(root, file)
            try:
                # Parse the date first: a badly named file is rejected before it is read
                date = get_date_from_filename(file)
                text = read_transcript(filepath)
                counts = count_phrases(text)
            except Exception as e:
                # Deliberate best-effort: report and move on to the next file
                print(f"Error processing {filepath}: {str(e)}")
                continue

            results.append({
                'date': date,
                'file': file,
                'category': category,
                'text_length': len(text.split()),
                **counts
            })

    if not results:
        print(f"No transcripts were processed under {directory}")

    return pd.DataFrame(results, columns=columns)

In [None]:
# Load every transcript under the data folder and order the rows chronologically
df = process_directory('../data/processed-transcripts').sort_values('date')

# Two sub-corpora: transcripts dated on/after inauguration day, and those
# whose containing folder is named 'sotu'
df_post_inaug = df.loc[df['date'] >= INAUGURATION_DATE]
df_sotu = df.loc[df['category'] == 'sotu']

# Quick size check of each slice
print(f"Total transcripts: {len(df)}")
print(f"Post-inauguration transcripts: {len(df_post_inaug)}")
print(f"SOTU addresses: {len(df_sotu)}")

In [None]:
# Calculate frequency per 1000 words for each period
def calculate_frequencies(df, phrases=None):
    """Aggregate phrase counts over ``df`` and normalise them per 1000 words.

    Args:
        df: DataFrame with a ``text_length`` column (words per transcript) and
            one count column per phrase.
        phrases: Iterable of phrase column names to aggregate. When omitted,
            the keys of the module-level ``SEARCH_PHRASES`` are used.

    Returns:
        pandas.DataFrame indexed by phrase with columns ``total_occurrences``
        and ``frequency_per_1000``. When ``df`` contains no words at all
        (e.g. an empty period), the frequency is NaN rather than the result of
        a 0/0 division.
    """
    if phrases is None:
        phrases = SEARCH_PHRASES

    total_words = df['text_length'].sum()
    frequencies = {}

    for phrase in phrases:
        total_occurrences = df[phrase].sum()
        if total_words:
            frequency = (total_occurrences / total_words) * 1000
        else:
            # No words in this slice: a rate is undefined, so avoid dividing by zero
            frequency = float('nan')
        frequencies[phrase] = {
            'total_occurrences': total_occurrences,
            'frequency_per_1000': frequency
        }

    # Outer keys become columns; transpose so each phrase is a row
    return pd.DataFrame(frequencies).T

# Render floats in displayed tables with two decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# One frequency table per period: whole corpus, post-inauguration, SOTU only
freq_all, freq_post_inaug, freq_sotu = (
    calculate_frequencies(period_df) for period_df in (df, df_post_inaug, df_sotu)
)

# Line the three periods up side by side (rows align on the phrase index),
# most-used phrases overall first
comparison_table = pd.DataFrame({
    'All Time Occurrences': freq_all['total_occurrences'],
    'All Time Freq (per 1000 words)': freq_all['frequency_per_1000'],
    'Post-Inaug Occurrences': freq_post_inaug['total_occurrences'],
    'Post-Inaug Freq (per 1000 words)': freq_post_inaug['frequency_per_1000'],
    'SOTU Occurrences': freq_sotu['total_occurrences'],
    'SOTU Freq (per 1000 words)': freq_sotu['frequency_per_1000']
}).sort_values('All Time Occurrences', ascending=False)

# Last expression of the cell: shown as the cell output
comparison_table

## Trends Over Time

Let's visualize how the usage of the most frequent terms has changed over time:

In [None]:
# Get top 10 most frequent terms
top_terms = comparison_table.nlargest(10, 'All Time Occurrences').index

# Resample once for all plotted terms instead of rebuilding the date index
# and the daily resample inside the loop. Days without a speech are NaN and
# are skipped by the rolling mean (min_periods=1).
daily_mean = df.set_index('date')[list(top_terms)].resample('D').mean()
rolling_mean = daily_mean.rolling(window=30, min_periods=1).mean()

# Create a rolling average plot for each term
plt.figure(figsize=(15, 8))

for term in top_terms:
    plt.plot(rolling_mean.index, rolling_mean[term].values, label=term, alpha=0.7)

plt.title('30-Day Rolling Average Usage of Top Terms')
plt.xlabel('Date')
plt.ylabel('Average Occurrences per Speech')
# Legend sits outside the axes so it does not cover the lines
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.grid(True, alpha=0.3)
plt.show()

## SOTU-Specific Analysis

Let's look at how these terms have been used in previous State of the Union addresses:

In [None]:
# Create a heatmap of term usage in SOTU addresses.
# The frame is already "wide" (one row per address, one column per term), so
# no pivot is needed: index by date and keep every term column. Selecting by
# name also avoids the positional slice that silently dropped the last term.
term_columns = list(SEARCH_PHRASES)
sotu_pivot = df_sotu.set_index('date')[term_columns]

if sotu_pivot.empty:
    # sns.heatmap cannot draw an empty matrix
    print("No SOTU transcripts found (category == 'sotu'); skipping heatmap.")
else:
    # Show plain calendar dates rather than full timestamps on the y-axis
    sotu_pivot.index = sotu_pivot.index.strftime('%Y-%m-%d')

    plt.figure(figsize=(15, 10))
    sns.heatmap(sotu_pivot, cmap='YlOrRd', annot=True, fmt='g')
    plt.title('Term Usage in State of the Union Addresses')
    plt.xlabel('Terms')
    plt.ylabel('Date')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Recent Trends (Post-Inauguration)

Let's analyze how term usage has changed since the 2025 inauguration:

In [None]:
# Calculate weekly averages for post-inauguration period.
# numeric_only=True: the frame also carries string columns ('file',
# 'category'), which pandas >= 2.0 refuses to average instead of silently
# dropping them.
weekly_avg = df_post_inaug.set_index('date').resample('W').mean(numeric_only=True)

# Plot weekly trends for top terms
plt.figure(figsize=(15, 8))

for term in top_terms[:5]:  # Top 5 terms for clarity
    plt.plot(weekly_avg.index, weekly_avg[term], label=term, marker='o', alpha=0.7)

plt.title('Weekly Average Usage of Top Terms (Post-Inauguration)')
plt.xlabel('Date')
plt.ylabel('Average Occurrences per Speech')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()