# 02 - NLP Processing Pipeline

This notebook processes budget speech PDFs through the NLP pipeline:
1. Extract text from PDFs
2. Tokenize into sentences
3. Classify sectors (soft probabilities)
4. Analyze sentiment
5. Score certainty and actionability
6. Save the processed sentences (Parquet and CSV)
7. Plot summary visualizations

## Outputs
- Processed sentences with sector probabilities
- Sentiment scores
- Importance weights

In [None]:
# Environment setup: make the project importable and load the analysis stack
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the
# project (e.g. notebooks/) — confirm for your launch setup.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK resources requested by this notebook, without progress output
import nltk
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

print(f"Project root: {project_root}")

In [None]:
# Read the sector definitions and budget event calendar from <project_root>/config
import yaml

config_dir = project_root / 'config'

# Each file is read as UTF-8 text and parsed with the safe YAML loader
sectors_config = yaml.safe_load(
    (config_dir / 'sectors.yaml').read_text(encoding='utf-8')
)
event_dates = yaml.safe_load(
    (config_dir / 'event_dates.yaml').read_text(encoding='utf-8')
)

print(f"Loaded {len(sectors_config['sectors'])} sector definitions")
print(f"Loaded {len(event_dates['budget_events'])} budget events")

## 1. Load and Extract Text from Budget Speech

In [None]:
from src.nlp import extract_text_with_fallback, clean_text

# Collect every PDF located directly in the project root (non-recursive glob).
# Only the printed listing is sorted; `speech_files` keeps the glob order.
speech_files = [pdf_path for pdf_path in project_root.glob('*.pdf')]
print(f"Found {len(speech_files)} budget speech PDFs:")
for pdf_path in sorted(speech_files):
    print(f"  - {pdf_path.name}")

In [None]:
# Process the most recent budget (2024-25)

# Fail early with a readable message instead of an IndexError further down
if not speech_files:
    raise FileNotFoundError(f"No budget speech PDFs found in {project_root}")

# glob() yields files in filesystem order, which is not guaranteed to be stable.
# Sort by file name so both the match and the fallback are reproducible.
candidates = sorted(speech_files, key=lambda p: p.name)

# Prefer the first file whose name mentions 2024 (or 202425 once underscores
# are removed); otherwise fall back to the last name in sorted order.
# NOTE(review): the fallback is "latest" only if file names sort
# chronologically (e.g. they embed the year) — confirm naming convention.
target_speech = next(
    (p for p in candidates
     if '2024' in p.name or '202425' in p.name.replace('_', '')),
    candidates[-1],
)

print(f"Processing: {target_speech.name}")

# Extract text
raw_text = extract_text_with_fallback(str(target_speech))
print(f"\nExtracted {len(raw_text)} characters")
print("\nFirst 1000 characters:")
print(raw_text[:1000])

In [None]:
# Run the project's text cleaner over the raw extraction output
cleaned_text = clean_text(raw_text)

# Report the cleaned length and show a short preview
n_clean_chars = len(cleaned_text)
preview = cleaned_text[:1500]
print(f"Cleaned text: {n_clean_chars} characters")
print("\nSample (first 1500 chars):")
print(preview)

## 2. Tokenize into Sentences

In [None]:
from src.nlp import tokenize_speech, estimate_timestamps, validate_sentences
from datetime import datetime
from src.utils.time_utils import IST

# Split the cleaned speech into sentence records (each exposes a 'text' entry)
sentences = tokenize_speech(cleaned_text)
print(f"Tokenized into {len(sentences)} sentences")

# Preview the first five records, truncating each text to 100 characters
print("\nSample sentences:")
for idx, record in enumerate(sentences[:5]):
    snippet = record['text'][:100]
    print(f"  [{idx}] {snippet}...")

In [None]:
# Look up this fiscal year's budget event in the loaded config
fiscal_year = '2024-25'
budget_info = event_dates['budget_events'].get(fiscal_year, {})

print(f"Budget {fiscal_year}:")
print(f"  Date: {budget_info.get('date')}")
print(f"  Speech: {budget_info.get('speech_start')} - {budget_info.get('speech_end')}")

# Fall back to hard-coded defaults when the config entry omits a field
speech_date = budget_info.get('date', '2024-07-23')
speech_start_time = budget_info.get('speech_start', '11:00')

# Parse "<date> <HH:MM>" into a naive datetime, then attach the IST timezone.
# NOTE(review): .localize() is the pytz-style API — presumably IST is a pytz
# timezone object; verify in src.utils.time_utils.
start_stamp = f"{speech_date} {speech_start_time}"
naive_start = datetime.strptime(start_stamp, "%Y-%m-%d %H:%M")
speech_start = IST.localize(naive_start)
print(f"  Speech start: {speech_start}")

In [None]:
# Assign an estimated timestamp to every sentence and tabulate the result
from src.ingestion import get_speech_duration_minutes

speech_duration = get_speech_duration_minutes(fiscal_year)
print(f"Speech duration: {speech_duration} minutes")

# Add timestamps first, then pass the timestamped records through validation
sentences = validate_sentences(
    estimate_timestamps(sentences, speech_start, speech_duration)
)

# One DataFrame row per sentence record
sentences_df = pd.DataFrame(sentences)
frame_shape = sentences_df.shape
print(f"\nSentences DataFrame shape: {frame_shape}")
sentences_df.head()

## 3. Classify Sectors

In [None]:
from src.nlp import classify_sectors_batch

# Map each configured sector key to its keyword list (empty list when absent)
sector_keywords = {
    sector_key: sector_info.get('keywords', [])
    for sector_key, sector_info in sectors_config['sectors'].items()
}

print(f"Loaded keywords for {len(sector_keywords)} sectors")

# Run the batch classifier over every sentence text
all_texts = sentences_df['text'].tolist()
prob_cols = classify_sectors_batch(all_texts, sector_keywords)

# Copy each returned column into the frame under the name the classifier gave it
for col_name in prob_cols:
    sentences_df[col_name] = prob_cols[col_name]

print(f"Added {len(prob_cols)} sector probability columns")

In [None]:
# Summary statistics for every column whose name starts with 'prob_'
prob_columns = [name for name in sentences_df.columns if name.startswith('prob_')]
print("Sector probability statistics:")
sentences_df.loc[:, prob_columns].describe()

In [None]:
# For the first five configured sectors, list the three highest-probability sentences
for sector in list(sector_keywords)[:5]:
    col = f'prob_{sector}'
    if col not in sentences_df.columns:
        # No probability column was produced for this sector; nothing to show
        continue

    top_sentences = sentences_df.nlargest(3, col)[['position', col, 'text']]
    print(f"\nTop sentences for {sector}:")
    for score, text in zip(top_sentences[col], top_sentences['text']):
        print(f"  [{score:.2f}] {text[:80]}...")

## 4. Sentiment Analysis

In [None]:
from src.nlp import analyze_sentiment_batch, compute_fiscal_intensity

# One sentiment result per sentence text
sentiments = analyze_sentiment_batch(sentences_df['text'].tolist())

# Spread each score into its own 'sentiment_<key>' column; a key missing from
# a result is recorded as 0.
# NOTE(review): these key names are assumed to match what
# analyze_sentiment_batch returns — a mismatch would silently yield all zeros.
sentiment_keys = ('compound', 'positive', 'negative', 'neutral')
for key in sentiment_keys:
    column = f'sentiment_{key}'
    sentences_df[column] = [result.get(key, 0) for result in sentiments]

print("Sentiment statistics:")
summary_columns = ['sentiment_compound', 'sentiment_positive', 'sentiment_negative']
sentences_df[summary_columns].describe()

In [None]:
# Fiscal intensity (monetary figures, percentages)
fiscal_scores = compute_fiscal_intensity(sentences_df['text'].tolist())

# Keep only the 'fiscal_intensity' entry of each result, defaulting to 0
sentences_df['fiscal_intensity'] = [
    score.get('fiscal_intensity', 0) for score in fiscal_scores
]

mean_intensity = sentences_df['fiscal_intensity'].mean()
print(f"Fiscal intensity: mean={mean_intensity:.3f}")

# List the five sentences with the highest fiscal intensity
print("\nHigh fiscal intensity sentences:")
top_fiscal = sentences_df.nlargest(5, 'fiscal_intensity')
for intensity, text in zip(top_fiscal['fiscal_intensity'], top_fiscal['text']):
    print(f"  [{intensity:.2f}] {text[:100]}...")

In [None]:
# Two stacked panels sharing the x-axis: sentiment (top) and fiscal intensity (bottom)
fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)
ax1, ax2 = axes

positions = sentences_df['position']
compound = sentences_df['sentiment_compound']

# Top panel: raw compound sentiment with a dashed zero reference line
ax1.plot(positions, compound, alpha=0.7)
ax1.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax1.set(ylabel='Sentiment (Compound)', title='Sentiment Over Speech')

# Overlay a centered 10-sentence moving average (NaN at the edges of the window)
rolling_sentiment = compound.rolling(window=10, center=True).mean()
ax1.plot(positions, rolling_sentiment, color='red', linewidth=2, label='10-sentence MA')
ax1.legend()

# Bottom panel: fiscal intensity per sentence
ax2.bar(positions, sentences_df['fiscal_intensity'], alpha=0.7, color='green')
ax2.set(
    xlabel='Sentence Position',
    ylabel='Fiscal Intensity',
    title='Fiscal Intensity Over Speech',
)

plt.tight_layout()
plt.show()

## 5. Certainty and Actionability Scoring

In [None]:
from src.nlp import score_certainty_batch, score_actionability_batch

# Apply each batch scorer to the sentence texts and store the result under its
# column name. A fresh text list is built for every scorer call.
scorers = (
    ('certainty_score', score_certainty_batch),
    ('actionability_score', score_actionability_batch),
)
for column, scorer in scorers:
    sentences_df[column] = scorer(sentences_df['text'].tolist())

print("Certainty and Actionability:")
print(sentences_df[['certainty_score', 'actionability_score']].describe())

In [None]:
# Importance weight = 0.3 * certainty + 0.4 * actionability + 0.3 * fiscal intensity
certainty = sentences_df['certainty_score']
actionability = sentences_df['actionability_score']
fiscal = sentences_df['fiscal_intensity']
sentences_df['importance_weight'] = 0.3 * certainty + 0.4 * actionability + 0.3 * fiscal

# Show the five highest-weighted sentences with their component scores
print("\nMost important sentences:")
top_important = sentences_df.nlargest(5, 'importance_weight')
for weight, cert, act, text in zip(
    top_important['importance_weight'],
    top_important['certainty_score'],
    top_important['actionability_score'],
    top_important['text'],
):
    print(f"\n[Weight: {weight:.2f}]")
    print(f"  Certainty: {cert:.2f}, Actionability: {act:.2f}")
    print(f"  Text: {text[:150]}...")

## 6. Save Processed Data

In [None]:
# Add metadata
sentences_df['fiscal_year'] = fiscal_year
# Store the date that was actually used to build the sentence timestamps.
# `speech_date` equals the configured date when the config has one, and the
# fallback date otherwise. Reading budget_info.get('date') here instead would
# write None whenever the config lacks this fiscal year.
sentences_df['budget_date'] = speech_date

# Save to intermediate directory (created on demand)
output_dir = project_root / 'data' / 'intermediate' / 'speech_text'
output_dir.mkdir(parents=True, exist_ok=True)

# File name uses the fiscal year with '-' replaced by '_', e.g. 2024_25_sentences.parquet
output_path = output_dir / f'{fiscal_year.replace("-", "_")}_sentences.parquet'
sentences_df.to_parquet(output_path)

print(f"Saved processed sentences to {output_path}")
print(f"  Shape: {sentences_df.shape}")
print(f"  Columns: {list(sentences_df.columns)}")

In [None]:
# Write a CSV copy into the same output directory for easy viewing
csv_name = fiscal_year.replace("-", "_") + '_sentences.csv'
csv_path = output_dir / csv_name
sentences_df.to_csv(csv_path, index=False)
print(f"Also saved as CSV: {csv_path}")

## 7. Summary Visualizations

In [None]:
# Sector attention distribution
prob_columns = [name for name in sentences_df.columns if name.startswith('prob_')]

# Total attention per sector: sum over sentences of probability * importance weight.
# The sector label is the column name with 'prob_' removed.
importance = sentences_df['importance_weight']
sector_attention = {
    col.replace('prob_', ''): (sentences_df[col] * importance).sum()
    for col in prob_columns
}

# One row per sector, sorted ascending so the largest bar is drawn last by barh
attention_records = [
    {'sector': sector_name, 'attention': total}
    for sector_name, total in sector_attention.items()
]
attention_df = pd.DataFrame(attention_records).sort_values('attention', ascending=True)

# Horizontal bar chart of weighted attention
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(attention_df['sector'], attention_df['attention'], color='steelblue')
ax.set(
    xlabel='Weighted Attention Score',
    title=f'Sector Attention in Budget {fiscal_year}',
)
plt.tight_layout()
plt.show()

In [None]:
# Final run summary: what was processed and where the output was written
banner = "=" * 60
print(banner)
print("NLP PROCESSING COMPLETE")
print(banner)
print(f"\nProcessed Budget: {fiscal_year}")
print(f"Total sentences: {len(sentences_df)}")
print(f"Sectors classified: {len(prob_columns)}")

# The last five rows of attention_df are printed in their stored order
print("\nTop 5 Sectors by Attention:")
top_five = attention_df.tail(5)
for sector_name, score in zip(top_five['sector'], top_five['attention']):
    print(f"  {sector_name}: {score:.2f}")
print(f"\nOutput saved to: {output_path}")