# 03 - Event Study Analysis

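This notebook performs the core event study analysis:
1. Load processed speech data
2. Detect first sector mentions (and visualize cumulative attention)
3. Load budget-day market data and build equal-weighted sector portfolios
4. Calculate abnormal returns and Cumulative Abnormal Returns (CAR), and visualize them
5. Run the event study around each first mention
6. Run statistical significance tests
7. Summarize and save results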

In [None]:
# Setup
# Resolves the project root, makes it importable, and loads the plotting /
# data libraries used throughout the notebook.

import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a directory that
# sits directly under the project root — confirm before running elsewhere.
project_root = Path.cwd().parent
# Put the project root first on sys.path so the `from src...` imports in later
# cells resolve against this checkout.
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Apply one plot style to every figure in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
print(f"Project root: {project_root}")

AttributeError: partially initialized module 'pandas' has no attribute '_pandas_datetime_CAPI' (most likely due to a circular import)

: 

## 1. Load Processed Speech Data

In [None]:
# Load processed sentences: one parquet file per fiscal year is read from
# data/intermediate/speech_text into `sentences_by_year` (fiscal year -> DataFrame).
speech_dir = project_root / 'data' / 'intermediate' / 'speech_text'

# Bind the mapping unconditionally. The next cell tests
# `if not sentences_by_year:` to decide whether to run the demo fallback and
# would raise NameError if the directory were missing and the name unbound.
sentences_by_year = {}

if speech_dir.exists():
    # glob() yields files in arbitrary order; sort so the fiscal years load in
    # a stable order (later cells take the first key of dicts built from this one).
    speech_files = sorted(speech_dir.glob('*.parquet'))
    print(f"Found {len(speech_files)} processed speech files")

    for f in speech_files:
        # A stem such as '2024_25_sentences' becomes the key '2024-25'.
        fy = f.stem.replace('_sentences', '').replace('_', '-')
        sentences_by_year[fy] = pd.read_parquet(f)
        print(f"  {fy}: {len(sentences_by_year[fy])} sentences")
else:
    print("No processed speech data found. Run 02_nlp_processing.ipynb first.")
    # Demo mode - the next cell processes one speech when the mapping is empty
    print("\nRunning demo processing...")

In [None]:
# Demo fallback: when no processed sentences were loaded, extract, tokenize and
# sector-classify a single 2024 speech PDF found in the project root.

# Tolerate the loading cell never having bound the name (it only does so when
# the processed-data directory exists); treat that the same as "nothing loaded".
sentences_by_year = globals().get('sentences_by_year', {})

if not sentences_by_year:
    from src.nlp import extract_text_with_fallback, clean_text, tokenize_speech, estimate_timestamps
    from src.nlp import classify_sectors_batch, analyze_sentiment_batch, score_certainty_batch
    import yaml

    # Load config
    with open(project_root / 'config' / 'sectors.yaml', 'r', encoding='utf-8') as f:
        sectors_config = yaml.safe_load(f)

    # Find a 2024 speech PDF. Sorting makes the choice deterministic (glob order
    # is arbitrary), and the emptiness check replaces a bare IndexError with a
    # readable message when no PDF is present.
    speech_candidates = sorted(project_root.glob('*2024*.pdf'))

    if not speech_candidates:
        print(f"No speech PDF matching '*2024*.pdf' found in {project_root}; skipping demo processing.")
    else:
        speech_file = speech_candidates[0]
        print(f"Processing {speech_file.name}...")

        # Quick processing: PDF -> cleaned text -> sentence records -> DataFrame
        text = clean_text(extract_text_with_fallback(str(speech_file)))
        sentences = tokenize_speech(text)
        sentences_df = pd.DataFrame(sentences)

        # Classify sectors: the classifier's result is iterated as
        # column name -> per-sentence values, and each entry becomes a column.
        sector_keywords = {k: v.get('keywords', []) for k, v in sectors_config['sectors'].items()}
        prob_cols = classify_sectors_batch(sentences_df['text'].tolist(), sector_keywords)
        for col, probs in prob_cols.items():
            sentences_df[col] = probs

        # Demo data carries no importance weighting; every sentence gets weight 1.
        sentences_df['importance_weight'] = 1.0
        sentences_by_year = {'2024-25': sentences_df}
        print(f"Processed {len(sentences_df)} sentences")

## 2. Detect First Sector Mentions

In [None]:
from src.events import detect_all_sector_mentions, calculate_cumulative_attention

# First-mention detection, one fiscal year at a time.
# Results are kept in `mentions_by_year` (fiscal year -> mentions DataFrame).
mentions_by_year = {}
summary_cols = ['sector', 'sentence_position', 'cumulative_attention']

for fy, sentences_df in sentences_by_year.items():
    mentions_by_year[fy] = detect_all_sector_mentions(sentences_df, threshold=0.3)
    mentions = mentions_by_year[fy]

    print(f"\n{fy}: Detected {len(mentions)} sector first mentions")
    if mentions.empty:
        # Nothing to tabulate for this year
        continue
    print(mentions[summary_cols].to_string())

In [None]:
# Cumulative-attention curves for a single fiscal year
if mentions_by_year:
    # Use whichever fiscal year comes first in the mapping
    fy = next(iter(mentions_by_year))
    mentions = mentions_by_year[fy]
    sentences_df = sentences_by_year[fy]

    if not mentions.empty:
        fig, ax = plt.subplots(figsize=(14, 6))

        # Sector probability columns are recognised by their 'prob_' prefix
        prob_cols = [name for name in sentences_df.columns if name.startswith('prob_')]

        # Only the first eight probability columns are drawn (column order, not ranked)
        for col in prob_cols[:8]:
            sector = col.replace('prob_', '')
            attention_curve = calculate_cumulative_attention(sentences_df, sector)
            ax.plot(sentences_df['position'], attention_curve, label=sector, alpha=0.7)

        # Reference line at 0.3, the same value passed as the detection threshold
        ax.axhline(y=0.3, color='red', linestyle='--', alpha=0.5, label='Mention Threshold')
        ax.set(
            xlabel='Sentence Position',
            ylabel='Cumulative Attention',
            title=f'Cumulative Sector Attention - Budget {fy}',
        )
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

## 3. Load Market Data for Budget Day

In [None]:
from datetime import date

from src.ingestion import load_single_stock, get_budget_dates
from src.market import clean_stock_data
import yaml

# Load the sector definitions and the budget event calendar.
with open(project_root / 'config' / 'sectors.yaml', 'r', encoding='utf-8') as f:
    sectors_config = yaml.safe_load(f)

with open(project_root / 'config' / 'event_dates.yaml', 'r', encoding='utf-8') as f:
    event_dates = yaml.safe_load(f)

# Get budget date for 2024-25.
# `or {}` also covers a fiscal-year key that is present but null in the YAML.
budget_info = event_dates['budget_events'].get('2024-25') or {}
raw_budget_date = budget_info.get('date', '2024-07-23')

# yaml.safe_load converts unquoted ISO dates/timestamps into date/datetime
# objects, which datetime.strptime rejects with TypeError. Accept all three
# forms; strings must still be in YYYY-MM-DD format.
if isinstance(raw_budget_date, datetime):
    # datetime is a subclass of date, so it has to be checked first
    budget_date = raw_budget_date.date()
elif isinstance(raw_budget_date, date):
    budget_date = raw_budget_date
else:
    budget_date = datetime.strptime(str(raw_budget_date), '%Y-%m-%d').date()

print(f"Loading market data for: {budget_date}")

In [None]:
# Build an equal-weighted budget-day return series for a handful of sectors.
# `sector_data` maps sector key -> DataFrame of per-stock returns plus a
# 'portfolio_return' column.
sample_sectors = ['banking_nbfc', 'infrastructure', 'it_services', 'pharma', 'energy_power']

sector_data = {}

for sector_key in sample_sectors:
    sector_info = sectors_config['sectors'].get(sector_key, {})
    stocks = sector_info.get('stocks', [])[:10]  # cap at the first ten symbols

    stock_returns = []

    for symbol in stocks:
        df = load_single_stock(symbol)
        if df.empty:
            continue

        # Keep only rows whose index date equals the budget day
        df = df[df.index.date == budget_date]
        if len(df) == 0:
            continue

        df = clean_stock_data(df)
        if 'return' not in df.columns:
            continue
        stock_returns.append(df['return'].rename(symbol))

    if not stock_returns:
        # No usable stock for this sector; leave it out of sector_data
        continue

    # One column per stock; the portfolio return is the row-wise mean,
    # computed before the extra column is added.
    sector_df = pd.concat(stock_returns, axis=1)
    sector_df['portfolio_return'] = sector_df.mean(axis=1)
    sector_data[sector_key] = sector_df
    print(f"{sector_key}: {len(stock_returns)} stocks, {len(sector_df)} bars")

In [None]:
# Combine sector returns: one column per sector, holding that sector's
# 'portfolio_return' series.
if sector_data:
    sector_returns = pd.DataFrame({
        sector: data['portfolio_return'] for sector, data in sector_data.items()
    })

    print(f"Sector returns shape: {sector_returns.shape}")
    print(f"Time range: {sector_returns.index.min()} to {sector_returns.index.max()}")
    # A bare expression inside an `if` block is not echoed by Jupyter (only the
    # cell's last top-level expression is), so render the preview explicitly.
    # `display` is provided by the IPython kernel.
    display(sector_returns.head())
else:
    # Rebind to an empty frame so a stale `sector_returns` from an earlier run
    # cannot leak into the guarded cells below.
    sector_returns = pd.DataFrame()
    print("No sector data loaded; sector_returns is empty.")

## 4. Calculate Abnormal Returns

In [None]:
from src.models import calculate_abnormal_returns

# Market-adjusted abnormal returns, benchmarked against the cross-sector mean.
# Skipped entirely when no sector returns were assembled.
if 'sector_returns' in dir() and not sector_returns.empty:
    # The "market" proxy is the row-wise average of the sector portfolios
    market_return = sector_returns.mean(axis=1)

    abnormal_returns = calculate_abnormal_returns(
        sector_returns,
        market_return,
        method='market_adjusted',
    )

    print("Abnormal returns statistics:")
    print(abnormal_returns.describe())

In [None]:
# Two stacked panels sharing the time axis: compounded raw sector returns on
# top, cumulative abnormal returns underneath, with the speech start marked.
if 'sector_returns' in dir() and not sector_returns.empty:
    fig, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
    ax1, ax2 = axes

    # --- Top panel: raw cumulative returns (compounded), in percent ---
    for col in sector_returns.columns:
        cum_ret = (1 + sector_returns[col]).cumprod() - 1
        ax1.plot(sector_returns.index, cum_ret * 100, label=col, linewidth=1.5)

    # Speech start: 11:00 on the budget date, localized with the project's IST object
    from src.utils.time_utils import IST
    speech_start = IST.localize(
        datetime.combine(budget_date, datetime.strptime('11:00', '%H:%M').time())
    )
    ax1.axvline(x=speech_start, color='red', linestyle='--', alpha=0.7, label='Speech Start')

    ax1.set(
        ylabel='Cumulative Return (%)',
        title=f'Sector Returns on Budget Day {budget_date}',
    )
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax1.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

    # --- Bottom panel: cumulative abnormal returns (running sum), in percent ---
    for col in abnormal_returns.columns:
        car = abnormal_returns[col].cumsum()
        ax2.plot(abnormal_returns.index, car * 100, label=col, linewidth=1.5)

    ax2.axvline(x=speech_start, color='red', linestyle='--', alpha=0.7)
    ax2.set(
        xlabel='Time (IST)',
        ylabel='Cumulative Abnormal Return (%)',
        title='Cumulative Abnormal Returns',
    )
    ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Event Study Around First Mentions

In [None]:
from src.models import event_study_single
from src.utils.time_utils import IST

# The market data (and therefore `abnormal_returns`) was loaded for the
# 2024-25 budget day, so the mentions must come from the same fiscal year.
MARKET_DATA_FY = '2024-25'
# Speech length assumed when a mention has no timestamp and its time is
# interpolated from its sentence position.
SPEECH_DURATION_MINUTES = 90

# Run event study for each sector mention
if 'abnormal_returns' in dir() and mentions_by_year:
    if MARKET_DATA_FY in mentions_by_year:
        fy = MARKET_DATA_FY
    else:
        # Fall back to the first available year, but say so: its mention
        # timings will be applied to returns from a different budget day.
        fy = next(iter(mentions_by_year))
        print(f"Warning: no mentions for {MARKET_DATA_FY}; using {fy}, which may not match the market data.")
    mentions = mentions_by_year[fy]

    # Speech start (11:00 on the budget date, localized with IST). Computed
    # here so this cell does not depend on the plotting cell having run.
    speech_start = IST.localize(
        datetime.combine(budget_date, datetime.strptime('11:00', '%H:%M').time())
    )

    # Loop-invariant: sentence count used for position-based interpolation
    total_sentences = len(sentences_by_year[fy])

    event_results = []

    for _, mention in mentions.iterrows():
        sector = mention['sector']

        # Use estimated timestamp or position-based
        if 'estimated_timestamp' in mention and pd.notna(mention['estimated_timestamp']):
            event_time = mention['estimated_timestamp']
        else:
            # Estimate based on position: fraction of the speech elapsed,
            # scaled to the assumed speech duration
            position = mention.get('sentence_position', 0)
            progress = position / max(total_sentences, 1)
            event_time = speech_start + timedelta(minutes=progress * SPEECH_DURATION_MINUTES)

        # Only sectors with an abnormal-return series can be studied
        if sector in abnormal_returns.columns:
            result = event_study_single(abnormal_returns, event_time, sector)
            result['fiscal_year'] = fy
            event_results.append(result)

    event_df = pd.DataFrame(event_results)
    print(f"Event study results: {len(event_df)} events")
    if not event_df.empty:
        # Show only the columns that are actually present, mirroring the
        # per-column guard in the significance-test cell.
        wanted_cols = ['sector', 'car_5m', 'car_15m', 'car_30m', 'car_60m']
        shown_cols = [c for c in wanted_cols if c in event_df.columns]
        print(event_df[shown_cols].to_string())

## 6. Statistical Significance Tests

In [None]:
from scipy import stats


def car_t_test(values):
    """Two-sided one-sample t-test of the null hypothesis that the mean is zero.

    Parameters
    ----------
    values : array-like of float
        CAR observations; NaNs are dropped before testing.

    Returns
    -------
    dict
        Keys 'mean', 'std' (sample standard deviation, ddof=1), 'n',
        't_stat' and 'p_value'. When the standard deviation is not positive
        (constant values or fewer than two observations) the statistic is
        undefined and is reported as t_stat=0.0, p_value=1.0.
    """
    sample = pd.Series(values, dtype=float).dropna()
    n = len(sample)
    mean = sample.mean()
    std = sample.std()

    if std > 0:
        t_stat = mean / (std / np.sqrt(n))
        # Survival function instead of 1 - cdf: keeps precision in the far
        # tail, where the cdf rounds to 1 and the difference collapses to 0.
        p_value = 2 * stats.t.sf(abs(t_stat), n - 1)
    else:
        t_stat, p_value = 0.0, 1.0

    return {'mean': mean, 'std': std, 'n': n, 't_stat': t_stat, 'p_value': p_value}


if 'event_df' in dir() and not event_df.empty:
    print("CAR Significance Tests:")
    print("="*60)

    for col in ['car_5m', 'car_15m', 'car_30m', 'car_60m']:
        if col not in event_df.columns:
            continue

        values = event_df[col].dropna()
        if len(values) <= 1:
            # A t-test needs at least two observations
            continue

        test = car_t_test(values)

        print(f"\n{col}:")
        print(f"  Mean CAR: {test['mean']*100:.3f}%")
        print(f"  Std Dev: {test['std']*100:.3f}%")
        print(f"  t-stat: {test['t_stat']:.3f}")
        print(f"  p-value: {test['p_value']:.4f}")
        print(f"  Significant at 5%: {'Yes' if test['p_value'] < 0.05 else 'No'}")

## 7. Summary and Save Results

In [None]:
# Persist the event-study table (when one was produced) under outputs/tables
# and print a closing banner.
output_dir = project_root / 'outputs' / 'tables'
output_dir.mkdir(parents=True, exist_ok=True)

have_results = 'event_df' in dir() and not event_df.empty
if have_results:
    event_df.to_csv(output_dir / 'event_study_results.csv', index=False)
    print(f"Saved results to {output_dir / 'event_study_results.csv'}")

for banner_line in ("\n" + "="*60, "EVENT STUDY ANALYSIS COMPLETE", "="*60):
    print(banner_line)