# 01 - Data Audit Notebook

This notebook performs initial data exploration and quality checks for the Budget Speech Impact Analysis project.

## Contents
1. Market Data Overview (including data quality checks)
2. Budget Speech PDF Overview
3. Sector Configuration Overview
4. Market Data Coverage for Budget Days
5. Sample Intraday Data Visualization
6. Summary Statistics

In [None]:
# Setup: make the project importable and configure plotting/display defaults
import sys
from pathlib import Path

# Put the project root (the parent of the current working directory) at the
# front of sys.path so that later cells can import the `src` package.
# NOTE(review): this assumes the notebook is run from a direct subdirectory of
# the project root - confirm if the notebook is ever moved.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style and a wider column limit for DataFrame display
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', 50)

# Echo the resolved root so a wrong working directory is easy to spot
print(f"Project root: {project_root}")

## 1. Market Data Overview

In [None]:
from src.ingestion import (
    get_available_stocks,
    load_single_stock,
    validate_data_quality,
)

# Fetch the symbols exposed by the ingestion layer and preview both ends
stocks = get_available_stocks()
first_stocks = stocks[:20]
last_stocks = stocks[-20:]

print(f"Total stocks available: {len(stocks)}")
print(f"\nFirst 20 stocks: {first_stocks}")
print(f"\nLast 20 stocks: {last_stocks}")

In [None]:
# Pull one stock (RELIANCE) as a representative sample and describe it
sample_stock = 'RELIANCE'
df = load_single_stock(sample_stock)

# Summarise the index span and the available columns before printing
first_timestamp = df.index.min()
last_timestamp = df.index.max()
column_names = list(df.columns)

print(f"\n{sample_stock} Data Shape: {df.shape}")
print(f"\nDate Range: {first_timestamp} to {last_timestamp}")
print(f"\nColumns: {column_names}")
print("\nSample Data:")
# Keep this as the final expression so the notebook renders the preview
df.head(10)

In [None]:
# Run the quality validator on the sample stock and list every reported metric
quality = validate_data_quality(df, sample_stock)

print("Data Quality Report:")
for metric in quality:
    print(f"  {metric}: {quality[metric]}")

In [None]:
# Build one quality-report row per sample stock and show them as a table
sample_stocks = ['RELIANCE', 'TCS', 'HDFCBANK', 'INFY', 'SBIN', 'TATAMOTORS']

quality_reports = []
for symbol in sample_stocks:
    stock_df = load_single_stock(symbol)
    if stock_df.empty:
        # Nothing loaded for this symbol, so it gets no row
        continue

    q = validate_data_quality(stock_df, symbol)
    # Replace the 'date_range' pair with two flat columns; falsy bounds become None
    date_range = q.pop('date_range')
    q['date_start'] = date_range[0] or None
    q['date_end'] = date_range[1] or None
    quality_reports.append(q)

quality_df = pd.DataFrame(quality_reports)
quality_df

## 2. Budget Speech PDF Overview

In [None]:
from src.ingestion import (
    get_available_speeches,
    load_event_dates,
    get_budget_dates,
)

# List every budget speech PDF the ingestion layer reports
speeches = get_available_speeches()
speech_count = len(speeches)

print(f"Available Budget Speech PDFs: {speech_count}")
for speech in speeches:
    print(f"  - {speech}")

In [None]:
# Print the configured budget event schedule, one entry per key of event_dates
event_dates = load_event_dates()

print("\nBudget Event Schedule:")
print("=" * 60)

for fy, info in event_dates.items():
    # Missing fields print as None because .get() is used without a default
    event_day = info.get('date')
    speech_start = info.get('speech_start')
    speech_end = info.get('speech_end')
    minister = info.get('finance_minister')

    print(f"\n{fy}:")
    print(f"  Date: {event_day}")
    print(f"  Speech: {speech_start} - {speech_end}")
    print(f"  Finance Minister: {minister}")

In [None]:
# Show each budget presentation date together with its weekday name
budget_dates = get_budget_dates()

print("\nBudget Presentation Dates:")
for budget_day in budget_dates:
    formatted_day = budget_day.strftime('%Y-%m-%d (%A)')
    print(f"  {formatted_day}")

## 3. Sector Configuration Overview

In [None]:
from src.ingestion import load_config, get_all_sectors, load_sector_stocks

# Load sector configuration (sector entries are read from config['sectors']['sectors'])
config = load_config()
sectors_config = config.get('sectors', {}).get('sectors', {})

print("Sector Overview:")
print("="*60)

# One summary row per configured sector.
# The per-sector lists use their own names (`sector_stocks`, `sector_keywords`)
# so the loop does not rebind the notebook-global `stocks`, which other cells
# use as the list of all available symbols.
sector_summary = []
for sector_key, sector_info in sectors_config.items():
    # `or []` also covers a key that is present but set to null/empty
    sector_stocks = sector_info.get('stocks') or []
    sector_keywords = sector_info.get('keywords') or []
    name = sector_info.get('name', sector_key)

    sector_summary.append({
        'sector_key': sector_key,
        'sector_name': name,
        'n_stocks': len(sector_stocks),
        'n_keywords': len(sector_keywords),
        'sample_stocks': sector_stocks[:5],
        'sample_keywords': sector_keywords[:5],
    })

sector_df = pd.DataFrame(sector_summary)
sector_df[['sector_name', 'n_stocks', 'n_keywords']]

In [None]:
# Horizontal bar chart of stocks per sector, with the mean marked as a dashed line
stock_counts = sector_df['n_stocks']
mean_stock_count = stock_counts.mean()
sector_df_sorted = sector_df.sort_values('n_stocks', ascending=True)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(sector_df_sorted['sector_name'], sector_df_sorted['n_stocks'], color='steelblue')
ax.set_xlabel('Number of Stocks')
ax.set_title('Stocks per Sector')
ax.axvline(
    x=mean_stock_count,
    color='red',
    linestyle='--',
    label=f'Mean: {mean_stock_count:.1f}',
)
ax.legend()
plt.tight_layout()
plt.show()

# Repeat the headline numbers as text below the chart
print(f"\nTotal stocks across all sectors: {stock_counts.sum()}")
print(f"Average stocks per sector: {mean_stock_count:.1f}")

## 4. Market Data Coverage for Budget Days

In [None]:
from src.ingestion import get_available_stocks, get_budget_day_data

# Check data availability on budget days
budget_dates = get_budget_dates()

# Fetch the symbol list here instead of relying on the notebook-global `stocks`,
# which other cells may rebind. Only the first 50 symbols are checked.
MAX_SYMBOLS_CHECKED = 50
checked_symbols = list(get_available_stocks()[:MAX_SYMBOLS_CHECKED])
n_checked = len(checked_symbols)

coverage_by_date = []
for budget_date in budget_dates:
    date_obj = budget_date.date()

    available_count = 0  # symbols with a non-empty frame on this date
    sample_bars = 0      # largest row count seen for any symbol on this date

    for symbol in checked_symbols:
        day_data = get_budget_day_data(symbol, date_obj)
        if not day_data.empty:
            available_count += 1
            sample_bars = max(sample_bars, len(day_data))

    coverage_by_date.append({
        'budget_date': date_obj,
        'stocks_available': available_count,
        'max_bars': sample_bars,
        # Divide by the number of symbols actually checked (may be < 50);
        # report 0.0 when there was nothing to check.
        'coverage_pct': available_count / n_checked * 100 if n_checked else 0.0,
    })

coverage_df = pd.DataFrame(coverage_by_date)
coverage_df

## 5. Sample Intraday Data Visualization

In [None]:
from src.market import clean_stock_data

# Pick the last entry of budget_dates as the day to visualize
# (presumably the most recent budget - assumes the list is in ascending order)
sample_budget_date = budget_dates[-1].date()
print(f"Visualizing data for budget day: {sample_budget_date}")

# Collect cleaned data per symbol, skipping symbols with no rows on that day
sample_stocks = ['HDFCBANK', 'TCS', 'RELIANCE']
budget_day_data = {}

for symbol in sample_stocks:
    df = get_budget_day_data(symbol, sample_budget_date)
    if df.empty:
        continue
    df = clean_stock_data(df)
    budget_day_data[symbol] = df
    print(f"  {symbol}: {len(df)} bars")

In [None]:
# One stacked panel per symbol showing cumulative intraday return in percent
if not budget_day_data:
    print("No data available for visualization")
else:
    n_panels = len(budget_day_data)
    # squeeze=False always returns a 2-D array of axes, so a single panel
    # needs no special case; take the only column as a 1-D sequence.
    fig, axes = plt.subplots(
        n_panels, 1, figsize=(14, 4 * n_panels), sharex=True, squeeze=False
    )
    axes = axes[:, 0]

    for ax, (symbol, df) in zip(axes, budget_day_data.items()):
        if 'return' not in df.columns:
            # Leave the panel empty when there is no 'return' column
            continue
        # Compound the per-row returns into a running total
        cumulative_return = (1 + df['return']).cumprod() - 1
        ax.plot(df.index, cumulative_return * 100, label=symbol, linewidth=1.5)
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
        ax.set_ylabel('Cumulative Return (%)')
        ax.set_title(f'{symbol} - Intraday Returns on {sample_budget_date}')
        ax.legend()

    plt.xlabel('Time (IST)')
    plt.tight_layout()
    plt.show()

## 6. Summary Statistics

In [None]:
# Text recap of the figures gathered in the cells above
rule = "=" * 60
print(rule)
print("DATA AUDIT SUMMARY")
print(rule)

print("\nMarket Data:")
print(f"  - Total stocks: {len(stocks)}")
print("  - Data frequency: 5-minute bars")
print("  - Approximate date range: Feb 2015 - Jan 2026")

print("\nBudget Speeches:")
print(f"  - Total PDFs: {len(speeches)}")
print(f"  - Years covered: {len(event_dates)}")

print("\nSector Configuration:")
print(f"  - Number of sectors: {len(sectors_config)}")
total_mapped_stocks = sector_df['n_stocks'].sum()
print(f"  - Total stocks mapped: {total_mapped_stocks}")
total_keywords = sector_df['n_keywords'].sum()
print(f"  - Total keywords: {total_keywords}")

print("\nNext Steps:")
next_steps = (
    "  1. Run NLP pipeline on budget speech PDFs",
    "  2. Build sector portfolios",
    "  3. Align speech events with price data",
    "  4. Run event study analysis",
)
for step in next_steps:
    print(step)