In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Enhanced Earnings IV Analysis - Meeting Requirements\n",
    "\n",
    "This notebook demonstrates the enhanced analysis functions based on the meeting discussion:\n",
    "\n",
    "## Key Features:\n",
    "1. **Realized Volatility Estimators** - Multiple rolling/exponentially weighted estimators\n",
    "2. **Volume Analysis** - Option volume vs stock ADV analysis\n",
    "3. **Kernel Regression** - Enhanced regression from Wolfe paper approach\n",
    "4. **Single-Name Case Studies** - Comprehensive analysis for individual stocks\n",
    "5. **Large Cap Universe** - Building filtered universe of liquid stocks\n",
    "\n",
    "## Meeting Requirements Addressed:\n",
    "- ✅ Kernel regression interface\n",
    "- ✅ Realized volatility computation with multiple estimators\n",
    "- ✅ Volume analysis (option notional vs stock ADV)\n",
    "- ✅ Single-name vs cross-sectional approach\n",
    "- ✅ Robust regression model with proper X and y variables\n",
    "- ✅ Large cap universe with option volume filters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import wrds\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Import the enhanced analysis functions\n",
    "from enhanced_analysis import (\n",
    "    calculate_realized_volatility_estimators,\n",
    "    analyze_option_volume_vs_stock_adv,\n",
    "    enhanced_kernel_regression_analysis,\n",
    "    build_regression_dataset,\n",
    "    run_single_name_case_study,\n",
    "    get_large_cap_universe\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EarningsIVDataPipeline is defined outside this notebook: run the cell that\n",
    "# defines it first, or import it, e.g.\n",
    "#   from Week3_JoyceXu_Completed import EarningsIVDataPipeline\n",
    "# Check before connecting so a missing class does not leave an open WRDS session.\n",
    "if 'EarningsIVDataPipeline' not in globals():\n",
    "    raise NameError(\n",
    "        \"EarningsIVDataPipeline is not defined; run the cell that defines it \"\n",
    "        \"(or import it) before running this notebook.\"\n",
    "    )\n",
    "\n",
    "# Connect to WRDS (replace the placeholder with your own WRDS username)\n",
    "db = wrds.Connection(wrds_username='your_wrds_username')\n",
    "\n",
    "# Initialize pipeline: every analysis cell below reads from `pipeline`,\n",
    "# so it must be assigned here for Restart & Run All to work.\n",
    "pipeline = EarningsIVDataPipeline(db)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Single-Name Case Study (Meeting Requirement)\n",
    "\n",
    "Run comprehensive analysis on one stock to understand the data and construct realized volatility estimators."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single-name deep dive: AAPL from 2023-01-01 through 2024-12-31\n",
    "case_study = run_single_name_case_study(\n",
    "    pipeline=pipeline, ticker='AAPL',\n",
    "    start_date='2023-01-01', end_date='2024-12-31',\n",
    ")\n",
    "\n",
    "# Report headline numbers only when the study returned a truthy result\n",
    "if case_study:\n",
    "    print(f\"\\n📊 Case Study Results for {case_study['ticker']}:\")\n",
    "    print(f\"Regression data points: {len(case_study['regression_data'])}\")\n",
    "    # (label, result key, format spec) for each reported test metric\n",
    "    for label, metric_key, spec in ((\"Test R²\", 'test_r2', '.3f'), (\"Test RMSE\", 'test_rmse', '.4f')):\n",
    "        print(f\"{label}: {case_study['regression_results'][metric_key]:{spec}}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Realized Volatility Analysis\n",
    "\n",
    "Compare different realized volatility estimators and their properties."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Realized-volatility estimators computed from the pipeline's stock prices\n",
    "if 'stock_prices' in pipeline.data:\n",
    "    stock_prices = pipeline.data['stock_prices']\n",
    "    \n",
    "    # Estimators are requested for 5-, 10-, 21- and 30-day windows\n",
    "    realized_vol = calculate_realized_volatility_estimators(\n",
    "        stock_prices,\n",
    "        windows=[5, 10, 21, 30]\n",
    "    )\n",
    "    \n",
    "    # 2x2 grid of diagnostic plots, drawn only when estimators came back\n",
    "    if realized_vol is not None:\n",
    "        fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "        (ax_rolling, ax_ewm), (ax_compare, ax_vol_of_vol) = axes\n",
    "        \n",
    "        # Top-left: standard rolling volatility\n",
    "        ax_rolling.plot(realized_vol['date'], realized_vol['realized_vol_21d'], 'b-', alpha=0.7)\n",
    "        ax_rolling.set_title('Standard Rolling Volatility (21d)')\n",
    "        ax_rolling.set_ylabel('Annualized Volatility')\n",
    "        \n",
    "        # Top-right: exponentially weighted volatility\n",
    "        ax_ewm.plot(realized_vol['date'], realized_vol['ewm_vol_21d'], 'g-', alpha=0.7)\n",
    "        ax_ewm.set_title('Exponentially Weighted Volatility (21d)')\n",
    "        ax_ewm.set_ylabel('Annualized Volatility')\n",
    "        \n",
    "        # Bottom-left: the rolling estimator at three window lengths\n",
    "        for window, line_style in ((5, 'r-'), (21, 'b-'), (30, 'g-')):\n",
    "            ax_compare.plot(\n",
    "                realized_vol['date'],\n",
    "                realized_vol[f'realized_vol_{window}d'],\n",
    "                line_style,\n",
    "                alpha=0.7,\n",
    "                label=f'{window}d',\n",
    "            )\n",
    "        ax_compare.set_title('Rolling Volatility Comparison')\n",
    "        ax_compare.set_ylabel('Annualized Volatility')\n",
    "        ax_compare.legend()\n",
    "        \n",
    "        # Bottom-right: 21-row rolling std of the 21d estimator (vol of vol)\n",
    "        vol_of_vol = realized_vol['realized_vol_21d'].rolling(21).std()\n",
    "        ax_vol_of_vol.plot(realized_vol['date'], vol_of_vol, 'purple', alpha=0.7)\n",
    "        ax_vol_of_vol.set_title('Volatility of Volatility (21d)')\n",
    "        ax_vol_of_vol.set_ylabel('Volatility')\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()\n",
    "        \n",
    "        # Mean level of each plotted rolling estimator\n",
    "        print(\"\\n📈 Realized Volatility Summary:\")\n",
    "        for window in (5, 21, 30):\n",
    "            column = f'realized_vol_{window}d'\n",
    "            print(f\"{window}-day RV mean: {realized_vol[column].mean():.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Volume Analysis (Meeting Requirement)\n",
    "\n",
    "Analyze option volume relative to stock average daily volume and option notional vs stock notional."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Option volume vs stock ADV: needs both the filtered options and the stock prices\n",
    "if all(key in pipeline.data for key in ('options_filtered', 'stock_prices')):\n",
    "    volume_analysis = analyze_option_volume_vs_stock_adv(\n",
    "        options_df=pipeline.data['options_filtered'],\n",
    "        stock_df=pipeline.data['stock_prices'],\n",
    "        ticker='AAPL',\n",
    "    )\n",
    "    \n",
    "    # Announce completion only when a result came back\n",
    "    if volume_analysis is not None:\n",
    "        print(\"\\n📊 Volume Analysis Complete!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Regression Analysis (Meeting Requirement)\n",
    "\n",
    "Build regression dataset and run kernel regression to predict post-earnings realized volatility."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the regression dataset; both inputs must be present in the pipeline\n",
    "if all(key in pipeline.data for key in ('earnings_options', 'realized_volatility')):\n",
    "    regression_data = build_regression_dataset(\n",
    "        earnings_options_df=pipeline.data['earnings_options'],\n",
    "        realized_vol_df=pipeline.data['realized_volatility'],\n",
    "        target_window=21,  # 21-day realized volatility target\n",
    "        feature_window=10  # 10-day feature window\n",
    "    )\n",
    "    \n",
    "    # Fit only when a dataset exists and has more than 10 rows\n",
    "    if regression_data is not None and len(regression_data) > 10:\n",
    "        # Design matrix columns and the realized-volatility target\n",
    "        feature_cols = [\n",
    "            'avg_iv', 'iv_std', 'avg_volume',\n",
    "            'avg_spread', 'avg_tte', 'avg_moneyness',\n",
    "        ]\n",
    "        X = regression_data[feature_cols]\n",
    "        y = regression_data['target_rv']\n",
    "        \n",
    "        print(\"\\n🔬 Regression Dataset Summary:\")\n",
    "        print(f\"Features: {feature_cols}\")\n",
    "        print(\"Target: 21-day realized volatility\")\n",
    "        print(f\"Data points: {len(regression_data)}\")\n",
    "        \n",
    "        # Kernel regression of the target on the features\n",
    "        regression_results = enhanced_kernel_regression_analysis(X, y)\n",
    "        \n",
    "        print(\"\\n✅ Regression Analysis Complete!\")\n",
    "        print(f\"Model performance: R² = {regression_results['test_r2']:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Large Cap Universe (Meeting Requirement)\n",
    "\n",
    "Build universe of large cap stocks with sufficient option volume for cross-sectional analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the universe: $1B+ market cap and 1000+ average daily option volume\n",
    "universe = get_large_cap_universe(\n",
    "    pipeline=pipeline,\n",
    "    min_market_cap=1e9,\n",
    "    min_option_volume=1000,\n",
    ")\n",
    "\n",
    "if universe is not None:\n",
    "    # Aggregate statistics over the whole universe\n",
    "    print(\"\\n🏢 Large Cap Universe Summary:\")\n",
    "    print(f\"Total stocks: {len(universe)}\")\n",
    "    print(f\"Average market cap: ${universe['avg_market_cap'].mean()/1e9:.1f}B\")\n",
    "    print(f\"Average option volume: {universe['avg_option_volume'].mean():.0f} contracts/day\")\n",
    "    \n",
    "    # Ten largest names, one line each\n",
    "    print(\"\\n📈 Top 10 Stocks by Market Cap:\")\n",
    "    top_stocks = universe.nlargest(10, 'avg_market_cap')\n",
    "    for symbol, market_cap, option_volume in zip(\n",
    "        top_stocks['ticker'], top_stocks['avg_market_cap'], top_stocks['avg_option_volume']\n",
    "    ):\n",
    "        print(f\"{symbol}: ${market_cap/1e9:.1f}B, {option_volume:.0f} opt/day\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Cross-Sectional Analysis (Meeting Requirement)\n",
    "\n",
    "Run analysis across multiple stocks to get \"things right on average, over many samples.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run analysis on multiple stocks (subset for demonstration)\n",
    "if universe is not None:\n",
    "    # Pick the 5 largest names explicitly, matching the \"top by market cap\"\n",
    "    # definition used in the previous section; head(5) would only return the\n",
    "    # first five rows in whatever order the universe table happens to be in.\n",
    "    demo_tickers = universe.nlargest(5, 'avg_market_cap')['ticker'].tolist()\n",
    "    \n",
    "    print(f\"\\n🔬 Running Cross-Sectional Analysis on: {demo_tickers}\")\n",
    "    \n",
    "    cross_sectional_results = []\n",
    "    skipped_tickers = []  # tickers that errored or returned no case study\n",
    "    \n",
    "    for ticker in demo_tickers:\n",
    "        print(f\"\\n--- Analyzing {ticker} ---\")\n",
    "        try:\n",
    "            # Distinct name so the AAPL `case_study` from section 1 is not overwritten\n",
    "            ticker_study = run_single_name_case_study(\n",
    "                pipeline=pipeline,\n",
    "                ticker=ticker,\n",
    "                start_date='2023-01-01',\n",
    "                end_date='2024-12-31'\n",
    "            )\n",
    "            \n",
    "            if ticker_study:\n",
    "                cross_sectional_results.append({\n",
    "                    'ticker': ticker,\n",
    "                    'test_r2': ticker_study['regression_results']['test_r2'],\n",
    "                    'test_rmse': ticker_study['regression_results']['test_rmse'],\n",
    "                    'data_points': len(ticker_study['regression_data'])\n",
    "                })\n",
    "            else:\n",
    "                skipped_tickers.append(ticker)\n",
    "        except Exception as e:\n",
    "            # Best effort: one failing ticker must not abort the whole batch,\n",
    "            # but it is recorded so the summary below is not silently biased.\n",
    "            print(f\"Error analyzing {ticker}: {e}\")\n",
    "            skipped_tickers.append(ticker)\n",
    "    \n",
    "    # Make it visible which names are missing from the averages below\n",
    "    if skipped_tickers:\n",
    "        print(f\"\\n⚠️ No results for: {skipped_tickers}\")\n",
    "    \n",
    "    # Summary of cross-sectional results\n",
    "    if cross_sectional_results:\n",
    "        results_df = pd.DataFrame(cross_sectional_results)\n",
    "        \n",
    "        print(f\"\\n📊 Cross-Sectional Analysis Summary:\")\n",
    "        print(f\"Stocks analyzed: {len(results_df)} of {len(demo_tickers)}\")\n",
    "        print(f\"Average Test R²: {results_df['test_r2'].mean():.3f}\")\n",
    "        print(f\"Average Test RMSE: {results_df['test_rmse'].mean():.4f}\")\n",
    "        print(f\"Total data points: {results_df['data_points'].sum():,}\")\n",
    "        \n",
    "        # Plot results: one bar per successfully analyzed ticker\n",
    "        fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "        \n",
    "        axes[0].bar(results_df['ticker'], results_df['test_r2'])\n",
    "        axes[0].set_title('Test R² by Stock')\n",
    "        axes[0].set_ylabel('R²')\n",
    "        \n",
    "        axes[1].bar(results_df['ticker'], results_df['test_rmse'])\n",
    "        axes[1].set_title('Test RMSE by Stock')\n",
    "        axes[1].set_ylabel('RMSE')\n",
    "        \n",
    "        plt.tight_layout()\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release the WRDS connection now that all queries are done\n",
    "db.close()\n",
    "\n",
    "print(\"\\n🎉 Enhanced Analysis Complete!\")\n",
    "print(\"\\nNext Steps (from meeting):\")\n",
    "# Follow-up items, printed one per line\n",
    "for next_step in (\n",
    "    \"1. Focus on specific maturity points and ATM moneyness\",\n",
    "    \"2. Adjust realized volatility parameters based on 'eye-balling' the graphs\",\n",
    "    \"3. Build robust but parsimonious initial regression model\",\n",
    "    \"4. Decide between single-name vs cross-sectional approach\",\n",
    "    \"5. Settle on universe of large cap stocks with option volume filters\",\n",
    "):\n",
    "    print(next_step)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}