# Signal Analysis Notebook

This notebook provides tools for exploring and analyzing signal generation in the Kalshi Sports Alpha system.

## Contents
1. Setup and Imports
2. Create Sample Market Snapshots
3. Feature Computation
4. Signal Generation
5. Signal Distribution Visualization
6. Summary Statistics


In [None]:
# Setup and Imports
import sys
# Put the parent directory first on the module search path so the project
# packages imported below can be resolved. '..' is interpreted relative to
# the kernel's working directory — presumably the notebook sits one level
# below the project root; confirm before launching Jupyter from elsewhere.
sys.path.insert(0, '..')

from datetime import datetime, timezone
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Import project modules
from kalshi.models import MarketSnapshot
from features.registry import get_registry
from signals import (
    TailInformedFlowSignal,
    FadeOverreactionSignal,
    LateKickoffVolSignal,
    FragileMarketSignal,
)
# NOTE(review): SignalDirection is not referenced by any cell visible in
# this notebook; kept in case a later cell relies on it.
from signals.signal_base import SignalDirection

# Configure plotting
# NOTE(review): the 'seaborn-v0_8-*' style names appear to be specific to
# newer matplotlib releases — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


## Create Sample Market Snapshots

For demonstration, we create synthetic market snapshots. In production, these would come from the poller or backfill.


In [None]:
def create_sample_snapshot(
    market_id: str,
    league: str = "NFL",
    best_bid: float = 0.45,
    best_ask: float = 0.48,
    volume_1m: int = 50,
    volume_5m: int = 200,
    volume_1h: int = 1000,
    total_bid_depth: int = 500,
    total_ask_depth: int = 400,
    time_to_kickoff: int = 3600,
) -> MarketSnapshot:
    """Build a synthetic ``MarketSnapshot`` for exploratory analysis.

    Only the quote, volume, depth and timing inputs are configurable; the
    remaining fields are filled with fixed or derived placeholder values:

    * ``event_id`` is ``"EVT_"`` followed by ``market_id``.
    * ``snapshot_time`` is the current UTC time at the moment of the call.
    * teams are always "Team A" vs "Team B" on a "moneyline" market.
    * ``mid_price`` is the average of ``best_bid`` and ``best_ask``.
    * ``last_trade_price`` is ``best_bid + 0.01`` with a size of 25.
    * ``volume_total`` is five times ``volume_1h``.
    * ``time_to_resolution_seconds`` is ``time_to_kickoff`` plus 10800.

    Args:
        market_id: Identifier assigned to the snapshot's market.
        league: League label stored on the snapshot.
        best_bid: Best bid price.
        best_ask: Best ask price.
        volume_1m: Volume over the last minute.
        volume_5m: Volume over the last five minutes.
        volume_1h: Volume over the last hour.
        total_bid_depth: Total depth on the bid side.
        total_ask_depth: Total depth on the ask side.
        time_to_kickoff: Value stored as ``time_to_kickoff_seconds``.

    Returns:
        A ``MarketSnapshot`` populated with the values described above.
    """
    # Keyword arguments are assembled first so the derived values sit next
    # to the inputs they are computed from.
    snapshot_fields = {
        # Identity and event metadata
        "market_id": market_id,
        "event_id": f"EVT_{market_id}",
        "snapshot_time": datetime.now(timezone.utc),
        "league": league,
        "team_home": "Team A",
        "team_away": "Team B",
        "market_type": "moneyline",
        # Quotes and last trade
        "best_bid": best_bid,
        "best_ask": best_ask,
        "mid_price": (best_bid + best_ask) / 2,
        "last_trade_price": best_bid + 0.01,
        "last_trade_size": 25,
        # Traded volume over the rolling windows
        "volume_1m": volume_1m,
        "volume_5m": volume_5m,
        "volume_1h": volume_1h,
        "volume_total": volume_1h * 5,
        # Order book depth
        "total_bid_depth": total_bid_depth,
        "total_ask_depth": total_ask_depth,
        # Timing
        "time_to_kickoff_seconds": time_to_kickoff,
        "time_to_resolution_seconds": time_to_kickoff + 10800,
    }
    return MarketSnapshot(**snapshot_fields)

# Build the demo snapshots. Each entry pairs a market id with the keyword
# overrides that set it apart from the factory defaults.
sample_snapshots = [
    create_sample_snapshot(market_id, **overrides)
    for market_id, overrides in (
        ("MKT001", dict(time_to_kickoff=7200, volume_1m=100, volume_5m=150)),
        ("MKT002", dict(time_to_kickoff=600, total_bid_depth=100, total_ask_depth=80)),
        ("MKT003", dict(best_bid=0.30, best_ask=0.38, volume_1m=200)),
        ("MKT004", dict(league="NBA", time_to_kickoff=300)),
        ("MKT005", dict(total_bid_depth=50, total_ask_depth=30)),
    )
]

print(f"Created {len(sample_snapshots)} sample snapshots")


## Feature Computation

Compute all registered features for each snapshot.


In [None]:
# Fetch the feature registry and the names of everything registered in it.
registry = get_registry()
all_features = registry.list_features()

# Print one line per feature, in sorted name order, showing the category and
# description taken from the feature's registry definition.
print(f"Registered features ({len(all_features)}):")
for feat in sorted(all_features):
    definition = registry.get(feat)
    category, description = definition.category, definition.description
    print(f"  - {feat} [{category}]: {description}")


In [None]:
# Compute features for all snapshots.
# Each record is a *new* dict: the registry's computed features followed by a
# few identifying/derived fields read off the snapshot. Building a fresh dict
# (instead of assigning extra keys into the object compute_all() returned)
# leaves the registry's result untouched, so a cached or shared mapping cannot
# be polluted with notebook-only keys.
# Assumes compute_all() returns a dict-like mapping — TODO confirm.
feature_data = [
    {
        **registry.compute_all(snapshot),
        'market_id': snapshot.market_id,
        'league': snapshot.league,
        'spread': snapshot.spread,
        'kickoff_window': snapshot.kickoff_window,
    }
    for snapshot in sample_snapshots
]

# Create DataFrame for analysis, one row per market, indexed by market id.
# set_index() is chained rather than applied in place so the cell produces
# the final frame in a single expression and stays idempotent on re-run.
features_df = pd.DataFrame(feature_data).set_index('market_id')
features_df


## Signal Generation

Initialize all signal generators and run them on our sample data.


In [None]:
# Instantiate one generator per signal class, in a fixed order, using each
# class's default constructor arguments.
signal_generators = [
    signal_cls()
    for signal_cls in (
        TailInformedFlowSignal,
        FadeOverreactionSignal,
        LateKickoffVolSignal,
        FragileMarketSignal,
    )
]

# Show the name and description each generator reports.
print("Signal Generators:")
for gen in signal_generators:
    gen_name, gen_description = gen.name, gen.description
    print(f"  - {gen_name}: {gen_description}")


In [None]:
# Generate signals for each snapshot.
# Column schema of the signal table. It is passed explicitly to the DataFrame
# constructor so that an empty result (no generator fired) still yields a
# frame with these columns instead of a column-less frame on which any
# column access would raise KeyError.
signal_columns = [
    'market_id',
    'signal_name',
    'direction',
    'strength',
    'confidence',
    'composite_score',
    'rationale',
]

all_signals = []

for snapshot in sample_snapshots:
    # Features are computed once per snapshot and shared by every generator.
    features = registry.compute_all(snapshot)

    for generator in signal_generators:
        signal = generator.generate(snapshot, features)
        # A falsy result is treated as "no signal for this snapshot".
        # NOTE(review): presumably generate() returns None in that case —
        # confirm against the signal base class.
        if signal:
            all_signals.append({
                'market_id': snapshot.market_id,
                'signal_name': signal.name,
                'direction': signal.direction.value,
                'strength': signal.strength,
                'confidence': signal.confidence,
                'composite_score': signal.composite_score,
                'rationale': signal.rationale,
            })

signals_df = pd.DataFrame(all_signals, columns=signal_columns)
print(f"Generated {len(all_signals)} signals")
signals_df


## Signal Distribution Visualization


In [None]:
# Draw a 2x2 overview of the generated signals; if there are none, print a
# hint instead of producing empty plots.
if signals_df.empty:
    print("No signals generated - adjust sample data or signal thresholds")
else:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    (ax1, ax2), (ax3, ax4) = axes

    # Top-left: number of signals emitted per signal type.
    signal_counts = signals_df['signal_name'].value_counts()
    signal_counts.plot(kind='bar', ax=ax1, color='steelblue')
    ax1.set(title='Signal Count by Type', xlabel='Signal Type', ylabel='Count')
    ax1.tick_params(axis='x', rotation=45)

    # Top-right and bottom-left: histograms of strength and confidence.
    histogram_panels = (
        (ax2, 'strength', 'coral', 'Signal Strength Distribution', 'Strength'),
        (ax3, 'confidence', 'seagreen', 'Signal Confidence Distribution', 'Confidence'),
    )
    for panel, column, fill, title, xlabel in histogram_panels:
        signals_df[column].hist(bins=20, ax=panel, color=fill, edgecolor='black')
        panel.set(title=title, xlabel=xlabel, ylabel='Frequency')

    # Bottom-right: strength against confidence, one series per signal type.
    for name, group in signals_df.groupby('signal_name'):
        ax4.scatter(
            x=group['strength'],
            y=group['confidence'],
            label=name,
            alpha=0.7,
            s=100,
        )
    ax4.set(title='Strength vs Confidence', xlabel='Strength', ylabel='Confidence')
    ax4.legend()
    # Both axes are pinned to the [0, 1] range.
    ax4.set_xlim(0, 1)
    ax4.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()


## Summary Statistics


In [None]:
# Print a heading with a 50-character rule, then show describe() statistics
# for the numeric feature columns, rounded to three decimals.
print("Feature Summary Statistics:", "=" * 50, sep="\n")
numeric_cols = features_df.select_dtypes(include=[np.number]).columns
features_df.loc[:, numeric_cols].describe().round(3)
