# Fast Validation Notebook (< 5 min)

Quick smoke test for code changes before full training.
Uses 3 tickers, 1 horizon, 2 models (ElasticNet + LightGBM).

**Sections:**
1. Install deps and clone the repo
2. Load data (3 tickers, 2y history)
3. Feature engineering + sentiment
4. Train ElasticNet + LightGBM
5. Quick IC / hit-ratio / RMSE evaluation
6. Run Top 10 engine
7. Display results

## 1. Setup

In [None]:
!pip install -q yfinance lightgbm scikit-learn scipy pandas numpy

import os
os.chdir('/content')
!rm -rf AI-stock-investment-tool

REPO = 'https://github.com/kevin6598/AI-stock-investment-tool.git'
ret = os.system('git clone %s 2>/dev/null' % REPO)
if ret != 0:
    from getpass import getpass
    token = getpass('GitHub token: ')
    os.system('git clone https://%s@github.com/kevin6598/AI-stock-investment-tool.git' % token)
    del token

os.chdir('/content/AI-stock-investment-tool')
print('Working dir: %s' % os.getcwd())

## 2. Load Data (3 tickers, 2y)

In [None]:
from data.stock_api import get_historical_data, get_stock_info
from training.feature_engineering import (
    build_panel_dataset, cross_sectional_normalize, add_ticker_embedding_column,
)
import numpy as np

TICKERS = ['AAPL', 'MSFT', 'GOOGL']
PERIOD = '2y'
FORWARD_HORIZONS = [21]  # 1M only

# Fetch price history and company info per ticker. A ticker that comes back
# empty is skipped, but reported, so a partial download is visible in the log.
stock_dfs = {}
stock_infos = {}
for t in TICKERS:
    df = get_historical_data(t, period=PERIOD)
    if df.empty:
        print('%s: no data, skipping' % t)
        continue
    stock_dfs[t] = df
    stock_infos[t] = get_stock_info(t) or {}
    print('%s: %d rows' % (t, len(df)))

market_df = get_historical_data('SPY', period=PERIOD)
if market_df.empty:
    # NOTE(review): whether build_panel_dataset tolerates an empty market
    # frame is not visible here, so this only warns instead of raising.
    print('WARNING: no market data for SPY')

valid_tickers = sorted(stock_dfs.keys())
print('Valid tickers: %d' % len(valid_tickers))
if not valid_tickers:
    # Nothing to build a panel from; stop before the downstream cells fail
    # with a less obvious error.
    raise RuntimeError('No price data downloaded for any of: %s' % ', '.join(TICKERS))

# Assemble the multi-ticker panel, normalize it, and attach the ticker-id
# column (ticker_to_id is the mapping returned alongside the panel).
panel = build_panel_dataset(stock_dfs, stock_infos, market_df, FORWARD_HORIZONS)
panel = cross_sectional_normalize(panel)
panel, ticker_to_id = add_ticker_embedding_column(panel, valid_tickers)
print('Panel shape: %s' % str(panel.shape))

## 3. Feature Engineering + Sentiment IC Validation

In [None]:
from training.feature_engineering import validate_sentiment_ic

TARGET_COL = 'fwd_return_21d'

# Columns with one of these name prefixes, plus '_close' and 'ticker_id',
# are kept out of the model inputs; every other panel column is a feature.
_EXCLUDED_PREFIXES = ('fwd_return_', 'residual_return_', 'ranked_target_')
_EXCLUDED_NAMES = ('_close', 'ticker_id')
feature_cols = [
    col for col in panel.columns
    if not (col.startswith(_EXCLUDED_PREFIXES) or col in _EXCLUDED_NAMES)
]
print('Features: %d' % len(feature_cols))

# The IC check below is restricted to the 'nlp_'-prefixed features.
nlp_cols = [name for name in feature_cols if name.startswith('nlp_')]
print('NLP features: %d' % len(nlp_cols))

if nlp_cols:
    # Only rows that have a target value take part in the IC computation.
    sample = panel.dropna(subset=[TARGET_COL])
    _, ic_report = validate_sentiment_ic(
        sample[nlp_cols], sample[TARGET_COL], ic_threshold=0.01,
    )
    # Show the five features with the largest absolute IC.
    by_abs_ic = sorted(ic_report.items(), key=lambda item: abs(item[1]), reverse=True)
    for feature_name, feature_ic in by_abs_ic[:5]:
        print('  %s: IC=%.4f' % (feature_name, feature_ic))

## 4. Train ElasticNet + LightGBM

In [None]:
from training.models import create_model
from training.model_selection import compute_prediction_metrics
import time

# Feature matrix and target vector taken from the panel, as float32.
X = panel[feature_cols].values.astype(np.float32)
y = panel[TARGET_COL].values.astype(np.float32)

# Rows whose target is NaN/inf carry no usable label. Drop them (as the IC
# validation cell does with dropna) instead of zero-filling the target, which
# would train and score the models on fabricated 0.0 returns.
has_label = np.isfinite(y)
X, y = X[has_label], y[has_label]
if len(X) == 0:
    raise RuntimeError('No rows with a finite %s target' % TARGET_COL)

# Missing / infinite feature values are still replaced by 0.0, in place.
np.nan_to_num(X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

# 70% train / 15% validation / 15% test, by row position.
# NOTE(review): this is only a time-ordered hold-out if the panel rows are
# sorted chronologically -- confirm against build_panel_dataset.
split = int(len(X) * 0.7)
val_split = int(len(X) * 0.85)

results = {}
for model_type in ['elastic_net', 'lightgbm']:
    t0 = time.time()
    model = create_model(model_type)
    model.fit(
        X[:split], y[:split],
        X[split:val_split], y[split:val_split],
        feature_names=feature_cols,
    )
    # Score on the final 15% only; NaN predictions are excluded from metrics.
    preds = model.predict(X[val_split:])
    valid = ~np.isnan(preds)
    metrics = compute_prediction_metrics(y[val_split:][valid], preds[valid])
    elapsed = time.time() - t0
    results[model_type] = {'model': model, 'metrics': metrics, 'time': elapsed}
    print('%s: IC=%.4f, Hit=%.4f, Time=%.1fs' % (
        model_type, metrics.ic, metrics.hit_ratio, elapsed))

## 5. Quick Evaluation

In [None]:
# One table row per trained model: IC, hit ratio, RMSE and wall-clock time.
print('\nModel Comparison:')
print('%-15s %8s %8s %8s %8s' % ('Model', 'IC', 'Hit', 'RMSE', 'Time'))
print('-' * 50)
for name, r in results.items():
    m = r['metrics']
    print('%-15s %8.4f %8.4f %8.6f %7.1fs' % (
        name, m.ic, m.hit_ratio, m.rmse, r['time']))

# Pick the model with the highest IC. A NaN IC is ranked as -inf: every
# comparison against NaN is False, so with a plain max() a NaN on the first
# model would win regardless of the other models' scores.
best_name = max(
    results,
    key=lambda k: float('-inf') if np.isnan(results[k]['metrics'].ic)
    else results[k]['metrics'].ic,
)
print('\nBest model: %s (IC=%.4f)' % (best_name, results[best_name]['metrics'].ic))

## 6. Run Top 10 Engine

In [None]:
from engine.top10 import Top10Engine, Top10Result

# Create a simple predict function using our trained model
best_model = results[best_name]['model']


def quick_predict(ticker, horizon='1M'):
    """Predict the 21-day forward return for one ticker with ``best_model``.

    Downloads 2 years of history for ``ticker``, builds its feature matrix,
    and scores the most recent row. Reads the notebook globals
    ``best_model``, ``market_df`` and ``feature_cols``.

    Args:
        ticker: Symbol to fetch and score.
        horizon: Currently unused; the feature matrix is always built for
            the 21-day horizon.

    Returns:
        dict with keys ``ticker``, ``point_estimate``, ``probability_up``,
        ``p_up``, ``confidence``, ``risk_score``, ``direction``,
        ``meta_trade_probability``, ``uncertainty`` and ``quantiles``.
        ``probability_up`` / ``p_up`` are ``0.5 + 5 * point_estimate``
        clamped to [0, 1]; ``confidence``, ``risk_score``,
        ``meta_trade_probability`` and ``uncertainty`` are fixed constants.

    Raises:
        ValueError: If no price data or no features are available for the
            ticker, or if the model returns a non-finite prediction.
    """
    from data.stock_api import get_historical_data, get_stock_info
    from training.feature_engineering import build_feature_matrix
    stock_df = get_historical_data(ticker, period='2y')
    if stock_df.empty:
        raise ValueError('No data for %s' % ticker)
    info = get_stock_info(ticker) or {}
    feat = build_feature_matrix(stock_df, info, market_df, [21], ticker=ticker)
    if feat.empty:
        raise ValueError('No features for %s' % ticker)
    # Feed the model exactly the columns, in the same order, that it was
    # trained on. Columns absent from this frame become NaN and are zeroed
    # below, matching how missing values were handled at training time.
    # NOTE(review): the training panel also went through
    # cross_sectional_normalize, which is not applied here -- confirm the
    # feature scales are comparable.
    X_pred = feat.reindex(columns=feature_cols).values[-1:].astype(np.float32)
    # Zero out NaN and +/-inf, the same treatment the training matrix received.
    np.nan_to_num(X_pred, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    point = float(best_model.predict(X_pred)[0])
    if not np.isfinite(point):
        # A NaN would otherwise clamp to p_up == 1.0 with direction 'DOWN'.
        raise ValueError('Non-finite prediction for %s' % ticker)
    # Heuristic mapping from predicted return to an up-probability in [0, 1].
    p_up = max(0.0, min(1.0, 0.5 + point * 5))
    return {
        'ticker': ticker,
        'point_estimate': point,
        'probability_up': p_up,
        'p_up': p_up,
        'confidence': 0.5,
        'risk_score': 0.4,
        'direction': 'UP' if point > 0 else 'DOWN',
        'meta_trade_probability': 0.5,
        'uncertainty': 0.3,
        'quantiles': {'q10': point - 0.05, 'p10': point - 0.05},
    }

# Wire the quick predictor into the Top 10 engine and run one selection.
engine = Top10Engine(predict_fn=quick_predict, model_version='test_v1')

# NOTE(review): max_stocks=3 presumably caps the number of picks; confirm
# whether it also limits the scanned universe to the 3 test tickers.
print('Running Top 10 on test universe...')
result = engine.select(market='US', horizon='1M', max_stocks=3)

if not result.stocks:
    print('No picks generated (filter too strict for 3 tickers)')
else:
    print('\nTop Picks:')
    for pick in result.stocks:
        # expected_return is scaled by 100 to print it as a percentage.
        row = (pick.rank, pick.ticker, pick.direction, pick.score,
               pick.expected_return * 100)
        print('  #%d %s %s Score=%.3f Return=%+.2f%%' % row)

## 7. Summary

In [None]:
# Final recap of the run, built from values computed in the earlier cells.
for banner_line in ('Fast Validation Complete!', '========================'):
    print(banner_line)

tested = ', '.join(TICKERS)
print('Tickers tested: %s' % tested)

n_features, n_nlp = len(feature_cols), len(nlp_cols)
print('Features: %d (including %d NLP)' % (n_features, n_nlp))

best_ic = results[best_name]['metrics'].ic
print('Best model: %s (IC=%.4f)' % (best_name, best_ic))

# The engine counts as OK when it returned at least one pick.
if result.stocks:
    engine_status = 'OK'
else:
    engine_status = 'No picks'
print('Top 10 engine: %s' % engine_status)

print('\nAll smoke tests passed. Safe to run full training.')