# AI Stock Investment Tool - Colab Training

Train multiple model configurations with walk-forward validation on GPU.

**Steps:**
1. Install dependencies & clone repo
2. Mount Google Drive & load data
3. Build features
4. Configure models
5. Train with walk-forward validation
6. Compare results
7. Save best models to Drive
8. (Optional) Hyperparameter search with Optuna

## 1. Setup

In [None]:
# Install the third-party packages this notebook needs (presumably these are
# also the cloned project's dependencies — confirm against its requirements).
# -q keeps pip's output quiet.
!pip install -q yfinance lightgbm torch optuna pyarrow scikit-learn scipy pandas numpy

In [None]:
!git clone https://github.com/kevin6598/AI-stock-investment-tool.git
%cd AI-stock-investment-tool

In [None]:
import torch

# Report the runtime up front so a CPU-only session is noticed before training.
print(f"PyTorch: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    # Device 0 is the only GPU queried here.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Mount Google Drive & Load Data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the DRIVE_DIR path configured in the
# following cell lives under this mount point.
drive.mount('/content/drive')

In [None]:
import os

# Persistent locations on the mounted Drive: one base folder holding the
# dataset file and the model output folder.
DRIVE_DIR = "/content/drive/MyDrive/ai_stock_tool"
DATA_PATH = os.path.join(DRIVE_DIR, "dataset.parquet")
OUTPUT_DIR = os.path.join(DRIVE_DIR, "models_registry")

# Create the base folder first, then the output folder inside it.
for _folder in (DRIVE_DIR, OUTPUT_DIR):
    os.makedirs(_folder, exist_ok=True)

print(f"Drive dir: {DRIVE_DIR}")
print(f"Output dir: {OUTPUT_DIR}")

## 3. Fetch Data & Build Features

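If you already exported `dataset.parquet` locally and uploaded it to Drive, skip the **Option A** cell below and run the **Option B: Load existing dataset** cell instead. If you build the dataset with Option A, the panel is already in memory and Option B can be skipped.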

In [None]:
# === Option A: Build dataset from scratch ===
#
# Downloads price history for every ticker plus SPY as the market series,
# builds the feature panel with forward-return targets, normalizes it and
# stores it at DATA_PATH.

from data.stock_api import get_historical_data, get_stock_info
from training.feature_engineering import build_panel_dataset, cross_sectional_normalize

TICKERS = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA", "JPM", "V", "JNJ"]
PERIOD = "5y"
FORWARD_HORIZONS = [21, 63, 126]  # 1M, 3M, 6M

print("Fetching stock data...")
stock_dfs = {}
stock_infos = {}
skipped_tickers = []
for ticker in TICKERS:
    df = get_historical_data(ticker, period=PERIOD)
    if df.empty:
        # Remember the failure instead of dropping the ticker silently.
        skipped_tickers.append(ticker)
        continue
    stock_dfs[ticker] = df
    # Fall back to an empty dict when no company info is returned.
    stock_infos[ticker] = get_stock_info(ticker) or {}
    print(f"  {ticker}: {len(df)} rows")

if skipped_tickers:
    print(f"  WARNING: no data returned for {skipped_tickers}; they are excluded from the panel")
if not stock_dfs:
    # Fail here with a clear message rather than somewhere inside feature building.
    raise RuntimeError("No price history could be downloaded for any ticker; cannot build the dataset.")

market_df = get_historical_data("SPY", period=PERIOD)
print(f"  SPY (market): {len(market_df)} rows")
if market_df.empty:
    print("  WARNING: no data returned for SPY; check the download before relying on the panel")

print("\nBuilding features...")
panel = build_panel_dataset(stock_dfs, stock_infos, market_df, FORWARD_HORIZONS)
panel = cross_sectional_normalize(panel)
print(f"Panel shape: {panel.shape}")

# Save to Drive
panel.to_parquet(DATA_PATH)
print(f"\nSaved to {DATA_PATH}")

In [None]:
# === Option B: Load existing dataset from Drive ===

import pandas as pd

panel = pd.read_parquet(DATA_PATH)

# Index level 0 is read as the dates and level 1 as the tickers.
_panel_dates = panel.index.get_level_values(0)
_panel_tickers = panel.index.get_level_values(1)

print(f"Loaded panel: {panel.shape}")
print(f"Tickers: {_panel_tickers.unique().tolist()}")
print(f"Date range: {_panel_dates.min()} to {_panel_dates.max()}")

## 4. Configure Models

In [None]:
from training.model_config import ModelConfig, ConfigGrid

# Option 1: Manual configs
# One linear baseline and two LightGBM variants ...
linear_and_tree_configs = [
    ModelConfig(model_type="elastic_net", learning_rate=0.1, epochs=1),
    ModelConfig(
        model_type="lightgbm",
        learning_rate=0.05,
        epochs=500,
        extra_params={"num_leaves": 31, "max_depth": 6},
    ),
    ModelConfig(
        model_type="lightgbm",
        learning_rate=0.01,
        epochs=500,
        extra_params={"num_leaves": 63, "max_depth": 8},
    ),
]
# ... followed by two sequence models that share dropout and sequence length.
sequence_configs = [
    ModelConfig(
        model_type="lstm_attention",
        learning_rate=1e-3,
        epochs=100,
        dropout=0.2,
        sequence_length=60,
    ),
    ModelConfig(
        model_type="transformer",
        learning_rate=3e-4,
        epochs=100,
        dropout=0.2,
        sequence_length=60,
    ),
]
configs = linear_and_tree_configs + sequence_configs

# Option 2: Grid search (uncomment to use)
# configs = ConfigGrid.from_grid({
#     "model_type": ["lightgbm"],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "extra_params": [{"num_leaves": 31}, {"num_leaves": 63}],
# })

# Option 3: Random search (uncomment to use)
# configs = ConfigGrid.from_random({
#     "model_type": ["lightgbm", "lstm_attention"],
#     "learning_rate": [0.001, 0.005, 0.01, 0.05],
#     "dropout": [0.1, 0.2, 0.3],
# }, n_samples=8)

# Echo the selection so it can be checked before the training cell runs.
print(f"Total configs to train: {len(configs)}")
for idx, cfg in enumerate(configs):
    summary = f"  [{idx}] {cfg.model_type} | lr={cfg.learning_rate} | dropout={cfg.dropout} | epochs={cfg.epochs}"
    print(summary)

## 5. Train with Walk-Forward Validation

In [None]:
from training.model_config import MultiConfigRunner
from training.model_selection import WalkForwardConfig

# Forward-return window in trading days for each horizon label
# (21d=1M, 63d=3M, 126d=6M).
HORIZON_TO_DAYS = {"1M": 21, "3M": 63, "6M": 126}

# Define walk-forward settings
HORIZON = "1M"  # Change to "3M" or "6M" as needed
if HORIZON not in HORIZON_TO_DAYS:
    raise ValueError(f"Unknown HORIZON {HORIZON!r}; expected one of {sorted(HORIZON_TO_DAYS)}")

# Derive the target from the horizon so the two settings cannot drift apart.
TARGET_COL = f"fwd_return_{HORIZON_TO_DAYS[HORIZON]}d"
if TARGET_COL not in panel.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found in panel; was the dataset built with this horizon?")

# NOTE(review): the start/end dates are fixed while the dataset cell downloads
# a "5y" window — confirm they match the panel's actual date range.
wf_config = WalkForwardConfig(
    train_start="2015-01-01",
    test_end="2025-01-01",
    train_min_months=36,
    val_months=6,
    test_months=6,
    step_months=6,
    embargo_days=21,
    expanding=True,
)

# Feature columns (exclude targets and _close)
feature_cols = [
    c for c in panel.columns
    if not c.startswith("fwd_return_") and c != "_close"
]
print(f"Features: {len(feature_cols)}")
print(f"Target: {TARGET_COL}")
print(f"Horizon: {HORIZON}")

In [None]:
# Optional: Prune low-importance features to speed up training
from training.feature_engineering import prune_features

# Feature selection only looks at the first 5000 rows of the panel.
sample = panel.head(5000)
sample_features = sample[feature_cols]
sample_target = sample[TARGET_COL]

# Only the second element of the returned pair (the kept columns) is used.
_, selected_cols = prune_features(sample_features, sample_target, importance_threshold=0.005)

n_before, n_after = len(feature_cols), len(selected_cols)
print(f"Pruned: {n_before} -> {n_after} features")

# Uncomment to use pruned features:
# feature_cols = selected_cols

In [None]:
import time

# Train and evaluate every config with the walk-forward settings defined above.
# NOTE(review): save_to_registry=False presumably skips the project's own model
# registry — confirm against MultiConfigRunner.
runner = MultiConfigRunner(save_to_registry=False)

print("Starting training...")
print("=" * 60)
# perf_counter is monotonic, so the elapsed time cannot be distorted by
# wall-clock adjustments during a long run.
t0 = time.perf_counter()

results = runner.run(
    configs=configs,
    panel=panel,
    target_col=TARGET_COL,
    feature_cols=feature_cols,
    wf_config=wf_config,
    horizon=HORIZON,
)

total_time = time.perf_counter() - t0
print("=" * 60)
print(f"Training complete in {total_time:.1f}s")
print(f"Configs evaluated: {len(results)}")

## 6. Compare Results

In [None]:
import pandas as pd

# Column order of the comparison table. Declaring it up front keeps the
# columns present when `results` is empty, so the sort below cannot raise a
# KeyError on a column-less frame.
RESULT_COLUMNS = [
    "Model", "LR", "Dropout", "IC", "ICIR", "Sharpe",
    "Max DD", "Calmar", "Hit Ratio", "Folds", "Time (s)",
]

# Build results table: one row per trained config, with rounded metrics.
rows = [
    {
        "Model": r.config.model_type,
        "LR": r.config.learning_rate,
        "Dropout": r.config.dropout,
        "IC": round(r.evaluation.mean_ic, 4),
        "ICIR": round(r.evaluation.icir, 2),
        "Sharpe": round(r.evaluation.mean_sharpe, 2),
        "Max DD": round(r.evaluation.mean_mdd, 4),
        "Calmar": round(r.evaluation.mean_calmar, 2),
        "Hit Ratio": round(r.evaluation.mean_hit_ratio, 4),
        "Folds": len(r.evaluation.fold_results),
        "Time (s)": round(r.training_time, 1),
    }
    for r in results
]

df_results = pd.DataFrame(rows, columns=RESULT_COLUMNS)
# Highest mean IC first.
df_results = df_results.sort_values("IC", ascending=False).reset_index(drop=True)
print("\nModel Comparison (sorted by IC):")
print("=" * 80)
display(df_results)

In [None]:
# Statistical comparison
from training.model_comparison import ModelComparisonEngine

# Hand every config's evaluation to the comparison engine in one call.
evaluations = [run.evaluation for run in results]
engine = ModelComparisonEngine()
report = engine.compare(evaluations)

print("Rankings by IC:")
ic_ranking = report.rankings.get("ic", [])
for model_name, ic_value in ic_ranking:
    print(f"  {model_name}: {ic_value:.4f}")

print("\nStability scores:")
for model_name, stability in report.stability_scores.items():
    print(f"  {model_name}: {stability:.4f}")

print(f"\nBest per horizon: {report.best_per_horizon}")

# The significance section is printed only when the report contains tests.
if report.significance_tests:
    print("\nSignificance tests:")
    for pair, outcome in report.significance_tests.items():
        verdict = "YES" if outcome["ttest_significant_5pct"] else "no"
        print(f"  {pair}: p={outcome['ttest_p_value']:.4f} (significant: {verdict})")

In [None]:
# Visualize results
import matplotlib.pyplot as plt

# One label per row of the results table, built once and shared by all panels.
labels = (df_results["Model"] + " (lr=" + df_results["LR"].astype(str) + ")").tolist()
# Bars are placed at numeric positions and labelled afterwards: using the
# label strings as positions would draw configs that share model type and
# learning rate on top of each other.
positions = list(range(len(labels)))

# (results column, x-axis label, panel title) for each subplot.
panels = [
    ("IC", "Mean IC", "Information Coefficient"),
    ("Sharpe", "Mean Sharpe", "Sharpe Ratio"),
    ("Time (s)", "Seconds", "Training Time"),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, xlabel, title) in zip(axes, panels):
    ax.barh(positions, df_results[column])
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel(xlabel)
    ax.set_title(title)

plt.tight_layout()
# Write the figure to the output folder on Drive before showing it inline.
plt.savefig(os.path.join(OUTPUT_DIR, "comparison.png"), dpi=150, bbox_inches="tight")
plt.show()

## 7. Save Best Models to Drive

In [None]:
import json

# Evaluation attributes written to metrics.json, in output order.
METRIC_NAMES = (
    "mean_ic", "icir", "mean_sharpe", "mean_mdd", "mean_calmar", "mean_hit_ratio",
)

# Save all results: one folder per config holding config.json and
# metrics.json, plus a combined results.json at the top of OUTPUT_DIR.
results_data = []
for i, r in enumerate(results):
    model_dir = os.path.join(OUTPUT_DIR, f"{r.config.model_type}_v{i}")
    os.makedirs(model_dir, exist_ok=True)

    # Save config
    # NOTE(review): if to_json() returns a JSON string rather than a dict,
    # json.dump will double-encode it — confirm against ModelConfig.
    config_json = r.config.to_json()
    with open(os.path.join(model_dir, "config.json"), "w") as f:
        json.dump(config_json, f, indent=2)

    # Save metrics
    # Coerce to built-in float: json cannot serialize non-builtin numeric
    # scalars such as numpy.float32.
    metrics = {name: float(getattr(r.evaluation, name)) for name in METRIC_NAMES}
    metrics["n_folds"] = len(r.evaluation.fold_results)
    with open(os.path.join(model_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    results_data.append({
        "config": config_json,
        "training_time": r.training_time,
        "best_params": r.best_params,
        "metrics": metrics,
    })

    print(f"Saved: {model_dir}")

# Save combined results JSON (for local import)
results_path = os.path.join(OUTPUT_DIR, "results.json")
with open(results_path, "w") as f:
    json.dump(results_data, f, indent=2)

print(f"\nAll results saved to {OUTPUT_DIR}")
print(f"Import locally with: DataExporter.import_results('{results_path}')")

## 8. (Optional) Hyperparameter Search with Optuna

In [None]:
# Run Optuna HP search for the best model type
import math

from training.hyperparameter_search import run_optuna_search

# Pick the config with the highest mean IC. `results[0]` is not guaranteed to
# be the best one (the comparison cell has to sort by IC itself), so select it
# explicitly; a NaN IC is ranked below every real value.
best_result = max(
    results,
    key=lambda r: -math.inf if math.isnan(r.evaluation.mean_ic) else r.evaluation.mean_ic,
)
best_model_type = best_result.config.model_type
print(f"Running Optuna HP search for: {best_model_type}")

best_params, best_ic = run_optuna_search(
    model_type=best_model_type,
    panel=panel,
    target_col=TARGET_COL,
    feature_cols=feature_cols,
    n_trials=20,
)

print(f"\nBest IC: {best_ic:.4f}")
print(f"Best params: {best_params}")