# AI Stock Picks - Model Training Notebook

This notebook trains the ML ensemble models for stock screening.

## What this notebook does:
1. Fetches historical stock data using yfinance
2. Calculates technical indicators (returns, moving averages, RSI, MACD, Bollinger Bands, volume and volatility features)
3. Trains XGBoost, Random Forest, and LightGBM models
4. Saves trained models as .pkl files
5. Provides download links for deployment

## Usage:
1. Run all cells in order
2. Download the generated model files
3. Upload to your server or commit to repo

**Expected Runtime:** 10-30 minutes depending on data size

## Step 1: Install Required Packages

In [None]:
!pip install -q yfinance pandas numpy scikit-learn xgboost lightgbm ta-lib-binary joblib

## Step 2: Import Libraries

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import joblib
import json
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("✅ Libraries imported successfully")

## Step 3: Configuration

In [None]:
# Training configuration: the knobs read by the cells below.
CONFIG = dict(
    # Tickers whose price history is downloaded and pooled for training
    universe=[
        'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'JPM', 'V', 'JNJ',
        'WMT', 'PG', 'MA', 'UNH', 'DIS', 'HD', 'BAC', 'ADBE', 'CRM', 'NFLX',
        'XOM', 'CVX', 'PFE', 'KO', 'PEP', 'COST', 'ABBV', 'MRK', 'TMO', 'AVGO',
        'LLY', 'ABT', 'DHR', 'NKE', 'ACN', 'TXN', 'NEE', 'ORCL', 'MCD', 'QCOM',
        'BMY', 'LIN', 'PM', 'UNP', 'AMD', 'HON', 'UPS', 'LOW', 'RTX', 'INTC',
    ],
    period='3y',       # history window requested from yfinance (3 years)
    forward_days=30,   # horizon of the forward-return target, in rows of price history
    cv_splits=5,       # number of time series cross-validation splits
    min_history=250,   # minimum number of rows a ticker needs to be kept
)

# Echo the key settings so the run is self-describing.
print(
    f"Training on {len(CONFIG['universe'])} stocks",
    f"Period: {CONFIG['period']}",
    f"Target: {CONFIG['forward_days']}-day forward returns",
    sep="\n",
)

## Step 4: Fetch Stock Data

In [None]:
def fetch_stock_data(symbols, period='3y', min_history=None):
    """Fetch historical price data for several tickers via yfinance.

    Args:
        symbols: Iterable of ticker symbols to download.
        period: History window forwarded to ``Ticker.history``.
        min_history: Minimum number of rows a ticker must return to be kept.
            When omitted, ``CONFIG['min_history']`` is used.

    Returns:
        dict mapping each accepted symbol to its history DataFrame. Tickers
        that raise during download or return too few rows are reported on
        stdout and left out of the result.
    """
    if min_history is None:
        min_history = CONFIG['min_history']

    # Materialise once so any iterable works and len() is available for the
    # progress counter.
    symbols = list(symbols)
    data = {}
    failed = []

    for i, symbol in enumerate(symbols, 1):
        try:
            print(f"[{i}/{len(symbols)}] Fetching {symbol}...", end=' ')
            ticker = yf.Ticker(symbol)
            hist = ticker.history(period=period)

            if len(hist) >= min_history:
                data[symbol] = hist
                print(f"✅ {len(hist)} days")
            else:
                print(f"❌ Insufficient data ({len(hist)} days)")
                failed.append(symbol)
        except Exception as e:
            # Deliberately best-effort: one failing ticker must not abort the
            # whole download; it is reported and skipped instead.
            print(f"❌ Error: {e}")
            failed.append(symbol)

    print(f"\n✅ Successfully loaded {len(data)} stocks")
    if failed:
        print(f"❌ Failed: {', '.join(failed)}")

    return data

# Download price history for every ticker in the configured universe
stock_data = fetch_stock_data(
    symbols=CONFIG['universe'],
    period=CONFIG['period'],
)

## Step 5: Calculate Technical Indicators

In [None]:
def calculate_rsi(prices, period=14):
    """Relative Strength Index computed from simple rolling means.

    Args:
        prices: pandas Series of prices.
        period: Rolling window length for the average gain and loss.

    Returns:
        Series of RSI values; the leading ``period - 1`` entries are NaN.
    """
    changes = prices.diff()
    # Split the changes into up-moves and (sign-flipped) down-moves; anything
    # that does not match the condition, including the leading NaN, becomes 0.
    ups = changes.where(changes > 0, 0)
    downs = -changes.where(changes < 0, 0)

    avg_up = ups.rolling(window=period).mean()
    avg_down = downs.rolling(window=period).mean()

    relative_strength = avg_up / avg_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """MACD histogram: the MACD line minus its signal line.

    Args:
        prices: pandas Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Series with the difference between the MACD line and its signal line.
    """
    macd_line = prices.ewm(span=fast).mean() - prices.ewm(span=slow).mean()
    smoothed = macd_line.ewm(span=signal).mean()
    return macd_line - smoothed

def calculate_bollinger_bands(prices, period=20, std_dev=2):
    """Position of the price inside its Bollinger Bands.

    Args:
        prices: pandas Series of prices.
        period: Rolling window used for the mean and standard deviation.
        std_dev: Band half-width expressed in rolling standard deviations.

    Returns:
        Series equal to 0 at the lower band and 1 at the upper band; values
        outside that range mean the price is outside the bands.
    """
    window = prices.rolling(window=period)
    middle = window.mean()
    half_width = window.std() * std_dev

    upper_band = middle + half_width
    lower_band = middle - half_width
    return (prices - lower_band) / (upper_band - lower_band)

def calculate_features(df):
    """Build the technical feature table for one ticker.

    Args:
        df: Price history with 'Close', 'High', 'Low' and 'Volume' columns.

    Returns:
        DataFrame indexed like ``df`` with one column per feature. Rows at
        the start of the history are NaN until each rolling window is full.
    """
    close = df['Close']
    high = df['High']
    low = df['Low']
    volume = df['Volume']
    daily_returns = close.pct_change()

    features = pd.DataFrame(index=df.index)

    # Trailing returns over 1, 5, 20 and 60 rows
    features['returns_1d'] = daily_returns
    for horizon in (5, 20, 60):
        features[f'returns_{horizon}d'] = close.pct_change(horizon)

    # Simple moving averages, expressed relative to the current close
    for window in (5, 10, 20, 50, 200):
        features[f'sma_{window}'] = close.rolling(window).mean() / close - 1

    # Oscillators and band position
    features['rsi_14'] = calculate_rsi(close, 14)
    features['rsi_7'] = calculate_rsi(close, 7)
    features['macd'] = calculate_macd(close)
    features['bb_position'] = calculate_bollinger_bands(close)

    # Volume relative to its 20-row average, and its 5-row change
    features['volume_ratio'] = volume / volume.rolling(20).mean()
    features['volume_trend'] = volume.pct_change(5)

    # Rolling standard deviation of daily returns
    features['volatility_20d'] = daily_returns.rolling(20).std()
    features['volatility_60d'] = daily_returns.rolling(60).std()

    # Intraday range, and where the close sits inside it
    # (a zero High-Low range makes close_position NaN or infinite)
    day_range = high - low
    features['hl_ratio'] = day_range / close
    features['close_position'] = (close - low) / day_range

    return features

print("Calculating technical indicators for all stocks...")

# One feature table per ticker, keyed by symbol
features_dict = {}
for symbol in stock_data:
    df = stock_data[symbol]
    features_dict[symbol] = calculate_features(df)
    print(f"✅ {symbol}: {len(features_dict[symbol].columns)} features")

print(f"\n✅ Features calculated for {len(features_dict)} stocks")

## Step 6: Create Training Dataset

In [None]:
def create_training_data(stock_data, features_dict, forward_days=30):
    """Create the pooled training dataset with forward returns as target.

    Args:
        stock_data: dict mapping symbol to a price DataFrame with a 'Close'
            column.
        features_dict: dict mapping symbol to its feature DataFrame, indexed
            like the matching price DataFrame.
        forward_days: Number of rows to look ahead when computing the
            forward return ``Close[t + forward_days] / Close[t] - 1``.

    Returns:
        Tuple ``(X, y, meta)``: the stacked feature rows, the matching
        forward returns, and a DataFrame with the 'symbol' and 'date' of
        every row (sharing X's index).

    Raises:
        ValueError: If no ticker contributes a single usable row.
    """
    all_features = []
    all_targets = []
    all_symbols = []
    all_dates = []

    for symbol, history in stock_data.items():
        prices = history['Close']
        features = features_dict[symbol]

        # Calculate forward returns (target)
        forward_returns = prices.shift(-forward_days) / prices - 1

        # Keep only rows where every feature and the target are finite.
        # notna() alone would let +/-inf through (e.g. from a division by a
        # zero High-Low range), which the scaler and models cannot handle.
        finite_features = np.isfinite(features).all(axis=1)
        valid_idx = finite_features & np.isfinite(forward_returns)

        if not valid_idx.any():
            continue

        valid_features = features[valid_idx]
        all_features.append(valid_features)
        all_targets.append(forward_returns[valid_idx])
        all_symbols.extend([symbol] * len(valid_features))
        all_dates.extend(valid_features.index.tolist())

    if not all_features:
        raise ValueError(
            "No valid training samples: every row has a missing/non-finite "
            "feature or target"
        )

    # Combine all data
    X = pd.concat(all_features, axis=0)
    y = pd.concat(all_targets, axis=0)

    # Row-aligned metadata (which ticker and date each sample came from)
    meta = pd.DataFrame({
        'symbol': all_symbols,
        'date': all_dates
    }, index=X.index)

    return X, y, meta

print("Creating training dataset...")
X, y, meta = create_training_data(
    stock_data,
    features_dict,
    forward_days=CONFIG['forward_days'],
)

# Size and coverage of the pooled dataset
print(
    f"\n✅ Training data created:",
    f"   Samples: {len(X):,}",
    f"   Features: {X.shape[1]}",
    f"   Stocks: {meta['symbol'].nunique()}",
    f"   Date range: {meta['date'].min()} to {meta['date'].max()}",
    sep="\n",
)

# Distribution of the forward-return target
print(f"\n📊 Target statistics:")
print(y.describe())

## Step 7: Feature Scaling & Preparation

In [None]:
# Fill any NaN that is still present: feature columns with their median,
# the target with 0
X = X.fillna(X.median())
y = y.fillna(0)

# Fit the scaler on the feature matrix and keep the result as a DataFrame
# with the original column names and index
scaler = RobustScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Column order of the training matrix
feature_names = list(X.columns)

print(f"✅ Features scaled using RobustScaler")
print(f"✅ {len(feature_names)} feature names stored")

## Step 8: Chronological Train/Test Split

In [None]:
# Sort by date for the chronological split. A stable sort keeps the row order
# reproducible when many tickers share the same date.
sort_idx = meta['date'].argsort(kind='stable').to_numpy()
X_sorted = X_scaled.iloc[sort_idx]
y_sorted = y.iloc[sort_idx]
dates_sorted = meta['date'].iloc[sort_idx].reset_index(drop=True)

# Use roughly 80% for training, 20% for final validation. The test set starts
# on the date of the row at the 80% mark so that a single date is never
# divided between the two sets.
split_idx = int(len(X_sorted) * 0.8)
test_start = dates_sorted.iloc[split_idx]

# Purge the training rows whose target window reaches into the test period:
# the target is the return over the next CONFIG['forward_days'] rows, so the
# last forward_days dates before test_start carry labels computed from
# test-period prices.
unique_dates = pd.Index(dates_sorted.unique())  # already in chronological order
purge_pos = unique_dates.get_loc(test_start) - CONFIG['forward_days']
if purge_pos <= 0:
    raise ValueError(
        "Not enough history to leave a training set after purging "
        f"{CONFIG['forward_days']} dates before the test period"
    )
train_end = unique_dates[purge_pos]  # exclusive upper bound for training dates

train_mask = (dates_sorted < train_end).to_numpy()
test_mask = (dates_sorted >= test_start).to_numpy()

X_train, X_test = X_sorted[train_mask], X_sorted[test_mask]
y_train, y_test = y_sorted[train_mask], y_sorted[test_mask]

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print(f"Purged (label overlap with test period): {len(X_sorted) - len(X_train) - len(X_test):,} samples")
print(f"\n✅ Time series split complete")

## Step 9: Train XGBoost Model

In [None]:
print("Training XGBoost model...\n")

xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    'early_stopping_rounds': 50
}

# Early stopping selects the number of trees from eval_set, so eval_set must
# not be the test set the metrics below are reported on. Hold out the most
# recent 15% of the date-ordered training rows as a validation set instead.
val_start = int(len(X_train) * 0.85)
X_fit, y_fit = X_train.iloc[:val_start], y_train.iloc[:val_start]
X_val, y_val = X_train.iloc[val_start:], y_train.iloc[val_start:]

xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=50  # print the validation metric every 50 boosting rounds
)

# Evaluate on the untouched test set
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"\n✅ XGBoost Training Complete")
print(f"   MSE: {mse_xgb:.6f}")
print(f"   MAE: {mae_xgb:.6f}")
print(f"   R²: {r2_xgb:.4f}")

## Step 10: Train Random Forest Model

In [None]:
print("Training Random Forest model...\n")

# Hyper-parameters of the forest
rf_params = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the held-out set with the same three metrics as the other models
y_pred_rf = rf_model.predict(X_test)
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"\n✅ Random Forest Training Complete",
    f"   MSE: {mse_rf:.6f}",
    f"   MAE: {mae_rf:.6f}",
    f"   R²: {r2_rf:.4f}",
    sep="\n",
)

## Step 11: Train LightGBM Model

In [None]:
print("Training LightGBM model...\n")

lgb_params = {
    'objective': 'regression',
    'max_depth': 8,
    'learning_rate': 0.05,
    'n_estimators': 500,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; without this the 'subsample' setting above is ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    # 'verbose' is a log level here (values > 1 mean debug output), not a
    # print interval; periodic metric logging is done by the callback below.
    'verbose': -1
}

lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(
    X_train, y_train,
    # Monitoring only: no early stopping is configured, so the test set does
    # not influence the fitted model.
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.log_evaluation(period=50)],  # log the metric every 50 rounds
)

# Evaluate
y_pred_lgb = lgb_model.predict(X_test)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

print(f"\n✅ LightGBM Training Complete")
print(f"   MSE: {mse_lgb:.6f}")
print(f"   MAE: {mae_lgb:.6f}")
print(f"   R²: {r2_lgb:.4f}")

## Step 12: Evaluate Ensemble Performance

In [None]:
# Blend weights of the three models (can be tuned)
WEIGHTS = dict(xgboost=0.35, random_forest=0.25, lightgbm=0.40)

# Weighted sum of the individual test-set predictions, accumulated in the
# order XGBoost, Random Forest, LightGBM
y_pred_ensemble = WEIGHTS['xgboost'] * y_pred_xgb
y_pred_ensemble = y_pred_ensemble + WEIGHTS['random_forest'] * y_pred_rf
y_pred_ensemble = y_pred_ensemble + WEIGHTS['lightgbm'] * y_pred_lgb

# Score the blend with the same metrics as the single models
mse_ensemble, mae_ensemble, r2_ensemble = (
    metric(y_test, y_pred_ensemble)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Side-by-side report
print(
    "\n" + "=" * 60,
    "📊 FINAL RESULTS",
    "=" * 60,
    f"\nXGBoost:       MSE={mse_xgb:.6f}, MAE={mae_xgb:.6f}, R²={r2_xgb:.4f}",
    f"Random Forest: MSE={mse_rf:.6f}, MAE={mae_rf:.6f}, R²={r2_rf:.4f}",
    f"LightGBM:      MSE={mse_lgb:.6f}, MAE={mae_lgb:.6f}, R²={r2_lgb:.4f}",
    f"\n🎯 ENSEMBLE:    MSE={mse_ensemble:.6f}, MAE={mae_ensemble:.6f}, R²={r2_ensemble:.4f}",
    "\n" + "=" * 60,
    sep="\n",
)

## Step 13: Save Models and Metadata

In [None]:
import os

# Output directory for every artifact written below
os.makedirs('trained_models', exist_ok=True)

# Persist the three fitted models and the fitted scaler
print("Saving models...")
for artifact, filename in (
    (xgb_model, 'trained_models/xgboost.pkl'),
    (rf_model, 'trained_models/random_forest.pkl'),
    (lgb_model, 'trained_models/lightgbm.pkl'),
    (scaler, 'trained_models/feature_engineer.pkl'),
):
    joblib.dump(artifact, filename)
print("✅ Models saved")

# Everything needed to interpret the pickles: configuration, feature order,
# sample counts, blend weights and the test-set metrics of each model
metadata = {
    'trained_at': datetime.now().isoformat(),
    'config': CONFIG,
    'feature_names': feature_names,
    'feature_count': len(feature_names),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'ensemble_weights': WEIGHTS,
    'metrics': {
        model_name: {'mse': float(mse), 'mae': float(mae), 'r2': float(r2)}
        for model_name, (mse, mae, r2) in (
            ('xgboost', (mse_xgb, mae_xgb, r2_xgb)),
            ('random_forest', (mse_rf, mae_rf, r2_rf)),
            ('lightgbm', (mse_lgb, mae_lgb, r2_lgb)),
            ('ensemble', (mse_ensemble, mae_ensemble, r2_ensemble)),
        )
    },
    'torch_available': False
}

with open('trained_models/metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=2))
print("✅ Metadata saved")

# Final summary of what was written
print(
    "\n" + "=" * 60,
    "✅ ALL MODELS SAVED SUCCESSFULLY",
    "=" * 60,
    "\nFiles created:",
    "  📁 trained_models/xgboost.pkl",
    "  📁 trained_models/random_forest.pkl",
    "  📁 trained_models/lightgbm.pkl",
    "  📁 trained_models/feature_engineer.pkl",
    "  📁 trained_models/metadata.json",
    "\n📥 Download these files and upload to your server's ml_models/ directory",
    sep="\n",
)

## Step 14: Test Prediction on Sample Data

In [None]:
# Sanity check: run every model on the first few test rows
sample_features = X_test.head(10)
sample_actual = y_test.head(10)

# Get predictions from all models
pred_xgb = xgb_model.predict(sample_features)
pred_rf = rf_model.predict(sample_features)
pred_lgb = lgb_model.predict(sample_features)
pred_ensemble = (
    WEIGHTS['xgboost'] * pred_xgb +
    WEIGHTS['random_forest'] * pred_rf +
    WEIGHTS['lightgbm'] * pred_lgb
)

# Display actual vs predicted forward returns side by side
results = pd.DataFrame({
    'Actual': sample_actual.values,
    'Ensemble': pred_ensemble,
    'XGBoost': pred_xgb,
    'RandomForest': pred_rf,
    'LightGBM': pred_lgb
})

# The horizon in the title comes from the configuration so the label stays
# correct when CONFIG['forward_days'] is changed.
print(f"\n📊 Sample Predictions ({CONFIG['forward_days']}-day forward returns):")
print(results.round(4))
print("\n✅ Models are ready for deployment!")

## Step 15: Download Instructions

### In Google Colab:
1. Look for the `trained_models` folder in the file browser (left sidebar)
2. Right-click on each file and select "Download"
3. Download all 5 files:
   - `xgboost.pkl`
   - `random_forest.pkl`
   - `lightgbm.pkl`
   - `feature_engineer.pkl`
   - `metadata.json`

### Deployment:
1. Upload these files to your server's `ml_models/` directory
2. Or commit them to your GitHub repo (if < 100MB each)
3. Update your environment variable: `ML_MODELS_ENABLED=true`
4. Set `use_ml=True` in your pattern detector

### Alternative - Download as ZIP:

In [None]:
# Create a zip file for easy download
import shutil

shutil.make_archive('trained_models', 'zip', 'trained_models')
print("✅ Created trained_models.zip")
print("📥 Download this file from the Colab file browser")

# google.colab is only importable inside Colab; guard the import so the cell
# (and Restart & Run All) also succeeds in a local Jupyter kernel.
try:
    from google.colab import files
except ImportError:
    print("ℹ️ Not running in Colab - trained_models.zip was written to the working directory")
else:
    files.download('trained_models.zip')