<a href="https://colab.research.google.com/github/lionatzion/PursuitofAlpha/blob/main/backtesting_workflow_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
backtesting_workflow.py

This module provides a sketch of a workflow for downloading market data,
engineering features, training a classification model and running a backtest
using Backtrader. It is designed to be imported into a Google Colab
notebook or executed as a standalone script. It illustrates how to
structure code for testing quantitative trading strategies with Python.

Dependencies:
  pip install pandas numpy yfinance scikit-learn backtrader ta

Note: This code is a simplified example. In practice you should
implement more robust data handling, cross‑validation (e.g. purged
cross‑validation), proper position sizing and risk management.
"""

import subprocess
import sys

def _install_and_import(pkg_name: str) -> None:
    try:
        __import__(pkg_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
        __import__(pkg_name)

# Ensure essential third‑party libraries are available
for _package in ("backtrader", "yfinance", "ta"):
    _install_and_import(_package)

import datetime
import numpy as np
import pandas as pd
import yfinance as yf
from ta import trend, volatility

# Machine learning
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Backtesting
import backtrader as bt

def download_data(tickers, start, end, interval="1h"):
    """Download OHLCV data for a list of tickers using yfinance."""
    data = {}
    for ticker in tickers:
        df = yf.download(ticker, start=start, end=end, interval=interval, progress=False)
        if not df.empty:
            df.dropna(inplace=True)
            data[ticker] = df
    return data

def create_volume_bars(df, vol_target=1e6):
    """Aggregate tick/interval data into volume bars."""
    bars = []
    cum_vol = 0
    bar_rows = []
    for idx, row in df.iterrows():
        bar_rows.append(row)
        cum_vol += row["Volume"]
        if cum_vol >= vol_target:
            bar_df = pd.DataFrame(bar_rows)
            o = bar_df.iloc[0]["Open"]
            h = bar_df["High"].max()
            l = bar_df["Low"].min()
            c = bar_df.iloc[-1]["Close"]
            v = bar_df["Volume"].sum()
            bars.append({"Open": o, "High": h, "Low": l, "Close": c, "Volume": v})
            bar_rows = []
            cum_vol = 0
    return pd.DataFrame(bars)

def add_indicators(df, sma_fast=10, sma_slow=30, rsi_window=14, z_window=50):
    """Add technical indicators (SMA, RSI, Z-score) to a DataFrame."""
    df = df.copy()
    df[f"sma_{sma_fast}"] = df["Close"].rolling(window=sma_fast).mean()
    df[f"sma_{sma_slow}"] = df["Close"].rolling(window=sma_slow).mean()
    df[f"rsi_{rsi_window}"] = volatility.rsi(df["Close"], window=rsi_window)
    df[f"z_score_{z_window}"] = (df["Close"] - df["Close"].rolling(window=z_window).mean()) / df["Close"].rolling(window=z_window).std()
    df.dropna(inplace=True)
    return df

def prepare_features(df, feature_cols, lookahead=3):
    """Prepare feature matrix and labels for classification."""
    df = df.copy()
    df["future_return"] = df["Close"].shift(-lookahead) / df["Close"] - 1
    df["label"] = (df["future_return"] > 0).astype(int)
    X = df[feature_cols].values
    y = df["label"].values
    valid_rows = ~np.isnan(df["future_return"]).values
    return X[valid_rows], y[valid_rows]

def train_classifier(X, y, test_size=0.2, random_state=42):
    """Train a Gradient Boosting classifier and return the trained model."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False, random_state=random_state
    )
    model = GradientBoostingClassifier(max_depth=3, n_estimators=200)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, preds)
    return model, auc

class MLStrategy(bt.Strategy):
    """Backtrader strategy that uses an ML model to generate trading signals."""
    params = dict(
        model=None,
        feature_cols=None,
        lookahead=3,
        buy_threshold=0.6,
        sell_threshold=0.4,
        hold_period=3,
    )

    def __init__(self):
        self.dataclose = self.datas[0].close
        self.bars_since_entry = 0

    def next(self):
        window_data = self.data.close.get(size=max(self.params.lookahead, 50))
        if len(window_data) < max(self.params.lookahead, 50):
            return
        sma_fast = np.mean(window_data[-10:])
        sma_slow = np.mean(window_data[-30:])
        delta = np.diff(window_data[-15:])
        up = delta.copy()
        down = delta.copy()
        up[up < 0] = 0
        down[down > 0] = 0
        avg_gain = up.mean()
        avg_loss = abs(down.mean()) + 1e-8
        rs = avg_gain / avg_loss
        rsi_val = 100 - 100 / (1 + rs)
        mean = np.mean(window_data[-50:])
        std = np.std(window_data[-50:])
        z_score = (window_data[-1] - mean) / std
        features = np.array([[sma_fast, sma_slow, rsi_val, z_score]])
        prob = self.params.model.predict_proba(features)[0, 1]
        if not self.position:
            if prob > self.params.buy_threshold:
                self.buy(size=1)
                self.bars_since_entry = 0
            elif prob < self.params.sell_threshold:
                self.sell(size=1)
                self.bars_since_entry = 0
        else:
            self.bars_since_entry += 1
            if self.bars_since_entry >= self.params.hold_period:
                self.close()
                self.bars_since_entry = 0

def run_backtest(model, data_df, start_cash=100000.0, commission=0.0005):
    """Run a backtest using Backtrader and the provided model."""
    cerebro = bt.Cerebro()
    data_feed = bt.feeds.PandasData(dataname=data_df)
    cerebro.adddata(data_feed)
    cerebro.addstrategy(
        MLStrategy,
        model=model,
        feature_cols=["sma_10", "sma_30", "rsi_14", "z_score_50"],
    )
    cerebro.broker.setcash(start_cash)
    cerebro.broker.setcommission(commission=commission)
    cerebro.run()
    final_value = cerebro.broker.getvalue()
    return final_value

if __name__ == "__main__":
    tickers = ["AAPL"]
    data = download_data(tickers, start="2019-01-01", end="2024-12-31", interval="1h")
    if not data:
        raise SystemExit("No data downloaded.")
    df = create_volume_bars(data[tickers[0]], vol_target=5e6)
    df = add_indicators(df, sma_fast=10, sma_slow=30, rsi_window=14, z_window=50)
    feature_cols = ["sma_10", "sma_30", "rsi_14", "z_score_50"]
    X, y = prepare_features(df, feature_cols)
    model, auc = train_classifier(X, y)
    print(f"Trained model ROC AUC: {auc:.4f}")
    final = run_backtest(model, df)
    print(f"Final portfolio value: {final:.2f}")


  df = yf.download(ticker, start=start, end=end, interval=interval, progress=False)
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFPricesMissingError('possibly delisted; no price data found  (1h 2019-01-01 -> 2024-12-31) (Yahoo error = "1h data not available for startTime=1546318800 and endTime=1735621200. The requested range must be within the last 730 days.")')


SystemExit: No data downloaded.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
