In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Chart + ML-Driven Pattern Trading (NSE) — Patched
=================================================

Fixes:
- vectorbt `init_cash` dict TypeError -> use cash_sharing=False & scalar per-column pocket
- Separate long_entries and short_entries (direction='both')
"""

from __future__ import annotations
import os, math, warnings, logging
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

try:
    import yfinance as yf
    import vectorbt as vbt
except Exception:
    print("Install deps: pip install yfinance vectorbt scikit-learn scipy")
    pass

from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# =========================
# CONFIG
# =========================
@dataclass
class CFG:
    """Run-wide settings for data download, pattern detection, labeling, ML and backtest.

    Note that the module rebinds the name ``CFG`` to an *instance* of this class
    right after the definition, so later code reads attributes from ``CFG``
    directly rather than instantiating it again.
    """

    # Universe (append .NS automatically if not present)
    tickers: Optional[List[str]] = None  # no default universe; supplied at instantiation below
    period: str = "3y"      # forwarded to yf.download(period=...)
    interval: str = "1d"    # forwarded to yf.download(interval=...)

    # Pattern geometry params
    peak_distance: int = 5          # `distance` argument given to scipy find_peaks
    peak_prominence: float = 0.0    # `prominence` argument given to scipy find_peaks
    twin_tolerance: float = 0.025   # max relative gap between the two tops/bottoms of a double pattern
    hs_min_sep: int = 5             # min bar count between neighbouring head-and-shoulders peaks

    # Labeling (forward return)
    fwd_horizon: int = 10           # bars after entry used for the forward return
    label_threshold: float = 0.02   # forward move (in the trade direction) needed for label y=1

    # ML
    train_ratio: float = 0.7        # fraction of time-ordered examples used for training
    clf_n_estimators: int = 400     # RandomForestClassifier n_estimators
    clf_max_depth: Optional[int] = None  # RandomForestClassifier max_depth
    proba_threshold: float = 0.55   # examples scored below this are not traded

    # Backtest
    capital_per_stock: float = 50_000.0  # per-column pocket (cash_sharing=False); also used as order value
    sl_pct: float = 0.03            # passed to vectorbt as sl_stop
    tp_pct: float = 0.06            # passed to vectorbt as tp_stop
    hold_max_bars: int = 20         # time exit: entry mask shifted by this many bars

    # IO
    out_dir: str = "outputs"        # directory receiving patterns.csv and trades.csv

# NOTE: this rebinding shadows the class with its single instance; from here on
# `CFG` is the live configuration object, not the dataclass type.
CFG = CFG(
    tickers=["RELIANCE", "TCS", "INFY", "HDFCBANK", "ICICIBANK"],
)

# =========================
# UTILS
# =========================
def add_ns_suffix(t: str) -> str:
    """Return ``t`` with ``.NS`` appended, unless it already ends with ``.NS``."""
    suffix = ".NS"
    if t.endswith(suffix):
        return t
    return f"{t}{suffix}"

def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential moving average of ``series`` using the recursive (adjust=False) form."""
    smoother = series.ewm(span=span, adjust=False)
    return smoother.mean()

def rsi(series: pd.Series, length: int = 14) -> pd.Series:
    """Relative Strength Index with exponential (alpha = 1/length) smoothing.

    Args:
        series: Price series.
        length: Smoothing length; the averages use ``alpha = 1 / length``.

    Returns:
        Series of RSI values in [0, 100]. Bars where the index is undefined
        (the first bar, or no movement at all so far) are filled with 50.
    """
    delta = series.diff()
    gains = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    avg_gain = gains.ewm(alpha=1 / length, adjust=False).mean()
    avg_loss = losses.ewm(alpha=1 / length, adjust=False).mean()

    # A zero average loss would divide by zero; turn it into NaN first.
    rs = avg_gain / avg_loss.replace(0, np.nan)
    out = 100 - (100 / (1 + rs))

    # Gains with no losses at all means RSI is saturated at 100. Without this
    # the NaN from the division above would be reported as a neutral 50.
    saturated = (avg_loss == 0) & (avg_gain > 0)
    out[saturated] = 100.0

    return out.fillna(50)

def annualized_vol(returns: pd.Series, periods_per_year: int = 252) -> float:
    """Scale the standard deviation of ``returns`` by sqrt(periods_per_year)."""
    scale = math.sqrt(periods_per_year)
    return returns.std() * scale

# =========================
# DATA
# =========================
def fetch_data(cfg: CFG) -> Dict[str, pd.DataFrame]:
    """Download OHLCV history for every configured ticker and add indicator columns.

    Args:
        cfg: Configuration providing ``tickers``, ``period`` and ``interval``.

    Returns:
        Mapping of ``.NS``-suffixed ticker to a DataFrame with columns
        Open/High/Low/Close/Volume plus ``ret``, ``ema20``, ``ema50`` and
        ``rsi14``. Tickers that return no rows, or lack a required OHLCV
        column, are skipped.
    """
    required = ["Open", "High", "Low", "Close", "Volume"]
    data: Dict[str, pd.DataFrame] = {}

    # `tickers` defaults to None on the config dataclass; treat that as an
    # empty universe instead of failing with a TypeError on iteration.
    for raw in (cfg.tickers or []):
        t = add_ns_suffix(raw)
        df = yf.download(t, period=cfg.period, interval=cfg.interval, progress=False, multi_level_index=False)
        if df is None or df.empty:
            continue
        df = df.rename(columns=str.title)

        # One malformed download should not abort the whole run with a KeyError.
        missing = [c for c in required if c not in df.columns]
        if missing:
            logging.warning("Skipping %s: missing columns %s", t, missing)
            continue

        df = df[required].dropna().copy()
        df["ret"] = df["Close"].pct_change()
        df["ema20"] = ema(df["Close"], 20)
        df["ema50"] = ema(df["Close"], 50)
        df["rsi14"] = rsi(df["Close"], 14)
        data[t] = df
    return data

# =========================
# PATTERN DETECTORS
# =========================
def swing_points(series: pd.Series, distance: int, prominence: float=0.0):
    """Return positional indices of local maxima and local minima of ``series``.

    Minima are found as maxima of the negated series; both searches use the
    same ``distance`` and ``prominence`` settings.
    """
    search = dict(distance=distance, prominence=prominence)
    maxima, _ = find_peaks(series.values, **search)
    minima, _ = find_peaks((-series).values, **search)
    return maxima, minima

def detect_double_top(df: pd.DataFrame, cfg: CFG) -> List[Dict]:
    """Find pairs of consecutive High peaks of nearly equal height.

    Two neighbouring peaks qualify when their relative height difference is at
    most ``cfg.twin_tolerance``. The neckline is the lowest Low between the
    two peaks (inclusive). Each hit is reported as a short-direction pattern.
    """
    highs = df["High"]
    peak_pos, _ = find_peaks(highs.values, distance=cfg.peak_distance, prominence=cfg.peak_prominence)

    found = []
    for first, second in zip(peak_pos[:-1], peak_pos[1:]):
        top_a = highs.iloc[first]
        top_b = highs.iloc[second]
        rel_gap = abs(top_a - top_b) / ((top_a + top_b) / 2)
        if not rel_gap <= cfg.twin_tolerance:
            continue
        trough_label = df["Low"].iloc[first:second + 1].idxmin()
        found.append({
            "pattern": "DoubleTop",
            "p1": int(first),
            "p2": int(second),
            "p1_ts": df.index[first],
            "p2_ts": df.index[second],
            "neckline": float(df.loc[trough_label, "Low"]),
            "dir": "short",
        })
    return found

def detect_double_bottom(df: pd.DataFrame, cfg: CFG) -> List[Dict]:
    """Find pairs of consecutive Low troughs of nearly equal depth.

    Two neighbouring troughs qualify when their relative difference is at most
    ``cfg.twin_tolerance``. The neckline is the highest High between the two
    troughs (inclusive). Each hit is reported as a long-direction pattern.
    """
    lows = df["Low"]
    trough_pos, _ = find_peaks((-lows).values, distance=cfg.peak_distance, prominence=cfg.peak_prominence)

    found = []
    for first, second in zip(trough_pos[:-1], trough_pos[1:]):
        low_a = lows.iloc[first]
        low_b = lows.iloc[second]
        rel_gap = abs(low_a - low_b) / ((low_a + low_b) / 2)
        if not rel_gap <= cfg.twin_tolerance:
            continue
        crest_label = df["High"].iloc[first:second + 1].idxmax()
        found.append({
            "pattern": "DoubleBottom",
            "t1": int(first),
            "t2": int(second),
            "t1_ts": df.index[first],
            "t2_ts": df.index[second],
            "neckline": float(df.loc[crest_label, "High"]),
            "dir": "long",
        })
    return found

def detect_head_shoulders(df: pd.DataFrame, cfg: CFG) -> List[Dict]:
    """Find triples of consecutive High peaks whose middle peak is the tallest.

    Neighbouring peaks must be at least ``cfg.hs_min_sep`` bars apart. The
    neckline is the average of the lowest Low on each side of the head.
    Each hit is reported as a short-direction pattern.
    """
    highs = df["High"]
    lows = df["Low"]
    peak_pos, _ = find_peaks(highs.values, distance=cfg.peak_distance, prominence=cfg.peak_prominence)

    matches = []
    for left, head, right in zip(peak_pos, peak_pos[1:], peak_pos[2:]):
        too_close = (head - left < cfg.hs_min_sep) or (right - head < cfg.hs_min_sep)
        if too_close:
            continue

        h_left, h_head, h_right = highs.iloc[left], highs.iloc[head], highs.iloc[right]
        if h_head > h_left and h_head > h_right:
            left_floor = lows.iloc[left:head + 1].min()
            right_floor = lows.iloc[head:right + 1].min()
            matches.append({
                "pattern": "HeadShoulders",
                "L": int(left),
                "H": int(head),
                "R": int(right),
                "L_ts": df.index[left],
                "H_ts": df.index[head],
                "R_ts": df.index[right],
                "neckline": float((left_floor + right_floor) / 2),
                "dir": "short",
            })
    return matches

# =========================
# FEATURES + LABELS
# =========================
def window_features(df: pd.DataFrame, idxs: List[int]) -> Dict[str, float]:
    """Summarise the bars between the smallest and largest position in ``idxs``.

    The window is inclusive on both ends. Returned keys: ``win_len``,
    ``win_range`` (high-low span relative to the low), ``ann_vol`` (0 when the
    window has 3 or fewer returns), ``slope_close`` and ``slope_ema20``
    (end-minus-start change per bar), ``rsi_end`` and ``vol_rel`` (last volume
    over the mean non-zero volume, 1.0 when that mean is unusable).
    """
    start, stop = min(idxs), max(idxs)
    win = df.iloc[start:stop + 1]
    n_steps = max(1, len(win) - 1)

    # Price span of the window, relative to its lowest low.
    top = win["High"].max()
    bottom = win["Low"].min()
    if bottom > 0:
        rel_range = (top - bottom) / bottom
    else:
        rel_range = 0

    # Volatility is only meaningful with more than three returns.
    closes = win["Close"]
    bar_returns = closes.pct_change().dropna()
    if len(bar_returns) > 3:
        ann_vol = annualized_vol(bar_returns)
    else:
        ann_vol = 0

    ema_line = win["ema20"]
    close_slope = (closes.iloc[-1] - closes.iloc[0]) / n_steps
    ema_slope = (ema_line.iloc[-1] - ema_line.iloc[0]) / n_steps

    last_rsi = win["rsi14"].iloc[-1]

    # Zero-volume bars are excluded from the mean so they do not drag it down.
    volume = win["Volume"]
    mean_volume = volume.replace(0, np.nan).mean()
    last_volume = volume.iloc[-1] if not win.empty else np.nan
    if mean_volume and not np.isnan(mean_volume):
        rel_volume = last_volume / mean_volume
    else:
        rel_volume = 1.0

    return {
        "win_len": len(win),
        "win_range": rel_range,
        "ann_vol": ann_vol,
        "slope_close": close_slope,
        "slope_ema20": ema_slope,
        "rsi_end": last_rsi,
        "vol_rel": rel_volume,
    }

def _pattern_example(
    ticker: str,
    df: pd.DataFrame,
    cfg: CFG,
    pattern: str,
    direction: str,
    lo: int,
    hi: int,
    neckline: float,
) -> Optional[Dict]:
    """Build one labeled example row for a pattern spanning bars ``lo..hi``.

    Entry is the bar right after the pattern's last point. Returns ``None``
    when the pattern ends on the final bar, so no entry bar exists.
    """
    entry_idx = hi + 1
    if entry_idx >= len(df):
        return None

    feats = window_features(df, [lo, hi])
    close = df["Close"]
    entry_px = close.iloc[entry_idx]

    # NOTE: the horizon is clipped at the last available bar, so patterns near
    # the end of the data are labeled on a shorter forward window.
    fwd_end = min(len(df) - 1, entry_idx + cfg.fwd_horizon)
    fwd_ret = (close.iloc[fwd_end] - entry_px) / entry_px

    # A "success" is a move of at least label_threshold in the trade direction.
    if direction == "short":
        hit = fwd_ret <= -cfg.label_threshold
    else:
        hit = fwd_ret >= cfg.label_threshold

    return dict(
        ticker=ticker, pattern=pattern, dir=direction,
        entry_idx=entry_idx, entry_ts=df.index[entry_idx], entry_px=float(entry_px),
        neckline=neckline, fwd_ret=float(fwd_ret), y=1 if hit else 0, **feats
    )


def make_examples(ticker: str, df: pd.DataFrame, cfg: CFG) -> pd.DataFrame:
    """Detect all supported patterns in ``df`` and turn them into labeled rows.

    Args:
        ticker: Symbol stored in the ``ticker`` column of every row.
        df: Price frame with OHLCV and the indicator columns used by
            ``window_features``.
        cfg: Configuration with detector, horizon and threshold settings.

    Returns:
        DataFrame with one row per pattern that has an entry bar, ordered
        double tops, then double bottoms, then head-and-shoulders. Columns are
        ticker, pattern, dir, entry_idx, entry_ts, entry_px, neckline,
        fwd_ret, y and the window features. Empty when nothing is detected.
    """
    # (pattern name, trade direction, detector, key of first point, key of last point)
    specs = (
        ("DoubleTop", "short", detect_double_top, "p1", "p2"),
        ("DoubleBottom", "long", detect_double_bottom, "t1", "t2"),
        ("HeadShoulders", "short", detect_head_shoulders, "L", "R"),
    )

    examples = []
    for pattern, direction, detector, lo_key, hi_key in specs:
        for p in detector(df, cfg):
            row = _pattern_example(
                ticker, df, cfg, pattern, direction,
                p[lo_key], p[hi_key], p["neckline"],
            )
            if row is not None:
                examples.append(row)

    return pd.DataFrame(examples)

# =========================
# ML TRAIN / SCORE
# =========================
# Columns fed to the classifier as the model input matrix. Every name here is
# a key of the dict returned by `window_features`.
FEATURES = [
    "win_len","win_range","ann_vol","slope_close","slope_ema20","rsi_end","vol_rel"
]

def time_split(df: pd.DataFrame, ratio: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` chronologically by ``entry_ts`` into (earlier, later) copies.

    The first ``int(len(df) * ratio)`` rows after sorting form the earlier
    part; the remainder forms the later part. The input is not modified.
    """
    ordered = df.sort_values("entry_ts")
    cut = int(len(ordered) * ratio)
    earlier = ordered.iloc[:cut].copy()
    later = ordered.iloc[cut:].copy()
    return earlier, later

def _positive_class_proba(clf, X) -> np.ndarray:
    """Return P(y == 1) for each row of ``X``, whatever classes the model saw."""
    proba = clf.predict_proba(X)
    classes = list(clf.classes_)
    if 1 in classes:
        return proba[:, classes.index(1)]
    # The model was fitted on negatives only, so it has no column for class 1.
    return np.zeros(proba.shape[0])


def train_and_score(df_all: pd.DataFrame, cfg: CFG) -> pd.DataFrame:
    """Fit a random forest on the earliest examples and score every example.

    Args:
        df_all: Examples with the ``FEATURES`` columns, ``y`` and ``entry_ts``.
        cfg: Configuration with ``train_ratio`` and the classifier settings.

    Returns:
        ``df_all`` sorted by ``entry_ts`` with an added ``ml_proba`` column.
        An empty input is returned unchanged.

    Raises:
        ValueError: If the chronological split leaves no training rows.
    """
    if df_all.empty:
        return df_all

    # time_split already sorts by entry_ts and returns independent copies.
    train, test = time_split(df_all, cfg.train_ratio)
    if train.empty:
        raise ValueError(
            f"Not enough examples to train: {len(df_all)} row(s) with "
            f"train_ratio={cfg.train_ratio} leaves an empty training set."
        )

    Xtr = train[FEATURES].values
    ytr = train["y"].values

    clf = RandomForestClassifier(
        n_estimators=cfg.clf_n_estimators,
        max_depth=cfg.clf_max_depth,
        random_state=42
    )
    clf.fit(Xtr, ytr)

    # Look the positive-class column up by label: with single-class training
    # data predict_proba has only one column and `[:, 1]` raises IndexError.
    # NOTE: training rows receive in-sample probabilities here; any consumer
    # that trades on `ml_proba` for those rows is using fitted, not
    # out-of-sample, scores.
    proba_tr = _positive_class_proba(clf, Xtr)
    if test.empty:
        proba_te = np.array([], dtype=float)
    else:
        proba_te = _positive_class_proba(clf, test[FEATURES].values)

    try:
        auc_tr = roc_auc_score(ytr, proba_tr) if len(np.unique(ytr)) > 1 else np.nan
        auc_te = roc_auc_score(test["y"], proba_te) if len(np.unique(test["y"])) > 1 else np.nan
    except ValueError as err:
        # AUC is diagnostic only; report the problem rather than hiding it.
        logging.warning("Could not compute AUC: %s", err)
    else:
        logging.info("RF AUC train=%.3f, test=%.3f", auc_tr, auc_te)

    train["ml_proba"] = proba_tr
    test["ml_proba"] = proba_te
    return pd.concat([train, test], axis=0).sort_values("entry_ts")

# =========================
# SIGNALS + BACKTEST (vectorbt)
# =========================
def build_signals_and_backtest(raw_data: Dict[str,pd.DataFrame], scored: pd.DataFrame, cfg: CFG):
    """Turn scored pattern examples into entry signals and run a vectorbt backtest.

    Args:
        raw_data: Mapping of ticker to its price frame (needs a ``Close`` column).
        scored: Examples with ``ticker``, ``entry_ts``, ``dir``, ``neckline``
            and ``ml_proba`` columns.
        cfg: Configuration with the probability threshold, stop levels,
            holding period and per-ticker capital.

    Returns:
        ``(trades, pf)``. ``trades`` is the readable trade table with renamed
        columns (empty when nothing traded) and ``pf`` is the vectorbt
        portfolio, or ``None`` when there was nothing to backtest.
    """
    if not raw_data or scored.empty:
        print("No data/signals to backtest.")
        return pd.DataFrame(), None

    prices = pd.DataFrame({t: df["Close"] for t, df in raw_data.items()}).dropna(how="all")

    # Build entry signals per ticker
    long_entries_map, short_entries_map = {}, {}
    for t, df in raw_data.items():
        e_long = pd.Series(False, index=df.index)
        e_short = pd.Series(False, index=df.index)

        sub = scored[scored["ticker"] == t]
        for _, row in sub.iterrows():
            ts = row["entry_ts"]
            if ts not in df.index or row["ml_proba"] < cfg.proba_threshold:
                continue
            i = df.index.get_loc(ts)
            close_i = df["Close"].iloc[i]
            # Only act once price has closed beyond the neckline in the trade direction.
            if row["dir"] == "long":
                if close_i > row["neckline"]:
                    e_long.iloc[i] = True
            else:
                if close_i < row["neckline"]:
                    e_short.iloc[i] = True

        long_entries_map[t] = e_long
        short_entries_map[t] = e_short

    # `fill_value=False` keeps the masks boolean; reindex followed by
    # fillna(False) would pass through an object-dtype column first.
    long_entries = pd.DataFrame(
        {t: s.reindex(prices.index, fill_value=False) for t, s in long_entries_map.items()}
    )
    short_entries = pd.DataFrame(
        {t: s.reindex(prices.index, fill_value=False) for t, s in short_entries_map.items()}
    )

    # --------- TIME-BASED EXITS (replace unsupported max_hold) ----------
    hold_n = int(cfg.hold_max_bars)
    # The exit mask is the entry mask moved N bars forward, so every entry
    # *signal* produces an exit signal N bars later, independent of SL/TP.
    long_time_exits = long_entries.shift(hold_n, fill_value=False)
    short_time_exits = short_entries.shift(hold_n, fill_value=False)
    # --------------------------------------------------------------------

    # Use independent cash pockets per column; invest full pocket value per trade.
    # `direction` is deliberately not passed: explicit short_entries/short_exits
    # already define both sides, and passing both triggered a warning at this call.
    pf = vbt.Portfolio.from_signals(
        close=prices,
        entries=long_entries,
        exits=long_time_exits,
        short_entries=short_entries,
        short_exits=short_time_exits,
        sl_stop=cfg.sl_pct,
        tp_stop=cfg.tp_pct,
        freq="D",
        cash_sharing=False,
        init_cash=cfg.capital_per_stock,
        size_type='value',           # invest by cash value
        size=cfg.capital_per_stock,  # deploy full pocket per entry
        fees=0.0005,
        slippage=0.0005,
    )

    trades = pf.trades.records_readable
    if trades is None or trades.empty:
        print("No trades triggered.")
        return pd.DataFrame(), pf

    trades = trades.rename(columns={
        "Column":"ticker","Entry Time":"entry_ts","Exit Time":"exit_ts",
        "Entry Price":"entry_px","Exit Price":"exit_px",
        "PnL":"pnl","Return":"ret","Size":"size"
    })
    return trades, pf


# =========================
# MAIN
# =========================
def main():
    """Run the full pipeline: download, detect patterns, score, backtest, report.

    Writes ``patterns.csv`` and ``trades.csv`` into ``CFG.out_dir`` and prints
    the backtest statistics. Returns early, with a message, when no data is
    fetched or no pattern is detected.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

    os.makedirs(CFG.out_dir, exist_ok=True)
    data = fetch_data(CFG)
    if not data:
        print("No data fetched. Check tickers/period/interval.")
        return

    # Build the dataset: one example frame per ticker, dropping empty ones.
    per_ticker = (make_examples(t, df, CFG) for t, df in data.items())
    ex_all = [ex for ex in per_ticker if not ex.empty]
    if not ex_all:
        print("No patterns detected.")
        return
    df_all = pd.concat(ex_all, ignore_index=True).sort_values("entry_ts")

    scored = train_and_score(df_all, CFG)

    patterns_path = os.path.join(CFG.out_dir, "patterns.csv")
    scored.to_csv(patterns_path, index=False)
    print(f"Saved patterns: {patterns_path}")

    trades, pf = build_signals_and_backtest(data, scored, CFG)

    # A trades file is always written, even if it is empty.
    trades_path = os.path.join(CFG.out_dir, "trades.csv")
    trades_out = pd.DataFrame() if trades is None else trades
    trades_out.to_csv(trades_path, index=False)
    print(f"Saved trades: {trades_path}")

    if pf is None:
        return

    # NOTE(review): pf.stats() is called without a column here; presumably the
    # figures are aggregated across tickers. Confirm against vectorbt docs.
    stats = pf.stats()
    print("\n=== BACKTEST STATS ===")
    print(stats.to_string())

    if trades is None or trades.empty:
        return
    print("\nTop 10 trades by PnL:")
    print(trades.sort_values("pnl", ascending=False).head(10).to_string(index=False))

if __name__ == "__main__":
    main()


2025-11-04 11:50:46,728 | INFO | RF AUC train=1.000, test=0.523
  pf = vbt.Portfolio.from_signals(


Saved patterns: outputs/patterns.csv
Saved trades: outputs/trades.csv

=== BACKTEST STATS ===
Start                         2022-11-04 00:00:00
End                           2025-11-04 00:00:00
Period                          741 days 00:00:00
Start Value                               50000.0
End Value                            54889.329486
Total Return [%]                         9.778659
Benchmark Return [%]                    24.400731
Max Gross Exposure [%]                  83.064563
Total Fees Paid                        100.084113
Max Drawdown [%]                         2.945328
Max Drawdown Duration           139 days 06:00:00
Total Trades                                  2.0
Total Closed Trades                           2.0
Total Open Trades                             0.0
Open Trade PnL                                0.0
Win Rate [%]                                100.0
Best Trade [%]                           6.975184
Worst Trade [%]                          4.004598
Avg Wi

  stats = pf.stats()
