<a href="https://colab.research.google.com/github/kesanir/ML-AI-TRADING/blob/main/LSTM_XGB_TOP5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""INSTITUTIONAL_NDX_MOMENTUM.ipynb"""

!pip install yfinance hmmlearn torch scikit-learn

import os
import yfinance as yf
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import requests
from datetime import datetime
from hmmlearn import hmm
from sklearn.preprocessing import MinMaxScaler

# ---------------- CONFIG ----------------
LOOKBACK = 60        # LSTM input days
TRAIN_WINDOW = 120   # Rolling training window
LOG_FILE = "trade_history.csv"

# ---------------- 1. DATA ----------------
def get_ndx_tickers():
    #url = 'https://en.wikipedia.org/wiki/Nasdaq-100'
    #tables = pd.read_html(requests.get(url).text)
    #df = None
    #for t in tables:
    #    if 'Ticker' in t.columns and len(t) >= 100:
    #        df = t
    #        break
    #    elif 'Symbol' in t.columns and len(t) >= 100:
    #        df = t.rename(columns={'Symbol':'Ticker'})
    #        break
    #if df is None:
    #    raise ValueError("Could not find Nasdaq-100 table")
    #tickers = df['Ticker'].astype(str).str.split('[').str[0].str.strip().tolist()
    tickers = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'AVGO',
    'COST', 'NFLX', 'ADBE', 'PEP', 'CSCO', 'TMUS', 'AMD', 'INTC',
    'CMCSA', 'INTU', 'QCOM', 'TXN', 'AMGN', 'HON', 'AMAT', 'SBUX',
    'ISRG', 'BKNG', 'GILD', 'ADI', 'ADP', 'VRTX', 'MDLZ', 'LRCX',
    'REGN', 'MU', 'PANW', 'PYPL', 'SNPS', 'CDNS', 'KLAC', 'ASML',
    'MELI', 'CRWD', 'ABNB', 'FTNT', 'WDAY', 'MRNA', 'CTAS', 'DXCM',
    'ORLY', 'AEP', 'NXPI', 'CHTR', 'MAR', 'ADSK', 'MCHP', 'KDP',
    'MNST', 'EXC', 'ROST', 'CSX', 'KHC', 'PCAR', 'PAYX', 'CPRT',
    'AZN', 'CSGP', 'ODFL', 'DDOG', 'FAST', 'BKR', 'TTD', 'CTSH',
    'EA', 'GEHC', 'VRSK', 'LULU', 'ON', 'XEL', 'IDXX', 'ZS','S',
    'CCEP', 'TEAM', 'FANG', 'BIIB', 'CDW', 'ILMN', 'DASH',
    'GFS', 'WBD', 'MRVL', 'TTWO', 'EBAY', 'ZM', 'ALGN', 'ENPH'
    ]
    return tickers

def get_market_regime(qqq_close):
    returns = np.log(qqq_close/qqq_close.shift(1)).dropna().values.reshape(-1,1)
    model = hmm.GaussianHMM(n_components=2, covariance_type="full", n_iter=100)
    model.fit(returns)
    state = model.predict(returns)[-1]
    bull_state = np.argmax(model.means_)
    return 1 if state==bull_state else 0  # 1 = Bull, 0 = Bear/Volatile

def get_vix():
    vix = yf.download("^VIX", period="2y", interval="1d", auto_adjust=True)['Close']
    return vix

# ---------------- 2. FEATURES ----------------
def prepare_features(df, ticker_name, regime, vix):
    df = df.copy()
    df['Returns'] = df['Close'].pct_change()
    df['ROC'] = df['Close'].pct_change(12)
    df['ATR'] = (df['High'] - df['Low']).rolling(14).mean() / df['Close']
    delta = df['Close'].diff()
    gain = delta.clip(lower=0).rolling(14).mean()
    loss = (-delta).clip(lower=0).rolling(14).mean()
    df['RSI'] = 100 - 100/(1+gain/loss)
    df['Volume'] = df['Volume']
    df['Regime'] = regime
    df['VIX'] = vix.reindex(df.index).ffill() # Updated to use .ffill()
    df = df.dropna()
    df['Date'] = df.index
    df['Ticker'] = ticker_name # Use the passed ticker_name
    return df[['Ticker','Date','Close','Returns','ROC','ATR','RSI','Volume','Regime','VIX']].rename(columns={'Close':'Price'})

# ---------------- 3. CROSS-SECTIONAL Z-SCORE ----------------
def cross_sectional_zscore(panel_df, feature_cols):
    panel_df = panel_df.copy()

    def zscore_transform(group_series):
        # group_series is a Series representing a single feature column for a single date group
        mean_val = group_series.mean()
        std_val = group_series.std()
        # Handle std_val being 0 for a group (e.g., all values are the same)
        std_val = std_val if std_val != 0 else 1 # Using 1 instead of 0 to avoid division by zero
        return (group_series - mean_val) / std_val

    # Apply the custom zscore_transform to each feature column within each date group
    normalized_df = panel_df.groupby('Date')[feature_cols].transform(zscore_transform)

    panel_df[feature_cols] = normalized_df.clip(-3,3)
    return panel_df

# ---------------- 4. LSTM MODEL ----------------
class MomentumLSTM(nn.Module):
    def __init__(self, input_size=7, hidden_size=32, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2 if num_layers>1 else 0)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, 1)
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:,-1,:])
        return self.fc(out)

def train_and_predict(features_df):
    data = features_df.values
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    X, y = [], []
    for i in range(LOOKBACK,len(scaled)):
        X.append(scaled[i-LOOKBACK:i])
        y.append(scaled[i,0])
    X, y = torch.FloatTensor(np.array(X)), torch.FloatTensor(np.array(y)).view(-1,1)
    model = MomentumLSTM(input_size=X.shape[2])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    model.train()
    for epoch in range(10):
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output,y)
        loss.backward()
        optimizer.step()
    model.eval()
    last_seq = torch.FloatTensor(scaled[-LOOKBACK:]).unsqueeze(0)
    pred = model(last_seq).item()
    return np.tanh(pred)  # bound [-1,1]

# ---------------- 5. RUN DAILY SCANNER ----------------
def run_daily_scanner():
    tickers = get_ndx_tickers()
    data = yf.download(tickers+['QQQ'], period="2y", interval="1d", auto_adjust=True)
    vix = get_vix()
    regime = get_market_regime(data.xs('QQQ',axis=1,level=1)['Close'])
    results = []
    for t in tickers:
        try:
            df_t = data.xs(t,axis=1,level=1).copy()
            # df_t.name = t # This line is removed
            df_t = prepare_features(df_t, t, regime, vix) # Pass 't' explicitly
            if len(df_t) < LOOKBACK+10:
                print(f"Skipping {t}: Not enough data after feature preparation ({len(df_t)} rows). Requires {LOOKBACK+10}.")
                continue
            top_features = ['Returns','ROC','ATR','RSI','Volume','Regime','VIX']
            pred = train_and_predict(df_t[top_features].tail(TRAIN_WINDOW))
            df_t['Pred_LSTM'] = pred
            latest = df_t.iloc[-1]
            results.append(latest[['Ticker','Pred_LSTM','Price','Returns','ROC','ATR','RSI','Volume','Regime','VIX','Date']])
        except Exception as e: # Catch specific exception to print it
            print(f"Error processing ticker {t}: {e}")
            continue

    panel = pd.DataFrame(results)
    panel = panel.reset_index(drop=True)

    if panel.empty: # Check if panel is empty
        print("No data generated for analysis. Exiting daily scanner.")
        return

    panel = cross_sectional_zscore(panel,['Pred_LSTM','Returns','ROC','ATR','RSI','Volume','Regime','VIX'])
    panel = panel.sort_values('Pred_LSTM',ascending=False)
    today = datetime.now().strftime('%Y-%m-%d %H:%M')
    print(f"--- {today} | Top-5 NASDAQ-100 Momentum ---")
    print(panel.head(5).to_string(index=False))
    # Save to CSV
    panel.to_csv(LOG_FILE, mode='a', index=False, header=not os.path.exists(LOG_FILE))

# ---------------- 6. EXECUTE ----------------
if __name__ == "__main__":
    run_daily_scanner()

In [29]:
# ============================================
# INSTITUTIONAL NASDAQ-100 MOMENTUM - XGBOOST
# ============================================

!pip install yfinance xgboost --quiet

import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime
#import requests

# ---------------- CONFIG ----------------
LOOKBACK_FORWARD = 21       # forward return target
TRAIN_YEARS = 3
TOP_N = 5

# ---------------- 1. GET NASDAQ-100 ----------------
def get_ndx_tickers():
    tickers = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'AVGO',
    'COST', 'NFLX', 'ADBE', 'PEP', 'CSCO', 'TMUS', 'AMD', 'INTC',
    'CMCSA', 'INTU', 'QCOM', 'TXN', 'AMGN', 'HON', 'AMAT', 'SBUX',
    'ISRG', 'BKNG', 'GILD', 'ADI', 'ADP', 'VRTX', 'MDLZ', 'LRCX',
    'REGN', 'MU', 'PANW', 'PYPL', 'SNPS', 'CDNS', 'KLAC', 'ASML',
    'MELI', 'CRWD', 'ABNB', 'FTNT', 'WDAY', 'MRNA', 'CTAS', 'DXCM',
    'ORLY', 'AEP', 'NXPI', 'CHTR', 'MAR', 'ADSK', 'MCHP', 'KDP',
    'MNST', 'EXC', 'ROST', 'CSX', 'KHC', 'PCAR', 'PAYX', 'CPRT',
    'AZN', 'CSGP', 'ODFL', 'DDOG', 'FAST', 'BKR', 'TTD', 'CTSH',
    'EA', 'GEHC', 'VRSK', 'LULU', 'ON', 'XEL', 'IDXX', 'ZS','S',
    'CCEP', 'TEAM', 'FANG', 'BIIB', 'CDW', 'ILMN', 'DASH',
    'GFS', 'WBD', 'MRVL', 'TTWO', 'EBAY', 'ZM', 'ALGN', 'ENPH'
    ]
    return tickers

tickers = get_ndx_tickers()

# ---------------- 2. DOWNLOAD DATA ----------------
data = yf.download(tickers + ["QQQ","^VIX"],
                   period=f"{TRAIN_YEARS}y",
                   auto_adjust=True,
                   progress=False)

close = data["Close"]
high  = data["High"]
low   = data["Low"]
volume = data["Volume"]

qqq = close["QQQ"]
vix = close["^VIX"]
qqq.tail(5)
vix.tail(5)

# ---------------- 3. REGIME FILTER ----------------
qqq_200 = qqq.rolling(200).mean()
ma_distance = (qqq - qqq_200) / qqq_200
ma_distance = ma_distance.rolling(5).mean()

regime = (ma_distance > 0).astype(int).shift(1)


# ---------------- 4. FEATURE ENGINEERING ----------------
feature_list = []

for t in tickers:
    df = pd.DataFrame(index=close.index)
    df["Ticker"] = t
    df["Close"] = close[t]
    df["Returns"] = close[t].pct_change(fill_method=None) # Address FutureWarning
    df["ROC_20"] = close[t].pct_change(20, fill_method=None) # Address FutureWarning
    df["ATR"] = (high[t] - low[t]).rolling(14).mean() / close[t]
    df["RSI"] = 100 - 100/(1 +
        close[t].diff().clip(lower=0).rolling(14).mean() /
        (-close[t].diff().clip(upper=0)).rolling(14).mean()
    )
    df["Volume"] = volume[t]
    df["VIX"] = vix
    df["VIX_ROC"] = vix.pct_change(10, fill_method=None) # Address FutureWarning
    df["Regime"] = regime

    # Forward 21d return target
    df["Fwd_Return"] = close[t].pct_change(LOOKBACK_FORWARD, fill_method=None).shift(-LOOKBACK_FORWARD) # Address FutureWarning

    feature_list.append(df)

panel = pd.concat(feature_list)
panel = panel.dropna()

# ---------------- 5. CROSS-SECTIONAL Z-SCORING ----------------
feature_cols = ["Returns","ROC_20","ATR","RSI","Volume","VIX","VIX_ROC"]

# Define a function to apply to each group for z-score calculation
def zscore_transform(x):
    mean = x.mean()
    std = x.std()
    # Handle std_val being 0 for a group (e.g., all values are the same)
    std = std if std != 0 else 1
    return (x - mean) / std

# Apply the zscore_transform to each feature column within each date group
# and then clip the results.
panel[feature_cols] = panel.groupby(level=0)[feature_cols].transform(zscore_transform).clip(-3,3)

panel = panel.dropna()

# ---------------- 6. TRAIN XGBOOST ----------------
X = panel[feature_cols]
y = panel["Fwd_Return"]

model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X, y)

# Add predictions for the entire panel
panel["Pred"] = model.predict(X)

# ---------------- 7. TODAY'S PREDICTION ----------------
latest_date = panel.index.max()
today_panel = panel.loc[latest_date].copy() # Make an explicit copy to avoid SettingWithCopyWarning

today_features = today_panel[feature_cols]
today_panel["Pred"] = model.predict(today_features)

# Apply regime filter (only long in bull regime)
if regime.iloc[-1] == 1:
    today_panel = today_panel.sort_values("Pred", ascending=False)
else:
    print("Bear regime detected â€” no long positions.")
    today_panel = today_panel.sort_values("Pred", ascending=False)

top5 = today_panel.head(TOP_N)

print("\n========================================")
print(f"{datetime.now().strftime('%Y-%m-%d')} | TOP {TOP_N} NASDAQ-100 MOMENTUM")
print("========================================")
print(top5[["Ticker","Pred","Close"]])


2026-02-17 | TOP 5 NASDAQ-100 MOMENTUM
           Ticker      Pred       Close
Date                                   
2026-01-14     MU  0.070246  333.350006
2026-01-14   AVGO  0.050726  339.890015
2026-01-14   NVDA  0.044909  183.139999
2026-01-14    AMD  0.042060  223.600006
2026-01-14   TSLA  0.040769  439.200012


In [30]:
panel["Rank"] = (
    panel.groupby("Date")["Pred"]
    .rank(pct=True)
)

In [33]:
latest_date = panel.index.max()
today_panel = panel.loc[latest_date].copy()

In [36]:
top = today_panel.sort_values("Rank", ascending=False).head(20)


In [37]:
top

Unnamed: 0_level_0,Ticker,Close,Returns,ROC_20,ATR,RSI,Volume,VIX,VIX_ROC,Regime,Fwd_Return,Pred,Rank
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2026-01-14,MU,333.350006,-0.512223,3.0,2.444004,1.004305,0.345247,0.0,0.0,1.0,0.234918,0.070246,1.0
2026-01-14,AVGO,339.890015,-1.88579,-0.216302,0.907824,-0.522837,0.760373,0.0,0.0,1.0,-0.043308,0.050726,0.989583
2026-01-14,NVDA,183.139999,-0.523912,0.16922,-0.109002,-0.948503,3.0,0.0,0.0,1.0,-0.001802,0.044909,0.979167
2026-01-14,AMD,223.600006,0.794352,0.571666,0.889576,0.353867,1.198938,0.0,0.0,1.0,-0.072809,0.04206,0.96875
2026-01-14,TSLA,439.200012,-0.700518,-1.036776,0.754309,-1.303079,1.909018,0.0,0.0,1.0,-0.049545,0.040769,0.958333
2026-01-14,LRCX,208.789993,-1.111285,2.605082,1.148498,1.026276,0.256873,0.0,0.0,1.0,0.128071,0.038471,0.947917
2026-01-14,S,14.22,-1.242415,-0.677665,0.818526,-0.553002,-0.106433,0.0,0.0,1.0,-0.024613,0.034807,0.9375
2026-01-14,ABNB,132.789993,-2.410846,-0.160789,0.109095,-0.56961,-0.187348,0.0,0.0,1.0,-0.086151,0.031066,0.927083
2026-01-14,MRVL,81.209999,-0.914593,-0.611761,1.4088,-1.04247,0.134608,0.0,0.0,1.0,-0.032016,0.029366,0.916667
2026-01-14,MRNA,40.580002,1.438916,3.0,3.0,1.059537,0.312481,0.0,0.0,1.0,0.04066,0.029206,0.90625


In [52]:
panel_df = panel_df.sort_values(["Ticker", "Date"])
panel_df["Actual_Return_1D"] = (
    panel_df.groupby("Ticker")["Close"]
    .pct_change(fill_method=None)
    .shift(-1)
)
panel_df["Rank"] = (
    panel_df.groupby("Date")["Pred"]
    .rank(pct=True)
)
TOP_N = 5

daily_picks = (
    panel_df
    .sort_values(["Date", "Rank"], ascending=[True, False])
    .groupby("Date")
    .head(TOP_N)
)
daily_portfolio = (
    daily_picks
    .groupby("Date")["Actual_Return_1D"]
    .mean()
    .to_frame("Strategy_Return")
)
daily_portfolio = daily_portfolio.dropna()
daily_portfolio["Equity_Curve"] = (
    1 + daily_portfolio["Strategy_Return"]
).cumprod()

# Calculate QQQ returns as a separate Series and rename it
qqq_returns = qqq.pct_change(fill_method=None).rename("QQQ_Return")

daily_portfolio = daily_portfolio.merge(
    qqq_returns,
    left_index=True,
    right_index=True,
    how="left"
)

daily_portfolio["QQQ_Cum"] = (
    1 + daily_portfolio["QQQ_Return"]
).cumprod()
import numpy as np

ann_return = daily_portfolio["Strategy_Return"].mean() * 252
ann_vol = daily_portfolio["Strategy_Return"].std() * np.sqrt(252)
sharpe = ann_return / ann_vol

max_dd = (
    daily_portfolio["Equity_Curve"] /
    daily_portfolio["Equity_Curve"].cummax() - 1
).min()

print("Annual Return:", round(ann_return, 3))
print("Annual Vol:", round(ann_vol, 3))
print("Sharpe:", round(sharpe, 2))
print("Max Drawdown:", round(max_dd, 3))


Annual Return: 0.913
Annual Vol: 0.346
Sharpe: 2.64
Max Drawdown: -0.285


In [41]:
COST_PER_TRADE = 0.001  # 10 bps

daily_picks["Net_Return"] = (
    daily_picks["Actual_Return_1D"] - COST_PER_TRADE
)

daily_portfolio = (
    daily_picks
    .groupby("Date")["Net_Return"]
    .mean()
    .to_frame("Strategy_Return")
)


In [42]:
ann_return = daily_portfolio["Strategy_Return"].mean() * 252
ann_vol = daily_portfolio["Strategy_Return"].std() * np.sqrt(252)
sharpe = ann_return / ann_vol

In [44]:
print("Annual Return:", round(ann_return, 3))
print("Annual Vol:", round(ann_vol, 3))
print("Sharpe:", round(sharpe, 2))
print("Max Drawdown:", round(max_dd, 3))

Annual Return: 0.661
Annual Vol: 0.346
Sharpe: 1.91
Max Drawdown: -0.285


In [45]:
daily_portfolio["Year"] = daily_portfolio.index.year

yearly = daily_portfolio.groupby("Year")["Strategy_Return"].apply(
    lambda x: (1 + x).prod() - 1
)

print(yearly)

Year
2023    0.813506
2024    0.701405
2025    0.667290
2026    0.061489
Name: Strategy_Return, dtype: float64


In [53]:
panel_df = panel_df.sort_values(["Ticker", "Date"])
panel_df["Actual_Return_1D"] = (
    panel_df.groupby("Ticker")["Close"]
    .pct_change(fill_method=None)
    .shift(-1)
)
panel_df["Rank"] = (
    panel_df.groupby("Date")["Pred"]
    .rank(pct=True)
)
TOP_N = 5

daily_picks = (
    panel_df
    .sort_values(["Date", "Rank"], ascending=[True, False])
    .groupby("Date")
    .head(TOP_N)
)
daily_portfolio = (
    daily_picks
    .groupby("Date")["Actual_Return_1D"]
    .mean()
    .to_frame("Strategy_Return")
)
daily_portfolio = daily_portfolio.dropna()
daily_portfolio["Equity_Curve"] = (
    1 + daily_portfolio["Strategy_Return"]
).cumprod()
# Calculate QQQ returns as a separate Series and rename it
qqq_returns = qqq.pct_change(fill_method=None).rename("QQQ_Return")

daily_portfolio = daily_portfolio.merge(
    qqq_returns,
    left_index=True,
    right_index=True,
    how="left"
)

daily_portfolio["QQQ_Cum"] = (
    1 + daily_portfolio["QQQ_Return"]
).cumprod()
import numpy as np

ann_return = daily_portfolio["Strategy_Return"].mean() * 252
ann_vol = daily_portfolio["Strategy_Return"].std() * np.sqrt(252)
sharpe = ann_return / ann_vol

max_dd = (
    daily_portfolio["Equity_Curve"] /
    daily_portfolio["Equity_Curve"].cummax() - 1
).min()
qqq_ann_return = daily_portfolio["QQQ_Return"].mean() * 252
print("QQQ Annual Return:", round(qqq_ann_return, 3))
print("Annual Return:", round(ann_return, 3))
print("Annual Vol:", round(ann_vol, 3))
print("Sharpe:", round(sharpe, 2))
print("Max Drawdown:", round(max_dd, 3))

QQQ Annual Return: 0.29
Annual Return: 0.913
Annual Vol: 0.346
Sharpe: 2.64
Max Drawdown: -0.285
