In [61]:
from collections import OrderedDict

# imports
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
import indicators
import importlib
importlib.reload(indicators)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from collections import OrderedDict

In [62]:
stocks = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA", "NVDA", "NFLX", "AMD", "INTC",
    "JPM", "GS", "BAC", "C", "WFC", "V", "MA", "AXP", "BRK-B",
    "UNH", "JNJ", "PFE", "LLY", "ABBV", "TMO", "DHR", "BMY", "GILD",
    "XOM", "CVX", "COP", "OXY", "SLB", "HAL", "BP", "SHEL", "EOG",
    "WMT", "COST", "HD", "LOW", "MCD", "SBUX", "TGT", "NKE", "PG", "KO"
]
stocks2 = [
    "ADBE", "CRM", "ORCL", "SAP", "NOW", "SHOP", "SQ", "ZM", "CRWD", "DDOG",
    "TXN", "QCOM", "AVGO", "MU", "LRCX", "KLAC", "NXPI", "ADI", "MRVL", "SWKS",
    "PYPL", "INTU", "FISV", "ADP", "VEEV", "TEAM", "WDAY", "ZS", "OKTA", "MDB",
    "T", "VZ", "TMUS", "CHTR", "CMCSA", "DIS", "ROKU", "LYV", "TTWO", "ATVI",
    "PEP", "KMB", "CL", "HSY", "MDLZ", "GIS", "MO", "PM", "EL", "STZ"
]

In [63]:
#x = yf.download(stocks2, start="2015-01-01", end="2025-04-15", interval="1wk")

In [64]:
#y = yf.download("^GSPC", start="2010-01-01", end="2025-04-15", interval="1wk")

In [65]:
def extract_ticker_dataframe(csv_filepath: str, ticker: str) -> pd.DataFrame:
    """
    Reads a multi-ticker CSV file (like one from yfinance) and isolates the data
    for the specified ticker. This updated version assumes that the CSV contains the
    dates as the index (first column), so we use that as the Date information.

    The CSV is expected to have a two-row header:
      - The first row contains field names (e.g., "Open", "High", etc.).
      - The second row contains the ticker symbols for each column.

    The returned DataFrame's columns will be ordered as needed for Backtrader:
      Date, Open, High, Low, Close, Volume.
    """
    # Use the first column as the index so that the dates are read from the CSV.
    df = pd.read_csv(csv_filepath)#, header=[0, 1], index_col=0)

    # Convert the index to datetime in case it's not already
    df.index = pd.to_datetime(df["Date"], errors="coerce")
    df = df.drop("Date", axis=1)

    # Identify all columns for the specified ticker (matching on the lower level).
    ticker_cols = [col for col in df.columns if ticker == str(col).strip().split("_")[-1]]
    if not ticker_cols:
        raise ValueError(f"Ticker '{ticker}' not found in the CSV file.")

    # Extract the ticker's columns
    df_ticker = df.loc[:, ticker_cols].copy()
    #df_ticker.columns = [col[0].strip() for col in df_ticker.columns]

    # Removing ticker name from column names
    for col in df_ticker.columns:
        df_ticker.rename(columns={col: str(col).split("_")[0]}, inplace=True)

    # Rearranging the columns according to Backtrader's expected order.
    desired_order = ["Open", "High", "Low", "Close", "Volume"]

    available_order = [col for col in desired_order if col in df_ticker.columns]
    df_ticker = df_ticker[available_order]

    # If desired, reset the index (which contains the Date) back to a column.
    df_ticker = df_ticker.reset_index().rename(columns={"index": "Date"}).set_index("Date")

    return df_ticker

In [66]:
'''
stockData = x.copy()

# Rename columns, joining multi-level column names into a single string with "_".
stockData.columns = ["_".join(col) if isinstance(col, tuple) else col for col in stockData.columns]

# Remove unnecessary columns such as "level_0" or "index" that may have been carried over.
stockData = stockData.loc[:, ~stockData.columns.isin(["level_0", "index"])]
stockData.to_csv("50stocks.csv")
'''


'''
sp500 = y.copy()

sp500.columns = ["_".join(col) if isinstance(col, tuple) else col for col in sp500.columns]

sp500 = sp500.loc[:, ~sp500.columns.isin(["level_0", "index"])]
sp500.to_csv("SP500.csv")'''

'\nsp500 = y.copy()\n\nsp500.columns = ["_".join(col) if isinstance(col, tuple) else col for col in sp500.columns]\n\nsp500 = sp500.loc[:, ~sp500.columns.isin(["level_0", "index"])]\nsp500.to_csv("SP500.csv")'

In [67]:
# 1. Feature engineering: Hamilton Regime Switching Model
sp500 = pd.read_csv("SP500.csv")
sp500.set_index("Date", inplace=True)
sp500.index = pd.to_datetime(sp500.index)
sp500['Log_Returns'] = np.log(sp500['Close_^GSPC'] / sp500['Close_^GSPC'].shift(1))
sp500 = sp500.dropna()

def classify_regimes(sp500):
    model = MarkovRegression(sp500['Log_Returns'], k_regimes=2, trend='c', switching_variance=True)
    result = model.fit()
    #print(result.summary())
    smoothed_probs = result.smoothed_marginal_probabilities
    sp500['Regime'] = smoothed_probs.idxmax(axis=1)
    sp500['Bull_Prob'] = smoothed_probs[0]

    """if show_regimes:
        plt.plot(sp500.index, smoothed_probs[0], label="Probability of Bull Market")
        plt.fill_between(sp500.index, 0, 1, where=sp500['Bull_Prob'] > 0.5, color='green', alpha=0.3)
        plt.fill_between(sp500.index, 0, 1, where=sp500['Bull_Prob'] <= 0.5, color='red', alpha=0.3)
        plt.legend()
        plt.title("Bull vs. Bear Market Probability")
        plt.show()"""

    return sp500["Bull_Prob"].to_frame()
    # 0 -> Bull, 1 -> Bear


In [68]:
# Compute rolling portfolio weights using a lookback period (e.g., 52 weeks)
def compute_rolling_portfolio_weights(data, lookback_window=52):
    """
    Computes portfolio weights for each date using historical data up to that date.

    Args:
        data (pd.DataFrame): DataFrame with dates as index and stocks as columns.
        lookback_window (int): Number of days to look back for the optimization.

    Returns:
        pd.DataFrame: A DataFrame with dates as index and stocks as columns, containing weights.
    """
    weights_list = []
    dates = []
    for date in data.index[lookback_window:]:
        window_data = data.loc[:date].tail(lookback_window)
        mu = expected_returns.mean_historical_return(window_data)
        S = risk_models.sample_cov(window_data)
        ef = EfficientFrontier(mu, S)
        try:
            ef.max_sharpe()
            clean_weights = ef.clean_weights()
        except Exception as e:
            # In case optimization fails, assign zero weights.
            clean_weights = {stock: 0 for stock in data.columns}
        weights_list.append(clean_weights)
        dates.append(date)
    weights_df = pd.DataFrame(weights_list, index=dates)
    return weights_df

In [69]:
# 2. Feature engineering: Technical indicators, fundamentals
# SMA, RSI, MACD, etc. etc. basically, seeing which indicator sticks
def create_stock_features(stocks, stock_data_filename):
    feature_rows = []
    regime_df = classify_regimes(sp500)

    for stock in stocks:
        # will be filled with indicators for one stock
        prices = extract_ticker_dataframe(stock_data_filename, stock)
        # weekly return
        prices["Return"] = prices["Close"].pct_change(periods=2)

        features = pd.DataFrame(index=prices.index)

        # Simple Moving Avg Comparison (5 vs 20)
        sma = indicators.sma_strategy(prices["Close"].to_frame(), 5, 20)
        features["SMA_5v20"] = sma["signal_raw"]

        # Relative Strength Index (RSI)
        rsi = indicators.rsi_strategy(prices["Close"].to_frame())
        features["RSI"] = rsi["rsi"]

        # Moving Average Convergence Divergence (MACD)
        macd = indicators.macd_strategy(prices["Close"].to_frame())
        features["MACD"] = macd["signal_raw"] # only using signal, not other columns, since I don't want to have weird "fitting" of the model to the components of the macd signal

        # Bollinger Bands
        bands = indicators.bollinger_strategy(prices["Close"].to_frame())
        features["Bollinger_Bands"] = bands["signal_ternary"]

        # Average True Range (ATR)
        atr = indicators.atr_indicator(prices[["High", "Low", "Close"]])
        features["ATR"] = atr["signal"]

        # Stochastic Oscillator Strategy
        stochastic = indicators.stochastic_strategy(prices[["High", "Low", "Close"]])
        features["Stochastic"] = stochastic["signal"]

        # OBV (On-Balance Volume)
        obv = indicators.obv_strategy(prices[["Close", "Volume"]])
        features["OBV"] = obv["obv"]

        # ADX (Average Directional Index)
        adx = indicators.adx_strategy(prices[["High", "Low", "Close"]])
        features["ADX"] = adx["adx"]

        # Aroon Indicator
        aroon = indicators.aroon_strategy(prices[["High", "Low"]])
        features["Aroon"] = aroon["aroon_oscillator"]

        # Returns features. Overall 4 week percent change, split into 3 week period, 1 week lag and 1 week period, 0 week lag
        features["Returns-3wk-1wklag"] = prices["Close"].shift(1).pct_change(periods=3)
        features["Returns-1wk-0wklag"] = prices["Close"].pct_change()

        # Other technical indicators and fundamentals?

        # rolling return (2 week window)
        features["Returns-2wk"] = prices["Return"]
        features["Bull_Probability"] = regime_df["Bull_Prob"]

        features["Stock"] = stock
        features = features.reset_index().rename(columns={"index": "Date"})
        feature_rows.append(features)

    # Combine data for all stocks into one dataframe
    features_long = pd.concat(feature_rows, ignore_index=True).set_index("Date").dropna()



    return features_long

df = create_stock_features(stocks, "50stocks.csv")
df

  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['high_low'] = data['High'] - data['Low']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['high_prev_close'] = (data['High'] - data['Close'].shift()).abs()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['low_min'] = data['Low'].rolling(window=window).min

Unnamed: 0_level_0,SMA_5v20,RSI,MACD,Bollinger_Bands,ATR,Stochastic,OBV,ADX,Aroon,Returns-3wk-1wklag,Returns-1wk-0wklag,Returns-2wk,Bull_Probability,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-06-18,0.52938,50.32274,0.10017,0,1.02441,0,8102717200,49.22311,64.00000,-0.03590,0.00636,-0.00597,0.97923,AAPL
2015-06-25,0.25479,57.95552,0.14309,0,1.00638,0,7303614000,46.60114,56.00000,-0.01545,-0.01179,-0.00550,0.95129,AAPL
2015-07-02,0.02088,47.81580,0.22883,0,1.01080,1,6651694800,43.51375,56.00000,-0.01769,-0.03183,-0.04324,0.93524,AAPL
2015-07-09,0.09720,52.95009,0.21787,0,1.07514,0,7638924000,41.12572,52.00000,-0.03716,0.03467,0.00174,0.93078,AAPL
2015-07-16,0.15926,48.27649,0.23012,0,1.18865,0,6304946800,39.01937,48.00000,-0.01007,-0.01262,0.02162,0.94709,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-13,5.26372,77.75921,0.75674,0,2.39235,0,3368958300,27.22717,36.00000,-0.00186,-0.00944,-0.01142,0.15557,KO
2025-03-20,4.99324,79.54904,0.73067,0,2.43390,0,3472300600,28.06158,36.00000,-0.02147,0.01814,0.00853,0.19495,KO
2025-03-27,4.73986,81.48973,0.75284,0,2.51976,-1,3555309400,29.41343,36.00000,0.00652,0.01871,0.03719,0.15587,KO
2025-04-03,4.42193,77.12512,0.63069,0,2.99399,0,3418881000,28.59207,52.00000,0.02740,-0.01935,-0.00100,0.05030,KO


In [70]:
def label_signal(return_val, buy_thresh=0.01, sell_thresh=-0.01):
    if return_val > buy_thresh:
        return 0
    elif return_val < sell_thresh:
        return 1
    else:
        return 2

df['Signal'] = df['Returns-2wk'].apply(label_signal)

# Sort by date?
df = df.sort_values(by='Date')

# 70% train, 15% val, 15% test
split_1 = int(len(df) * 0.7)
split_2 = int(len(df) * 0.85)

train = df.iloc[:split_1]
val = df.iloc[split_1:split_2]
test = df.iloc[split_2:]


In [71]:
features = ["SMA_5v20", "RSI", "MACD", "Bollinger_Bands", "ATR", "Stochastic", "OBV", "ADX", "Aroon", "Bull_Probability", "Returns-3wk-1wklag", "Returns-1wk-0wklag"]
X_train = train[features]
y_train = train['Signal']
X_val = val[features]
y_val = val['Signal']

In [72]:
# tuning class weights b/c the val set has underrepresented sell orders
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
print("Class Weights:", class_weight_dict)

Class Weights: {np.int64(0): np.float64(0.7018922852983989), np.int64(1): np.float64(0.9893891429241412), np.int64(2): np.float64(1.771305625524769)}


In [73]:
# Map class weights to each sample in training set
sample_weights = y_train.map(class_weight_dict)

# Train the model
model = XGBClassifier(
    objective='multi:softprob',  # for multi-class
    num_class=len(classes),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

model.fit(X_train, y_train, sample_weight=sample_weights)

Parameters: { "use_label_encoder" } are not used.



AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_class=3, num_parallel_tree=None, ...)

In [74]:
X_test = test[features]
y_test = test['Signal']

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.76      0.77      1678
           1       0.73      0.77      0.75      1292
           2       0.29      0.27      0.28       647

    accuracy                           0.68      3617
   macro avg       0.60      0.60      0.60      3617
weighted avg       0.67      0.68      0.67      3617



In [75]:
# NOW do portfolio weight stuff with y_pred... AFTER your tuning for val and using the test set please :)
today_stocks_features = df.tail(len(stocks))
todays_pred = model.predict(today_stocks_features[features])
stock_col = today_stocks_features["Stock"].reset_index().drop(columns="Date")
pred_col = pd.DataFrame(todays_pred)

# Making recommendations per stock
recommendations = stock_col.join(pred_col)
recommendations.columns = ["Stock", "Recommendation"]
recommendations["Recommendation"] = recommendations["Recommendation"].map({0: 'Hold', 1: 'Buy', 2: 'Sell'})

In [76]:
# Isolating buys
buy_recommendations = recommendations[recommendations.Recommendation == "Buy"].drop(columns="Recommendation")
rec_array = buy_recommendations.to_numpy().flatten().tolist()
buy_stocks_history = extract_ticker_dataframe("50stocks.csv", rec_array[0])["Close"]
for i in range(1, len(rec_array)):
    a = extract_ticker_dataframe("50stocks.csv", rec_array[i])["Close"]
    buy_stocks_history = pd.concat([buy_stocks_history, a], axis=1, join='inner')
buy_stocks_history.columns = rec_array

In [77]:
def compute_adjusted_mu(buy_probs, baseline_mu, alpha=0.01):
    tickers = baseline_mu.index.intersection(buy_probs.index)
    adjusted_mu = baseline_mu.loc[tickers] * (1 + alpha * (buy_probs.loc[tickers] - 0.5))
    return adjusted_mu

probs = model.predict_proba(today_stocks_features[features])
probs_db = pd.DataFrame(probs, columns=["Hold", "Buy", "Sell"], index=today_stocks_features["Stock"].values)
buy_probs = probs_db["Buy"]

# 2. Compute the baseline mu
baseline_mu = expected_returns.mean_historical_return(buy_stocks_history)

# 3. Compute the adjusted mu. Alpha is a strength parameter for the adjustment. Larger values will make the adjustment more aggressive.
adjusted_mu = compute_adjusted_mu(buy_probs, baseline_mu, alpha=0.05)

S = risk_models.sample_cov(buy_stocks_history)

# Ensure that the adjusted_mu and S use the same tickers
common_tickers = adjusted_mu.index.intersection(S.index)
#adjusted_mu = adjusted_mu.loc[common_tickers]

S = S.loc[common_tickers, common_tickers]
ef = EfficientFrontier(adjusted_mu, S)
ef.max_sharpe()
clean_weights = ef.clean_weights()

In [78]:
# Portfolio weights
weights = OrderedDict()
for key, value in clean_weights.items():
    if value != 0.0:
        weights[key] = value
weights

OrderedDict([('NVDA', 0.72708), ('LLY', 0.27292)])

In [79]:
mu = expected_returns.mean_historical_return(buy_stocks_history)
S = risk_models.sample_cov(buy_stocks_history)
ef = EfficientFrontier(mu, S)
ef.max_sharpe()
clean_weights = ef.clean_weights()
weights = OrderedDict()
for key, value in clean_weights.items():
    if value != 0.0:
        weights[key] = value
weights

OrderedDict([('NVDA', 0.72895), ('LLY', 0.27105)])