In [None]:
# !python -m pip install --upgrade yfinance

In [4]:
import glob
import logging
import os
import time
import urllib.parse
from datetime import datetime, date

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pytz
import tensorflow as tf
from astral import LocationInfo
from astral.sun import sun
from keras import layers, models
from prophet import Prophet
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

import strat_defs # custom functions
import prep_data

In [6]:
def gen_powerset(some_list):
    """Return every subset of ``some_list`` as a list of lists.

    Subsets come out in binary-counting order: the empty list first, then each
    further input element appended to all subsets that precede it
    (``[1, 2]`` -> ``[[], [1], [2], [1, 2]]``).
    """
    items = list(some_list)
    count = len(items)

    # Bit j of `mask` decides whether items[j] is a member of that subset.
    return [
        [items[j] for j in range(count) if (mask >> j) & 1]
        for mask in range(1 << count)
    ]

In [8]:
# Suppress prophet logging (prophet prints something each time)
for lib in ["prophet", "cmdstanpy"]:
    logger = logging.getLogger(lib)
    logger.setLevel(logging.ERROR)  # only ERROR and above are emitted by this logger

    # Remove the handlers attached directly to this logger. Iterate over a copy of
    # `logger.handlers`: `hasHandlers()` also reports handlers on ancestor loggers
    # (e.g. the root), so looping on it would index into an empty list and raise
    # IndexError as soon as only an ancestor still has a handler.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

    # Give the logger a do-nothing handler of its own. Records at ERROR and above
    # can still propagate to ancestor loggers.
    logger.addHandler(logging.NullHandler())

### Build dataframe for testing

In [106]:
# Indicator settings: one small config object per indicator family, then the
# top-level IndicatorConfig that bundles them for prep_data.prep_data.
macd_config = prep_data.MACDConfig(short_window=12, long_window=26)
bollinger_config = prep_data.BollingerConfig(window=90, num_std=3.0)
moving_average_config = prep_data.MovingAverageConfig(short_window=10, long_window=50)

indicator_config = prep_data.IndicatorConfig(
    ticker="MMM",
    target="Adj Close",  # probably should always be Adj Close
    rsi_window=30,
    moving_average=moving_average_config,
    bollinger=bollinger_config,
    macd=macd_config,
)

In [108]:
# Build the prepared dataset for the ticker/indicators in `indicator_config`.
# NOTE(review): what `drop_tickers=True` removes is defined in prep_data, not visible here.
prepd_data = prep_data.prep_data(config=indicator_config, drop_tickers=True)

### Set configuration and parameters for model testing

In [14]:
# Backtest settings: the ProbaConfig and KerasConfig pieces first, then the
# BacktestConfig that is passed to strat_defs.backtest_strategy as `config`.
proba_config = strat_defs.ProbaConfig(
    knn=0.5,
    logit=0.5,
    mlp=0.5,
    rf=0.5,
    svc=0.5,
    xgboost=0.5,
)
keras_config = strat_defs.KerasConfig(proba=0.5, sequence_length=30, epochs=20)

backtest_config = strat_defs.BacktestConfig(
    overbought=70,
    # FIXME: logit_warm_start=True has been seen to fail with drop_tickers=False
    # and also with drop_tickers=True; the cause is not known yet.
    logit_warm_start=True,
    proba=proba_config,
    keras=keras_config,
)

**s_date**\
Data start date. The earliest available date is 1993-01-29 for S&P 500 data and 2015-07-01 for Wikipedia page views.

**exclude_vars**\
enum {"Open","High","Low","Close","Adj Close","Volume","movement", "views"}

**strategy_list**\
enum {"Hold","SMA","RSI","VWAP","Bollinger","Breakout","Prophet","KNN","LinearSVC","Logit","MLP","RandomForest","SVC","SVC_proba","XGBoost","Keras","Perfection"}

From fast to slow (usually - times with drop_tickers=True):\
Hold, Perfection, SMA, RSI, VWAP, Bollinger, Breakout, KNN(*4s*), LinearSVC(*5s*), Logit(*24s*), XGBoost(*115s*), SVC(*138s*), Prophet(*160s*), SVC_proba(*200s*), MLP(*2978s*), Keras(*4048s*)

\
*If testing Breakout, "High" and "Low" cannot be excluded*

In [16]:
# --- Data window -------------------------------------------------------------
s_date = "2015-07-01"
exclude_vars = ("Open", "High", "Low", "Close", "Volume")

# --- Backtest parameters -----------------------------------------------------
# initial_train_period options:
#   1890 -> s_date 2015-07-01, start predicting in 2023 (active)
#   2140 -> s_date 2015-07-01, start predicting in 2024
#   7535 -> s_date 1993-01-29, start predicting in 2024
initial_train_period = 1890
bko_window = 20
random_state = 42
n_jobs = None

# --- Strategies to test ------------------------------------------------------
strategy_list = [
    "Hold",
    "KNN",
    "LinearSVC",
    "Logit",
    "XGBoost",
    "SVC",
    "Prophet",
    "SVC_proba",
]

### Run models

In [None]:
%%time
df_for_chart = prepd_data.loc[prepd_data['Date']>=s_date].reset_index(drop=True)
df_for_chart = df_for_chart.drop(columns=[
    col for col in df_for_chart.columns 
    if any(col.startswith(prefix) for prefix in exclude_vars) and col != indicator_config.target+"_"+indicator_config.ticker
])
df_for_chart = df_for_chart.dropna(axis='columns') # drop columns with an na

print(f'Training on data set with {len(df_for_chart)} rows and {df_for_chart.shape[1]-1} features')

# Calculate portfolio value over time
strat_bds, strat_mods = {}, {}
for strat in strategy_list:
    start_time = time.time()
    print(f'\n{strat}', end=" ")
    backtested_data,model,score = strat_defs.backtest_strategy(data=df_for_chart,
                                                               strategy=strat,
                                                               target=indicator_config.target,
                                                               ticker=indicator_config.ticker,
                                                               config=backtest_config,
                                                               initial_train_period=initial_train_period,
                                                               bko_window=bko_window,
                                                               random_state=random_state,
                                                               n_jobs=n_jobs)
    end_time = time.time()
    print(f'score = {score}, time = {end_time-start_time}')
    
    strat_bds[strat] = backtested_data
    strat_mods[strat] = model

In [None]:
# Strategies from strategy_list that are combined below; each one's backtest frame
# must provide 'proba_1' and 'Signal' columns.
proba_strats = [x for x in strategy_list if x not in ["Hold","LinearSVC","SVC","Prophet"]]

# Every combination of at least two of those strategies
combos = [x for x in gen_powerset(proba_strats) if len(x) > 1]

mod_mod_dic = {}
for mods in combos:
    # Model of models: start from the first strategy's frame (returns, target, its proba/signal)
    base_cols = ['Date','Daily_Return','Target','proba_1','Signal']
    if indicator_config.ticker != "SPY":
        base_cols.insert(1, 'Daily_Return_SPY')

    mod_mod = strat_bds[mods[0]][base_cols].rename(
        columns={'proba_1': 'proba_1'+mods[0], 'Signal': 'Signal_'+mods[0]}
    )

    # Join each remaining strategy's proba/signal on Date (its columns go first)
    for i in mods[1:]:
        strat_cols = strat_bds[i][['Date','proba_1','Signal']].rename(
            columns={'proba_1': 'proba_1'+i, 'Signal': 'Signal_'+i}
        )
        mod_mod = strat_cols.merge(mod_mod, on='Date')

    # Signal_all0: 0 only when all strats predict 0, otherwise 1
    signal_columns = mod_mod.columns[mod_mod.columns.str.contains('Signal')]
    mod_mod['Signal_all0'] = np.where(mod_mod[signal_columns].eq(0).all(axis=1), 0, 1)
    mod_mod['Strategy_Return_all0'] = mod_mod['Signal_all0'].shift(1) * mod_mod['Daily_Return']

    # Using strategy with most confident prediction (furthest from 50%)
    proba_cols = [col for col in mod_mod.columns if col.startswith('proba_1')]
    dist = (mod_mod[proba_cols] - 0.5).abs()
    for col in proba_cols:
        mod_mod["dist_"+col] = dist[col]

    # Rows without any probability are all-NaN. idxmax over an all-NA row is
    # deprecated in pandas 2.x and slated to raise, so only rows with at least
    # one probability are passed to it; the others stay NaN via index alignment.
    has_proba = dist.notna().any(axis=1)
    mod_mod['proba_1max_col'] = dist.loc[has_proba].idxmax(axis=1)

    # Probability of the most confident strategy; rows with no probability default to 1
    mod_mod['proba_1max'] = mod_mod.apply(
        lambda row: row[row['proba_1max_col']] if pd.notnull(row['proba_1max_col']) else 1, axis=1
    )

    mod_mod['Signal'] = mod_mod['proba_1max'].round()
    mod_mod['Strategy_Return'] = mod_mod['Signal'].shift(1) * mod_mod['Daily_Return']

    if indicator_config.ticker != "SPY":
        # NOTE(review): .loc slicing is label-based and inclusive, so row
        # `initial_train_period` itself also gets the SPY return — confirm intended.
        mod_mod.loc[:initial_train_period, 'Strategy_Return'] = mod_mod['Daily_Return_SPY']

    mod_mod.loc[0, 'Strategy_Return'] = np.nan

    mod_mod_dic["_".join(mods)] = mod_mod

### Results

In [None]:
initial_capital = 10000 # scalar


def _portfolio_trace(frame, return_col, label):
    """Build the portfolio-value line for one return series.

    Takes the rows of ``frame`` from position ``initial_train_period`` onward,
    treats the first of them as the starting point (return forced to 0, so the
    curve starts at ``initial_capital``) and compounds ``return_col`` from there.

    Parameters: ``frame`` is a backtest DataFrame with a 'Date' column and
    ``return_col``; ``label`` is the legend name. Returns a ``go.Scatter``.
    """
    after_train = frame.iloc[initial_train_period:]
    returns = after_train[return_col].copy()
    if not returns.empty:
        # Positional, so it hits the first plotted row whatever the index labels are,
        # and it zeroes the same column that is compounded below.
        returns.iloc[0] = 0
    portfolio_value = (1 + returns).cumprod() * initial_capital
    return go.Scatter(x=after_train['Date'], y=portfolio_value, mode='lines', name=label)


# Plot Daily Portfolio Value
fig = go.Figure()

# Buy-and-hold SPY baseline when the ticker under test is not SPY itself
if indicator_config.ticker != "SPY":
    fig.add_trace(_portfolio_trace(strat_bds['Hold'], 'Daily_Return_SPY', 'Portfolio Value SPY_Hold'))

for strat in strategy_list:
    fig.add_trace(_portfolio_trace(strat_bds[strat], 'Strategy_Return', f'Portfolio Value ({strat})'))

# Ensemble curves are optional: plot them only if the model-of-models cell has
# been run. Any other error is allowed to surface instead of being swallowed.
try:
    ensemble_frames = mod_mod_dic
except NameError:
    ensemble_frames = {}

for combo_name, combo_frame in ensemble_frames.items():
    fig.add_trace(_portfolio_trace(combo_frame, 'Strategy_Return', f"Portfolio Value ({combo_name})"))

fig.update_layout(title="Portfolio")
fig.show()

In [None]:
# % right
def _hit_rate(frame):
    """Share of rows where 'Signal' equals 'Target', after dropping rows with any NA.

    Returns NaN when no complete rows remain, and 0.0 (not a KeyError) when
    there are no correct calls.
    """
    scored = frame.dropna()
    if scored.empty:
        return float('nan')
    return (scored['Target'] == scored['Signal']).mean()


# Single strategies are scored on the rows after the initial training period
names = list(strategy_list)
rates = [_hit_rate(strat_bds[name][initial_train_period:]) for name in strategy_list]

# Ensembles are optional: include them only if the model-of-models cell has been
# run. Any other error is allowed to surface instead of being swallowed.
try:
    ensemble_frames = mod_mod_dic
except NameError:
    ensemble_frames = {}

for combo_name, combo_frame in ensemble_frames.items():
    names.append(combo_name)
    rates.append(_hit_rate(combo_frame))

# Keep the numeric rate for sorting; 'pct_right' stays the formatted string for display.
# Sorting the formatted strings would be lexicographic ('9.50%' > '52.34%').
review = pd.DataFrame({'strategy': names, 'hit_rate': rates})
review['pct_right'] = review['hit_rate'].map('{:.2%}'.format)

review.sort_values(by='hit_rate', ascending=False)[['strategy', 'pct_right']]

### Testing

In [None]:
########################################################################################################
# Testing ##############################################################################################
########################################################################################################

In [114]:
prepd_data = prep_data.prep_data(config=indicator_config, drop_tickers=True)

# Target column of the configured ticker, derived from indicator_config (as in
# the "Run models" cell) so this cell keeps working when the ticker or target changes.
target_col = indicator_config.target+"_"+indicator_config.ticker

# Keep rows from s_date onward and drop excluded-prefix columns other than the target column
df_for_chart = prepd_data.loc[prepd_data['Date']>=s_date].reset_index(drop=True)
df_for_chart = df_for_chart.drop(columns=[
    col for col in df_for_chart.columns
    if any(col.startswith(prefix) for prefix in exclude_vars) and col != target_col])
df_for_chart = df_for_chart.dropna(axis='columns') # drop columns with an na
df_for_chart.head()

Unnamed: 0,Date,Adj Close_MMM,Adj Close_SPY,movement_MMM,movement_SPY,views_MMM,views_SPY,sunlight_nyc,federal_funds_rate,high_temp_nyc,...,short_ema,long_ema,macd_line,Daily_Return,Daily_Return_SPY,day_of_week_name_Monday,day_of_week_name_Thursday,day_of_week_name_Tuesday,day_of_week_name_Wednesday,Target
0,2015-07-01,93.61058,175.434174,1676789.0,189688400.0,1306.0,2496.0,54124.706899,0.13,27.8,...,94.232879,94.870506,-0.637627,0.008749,0.008015,0,0,0,1,0
1,2015-07-02,93.448189,175.273605,-392791.3,-16759100.0,1384.0,2309.0,54086.256107,0.13,26.7,...,94.112158,94.765149,-0.652992,-0.001735,-0.000915,0,1,0,0,0
2,2015-07-06,93.267769,174.77475,-468722.1,-58852690.0,1338.0,2876.0,53897.71602,0.13,28.3,...,93.982252,94.654232,-0.671981,-0.001931,-0.002846,1,0,0,0,1
3,2015-07-07,93.478279,175.873856,721397.4,191046800.0,1245.0,3082.0,53842.09973,0.13,31.1,...,93.904717,94.567125,-0.662407,0.002257,0.006289,0,0,1,0,0
4,2015-07-08,91.842407,172.923157,-5112929.0,-483973900.0,1355.0,3807.0,53783.180302,0.13,31.1,...,93.587439,94.365294,-0.777855,-0.0175,-0.016777,0,0,0,1,1


In [116]:
data = df_for_chart.copy()
# Every column except the date and the label is used as a feature
feats = [col for col in data.columns if col not in ['Date', 'Target']]

# Drop rows with missing values due to rolling calculations
data = data.dropna().copy()

# Initial training window: the first `initial_train_period` rows (positional)
train_data = data.iloc[:initial_train_period]
X_train, y_train = train_data[feats], train_data['Target']

# Grid search for best parameters: scale -> PCA -> logistic regression.
# random_state is passed to the classifier because the saga and liblinear
# solvers shuffle the data; without it repeated runs can pick different parameters.
pipeline = make_pipeline(
    StandardScaler(),
    PCA(svd_solver='full'),
    LogisticRegression(random_state=random_state)
)

# Parameter grid with conditional n_jobs
param_grid = [
    {  # Case where solver is liblinear → NO n_jobs
        "pca__n_components": [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
        "logisticregression__C": np.logspace(-4, 4, 9),
        "logisticregression__solver": ["liblinear"],
        "logisticregression__max_iter": [100, 500, 1000],
    },
    {  # Case where solver is lbfgs or saga → USE n_jobs
        "pca__n_components": [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
        "logisticregression__C": np.logspace(-4, 4, 9),
        "logisticregression__solver": ["lbfgs", "saga"],
        "logisticregression__max_iter": [100, 500, 1000],
        "logisticregression__n_jobs": [n_jobs],  # Only set n_jobs for these solvers
    }
]

# TimeSeriesSplit keeps each validation fold after its training fold in row order
search = GridSearchCV(pipeline, param_grid, cv=TimeSeriesSplit(), n_jobs=n_jobs)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.best_estimator_.classes_)

{'logisticregression__C': 1.0, 'logisticregression__max_iter': 100, 'logisticregression__solver': 'liblinear', 'pca__n_components': 0.65}
[0 1]


In [118]:
# Inspect the current training slice. Right after the grid-search cell this is
# the first `initial_train_period` rows of `data`; the proba loop below reassigns it.
train_data

Unnamed: 0,Date,Adj Close_MMM,Adj Close_SPY,movement_MMM,movement_SPY,views_MMM,views_SPY,sunlight_nyc,federal_funds_rate,high_temp_nyc,...,short_ema,long_ema,macd_line,Daily_Return,Daily_Return_SPY,day_of_week_name_Monday,day_of_week_name_Thursday,day_of_week_name_Tuesday,day_of_week_name_Wednesday,Target
0,2015-07-01,93.610580,175.434174,1.676789e+06,1.896884e+08,1306.0,2496.0,54124.706899,0.13,27.8,...,94.232879,94.870506,-0.637627,0.008749,0.008015,0,0,0,1,0
1,2015-07-02,93.448189,175.273605,-3.927913e+05,-1.675910e+07,1384.0,2309.0,54086.256107,0.13,26.7,...,94.112158,94.765149,-0.652992,-0.001735,-0.000915,0,1,0,0,0
2,2015-07-06,93.267769,174.774750,-4.687221e+05,-5.885269e+07,1338.0,2876.0,53897.716020,0.13,28.3,...,93.982252,94.654232,-0.671981,-0.001931,-0.002846,1,0,0,0,1
3,2015-07-07,93.478279,175.873856,7.213974e+05,1.910468e+08,1245.0,3082.0,53842.099730,0.13,31.1,...,93.904717,94.567125,-0.662407,0.002257,0.006289,0,0,1,0,0
4,2015-07-08,91.842407,172.923157,-5.112929e+06,-4.839739e+08,1355.0,3807.0,53783.180302,0.13,31.1,...,93.587439,94.365294,-0.777855,-0.017500,-0.016777,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1885,2022-12-23,91.102127,371.283600,-3.431338e+06,1.271073e+08,964.0,1798.0,33288.444546,4.10,14.4,...,93.309709,93.941237,-0.631529,-0.011925,0.005752,0,0,0,0,1
1886,2022-12-27,91.162796,369.819458,1.314208e+05,-7.560565e+07,930.0,2160.0,33350.271345,4.10,1.7,...,92.979414,93.735427,-0.756013,0.000666,-0.003943,0,0,1,0,0
1887,2022-12-28,89.699272,365.223389,-3.432484e+06,-3.259142e+08,933.0,2884.0,33375.800327,4.10,8.3,...,92.474777,93.436453,-0.961675,-0.016054,-0.012428,0,0,0,1,1
1888,2022-12-29,91.428200,371.797546,4.261317e+06,4.402773e+08,928.0,2549.0,33405.314715,4.10,10.6,...,92.313765,93.287693,-0.973928,0.019275,0.018000,0,1,0,0,0


In [120]:
# Proba loop: walk forward one row at a time, refitting on all earlier rows
best_pipeline = search.best_estimator_
proba=0.5

# The proba_0 / proba_1 column names below rely on this class order
assert list(best_pipeline.classes_) == [0, 1], "expected binary classes [0, 1]"

proba_results = []
for i in range(initial_train_period, len(data)):
    # Train only on past data up to the current point
    train_data = data.iloc[:i]
    X_train = train_data[feats]
    y_train = train_data['Target']

    # Fit the pipeline (scaling + model training)
    best_pipeline.fit(X_train, y_train)

    # Predict for the next day. Select by position, like the training slice, so the
    # test row is always the one right after the training rows even when the index
    # labels are not 0..n-1 (e.g. after a dropna that removed rows).
    test_data = data.iloc[[i]]
    X_test = test_data[feats]

    # Store predictions under the row's index label so they align back onto `data`
    proba_results.append((data.index[i], best_pipeline.predict_proba(X_test)[0]))

# One row per predicted day; rows without a prediction stay NaN after alignment.
# Naming the columns up front keeps this working when there are no predictions.
proba_df = pd.DataFrame(
    [probas for _, probas in proba_results],
    index=[label for label, _ in proba_results],
    columns=["proba_0", "proba_1"],
)
data[["proba_0", "proba_1"]] = proba_df

# Rows with no prediction default to a signal of 1
data['Signal'] = np.where(data['proba_1'].fillna(1) > proba, 1, 0)

# Accuracy of the last fitted pipeline on its own training rows
score = best_pipeline.score(X_train, y_train)
model = best_pipeline.steps[-1][1]

In [84]:
# Class order of the fitted pipeline (this is the column order of predict_proba).
# A single print: wrapping it in a second print() also printed the inner call's None.
print(best_pipeline.classes_)
# print(search.best_params_)

[0 1]
None


In [60]:
# Class probabilities for the row currently held in X_test
# (the proba loop leaves its last test row there).
best_pipeline.predict_proba(X_test)[0]

array([0.47873988, 0.52126012])

In [78]:
# Predicted probabilities and resulting signal; rows without a prediction have NaN probabilities
data[["proba_0", "proba_1",'Signal']]

Unnamed: 0,proba_0,proba_1,Signal
0,,,1
1,,,1
2,,,1
3,,,1
4,,,1
...,...,...,...
2441,0.491470,0.508530,1
2442,0.463985,0.536015,1
2443,0.492428,0.507572,1
2444,0.469653,0.530347,1


In [None]:
# Columns pulled from each strategy's backtest frame for side-by-side review.
# LinearSVC is reviewed with the same columns minus 'proba_1'.
cols_for_review = ['Date', 'Adj Close_SPY', 'Target', 'streak0', 'streak1', 'proba_1', 'Signal']
cols_without_proba = [col for col in cols_for_review if col != 'proba_1']

logit, knn, boost = (
    strat_bds[name][cols_for_review] for name in ('Logit', 'KNN', 'XGBoost')
)
linearsvc = strat_bds['LinearSVC'][cols_without_proba]

# Currently disabled review frames:
# prophet = strat_bds['Prophet'][cols_for_review+['predicted_price_tomorrow']]
# svc = strat_bds['SVC'][cols_for_review]
# mlp = strat_bds['MLP'][cols_for_review]
# keras = strat_bds['Keras'][cols_for_review+['next_day_prediction']]

In [None]:
# Distribution of the Logit strategy's proba_1 values (rows without a prediction dropped).
# Titled so the figure can be told apart from the KNN and XGBoost histograms.
px.histogram(logit['proba_1'].dropna(), nbins=50, title="Logit: distribution of proba_1")

In [None]:
# Distribution of the KNN strategy's proba_1 values (rows without a prediction dropped).
# Titled so the figure can be told apart from the Logit and XGBoost histograms.
px.histogram(knn['proba_1'].dropna(), nbins=50, title="KNN: distribution of proba_1")

In [None]:
# Distribution of the XGBoost strategy's proba_1 values (rows without a prediction dropped).
# Titled so the figure can be told apart from the Logit and KNN histograms.
px.histogram(boost['proba_1'].dropna(), nbins=50, title="XGBoost: distribution of proba_1")