In [None]:
import glob
import os
import time

import numpy as np
import pandas as pd
import plotly.graph_objects as go

import strat_defs # custom functions
import prep_data

In [None]:
def gen_powerset(some_list):
    """Return every subset of ``some_list`` as a list of lists.

    The result is grown one element at a time: for each element, every subset
    collected so far is copied with that element appended. The empty subset
    therefore comes first, and elements keep their original relative order
    inside each subset. The input is not modified.
    """
    subsets = [[]]

    for element in some_list:
        # Built from the subsets known *before* this element is added.
        with_element = [subset + [element] for subset in subsets]
        subsets.extend(with_element)

    return subsets

In [None]:
# Find the saved S&P 500 constituents snapshots in the working directory.
sp_df_files = glob.glob('sp_df_*.csv')
if not sp_df_files:
    # Without this check, max() fails with "max() arg is an empty sequence",
    # which says nothing about the missing input file.
    raise FileNotFoundError("No file matching 'sp_df_*.csv' found in the working directory")

# Use the snapshot with the largest os.path.getctime value (the newest one).
sp_df_latest = max(sp_df_files, key=os.path.getctime)
sp_df_raw = pd.read_csv(sp_df_latest, parse_dates=['Date added'])

# Remaining inputs come from the project's own loader.
stocks_df, wiki_pageviews, ffr_raw, weather, gt_adjusted = prep_data.load_data()

In [None]:
# Distinct GICS sectors in the constituents table (displayed as the cell output).
set(sp_df_raw.loc[:, 'GICS Sector'])

### Set configuration and parameters for model testing

In [None]:
# --- Indicator configuration ---
# NOTE(review): none of these three objects is referenced again in the visible
# cells (IndicatorConfig is later built with only `ticker=`) - confirm they are
# actually picked up somewhere.
moving_average_config = prep_data.MovingAverageConfig(
    short_window=10,
    long_window=50,
)
bollinger_config = prep_data.BollingerConfig(
    window=90,
    num_std=3.0,
)
macd_config = prep_data.MACDConfig(
    short_window=12,
    long_window=26,
)

# --- Backtest configuration ---
# One `proba` value per model family; presumably the probability cut-off used
# to turn a prediction into a signal - TODO confirm in strat_defs.
keras_config = strat_defs.KerasConfig(
    proba=0.5,
    sequence_length=30,
    epochs=20,
)
proba_config = strat_defs.ProbaConfig(
    knn=0.5,
    logit=0.5,
    mlp=0.5,
    rf=0.5,
    svc=0.5,
    xgboost=0.5,
)

# Bundle handed to strat_defs.backtest_strategy as `config`.
backtest_config = strat_defs.BacktestConfig(
    retrain_days=10,
    proba=proba_config,
    keras=keras_config,
)

**s_date**\
Data start date. The S&P 500 data goes back no further than 1993-01-29, and Wikipedia page views no further than 2015-07-01.

**exclude_vars**\
enum {"Open","High","Low","Close","Adj Close","Volume","movement", "views"}

**strategy_list**\
enum {"Hold","KNN","Logit","MLP","RandomForest","SVC_proba","XGBoost","Keras","Perfection"}

From fastest to slowest (usually): Hold, Perfection, KNN, Logit, XGBoost, SVC_proba (200s), MLP (2978s), Keras (4048s)

For Best Prediction, I'm removing strategies that do not give a probability

\
*If testing Breakout, "High" and "Low" cannot be excluded*

In [None]:
# --- Data window and feature exclusions ---
s_date = "2015-07-01"
exclude_vars = ("Open","High","Low","Close","Adj Close","Volume")

# Number of leading rows used for the initial fit (alternatives kept for reference).
initial_train_period = 1890 # 2015-07-01 start predicting in 2023
# initial_train_period = 2140 # 2015-07-01 start predicting in 2024
# initial_train_period = 7535 # 1993-01-29 start predicting in 2024
random_state = 42
n_jobs = -1

# --- Stocks to test ---
these_dont_work = ['GOOG', 'FOX', 'NWS']
# NOTE(review): the cutoff literal below equals s_date; confirm whether the two
# are meant to move together when s_date changes.
added_by_cutoff = sp_df_raw['Date added']<="2015-07-01"
# NOTE(review): later cells read strat_bds['SPY_Hold'] and a 'Daily_Return_SPY'
# column, so 'SPY' has to be among these symbols - verify the CSV contains it.
to_test = [
    symbol
    for symbol in sp_df_raw.loc[added_by_cutoff,'Symbol']
    if symbol not in these_dont_work
]

# --- Strategies to test ---
# strategy_list = ["Hold","KNN","Logit"] # 2h 2min with n_jobs = -1, KNN always best prob for some reason?
strategy_list = ["Hold","KNN","Logit", "XGBoost"] # 13h 47min with n_jobs = None, KNN always best prob for some reason?
                                                  # 12h 23min with n_jobs = -1

### Run models

In [None]:
%%time
# Backtest every (ticker, strategy) pair. Both result dicts are keyed
# '<ticker>_<strategy>': strat_bds holds the backtested frame, strat_mods the model.
strat_bds, strat_mods = {}, {}
for ticker in to_test:
    print(f'\n\033[1m{ticker}\033[0m')
    indicator_config = prep_data.IndicatorConfig(ticker=ticker)
    prepd_data = prep_data.prep_data(
        stocks_df, wiki_pageviews, ffr_raw, weather, gt_adjusted,
        config=indicator_config, drop_tickers=True,
    )

    # Keep rows from s_date onward, re-indexed from 0.
    df_for_chart = prepd_data.loc[prepd_data['Date']>=s_date].reset_index(drop=True)

    # Drop every column whose name starts with an excluded prefix, except this
    # ticker's own target column.
    target_col = indicator_config.target+"_"+ticker
    excluded_prefixes = tuple(exclude_vars)
    cols_to_drop = [
        col for col in df_for_chart.columns
        if col != target_col and col.startswith(excluded_prefixes)
    ]
    df_for_chart = df_for_chart.drop(columns=cols_to_drop)
    df_for_chart = df_for_chart.dropna(axis='columns') # drop columns with an na

    print(f'Training on data set with {len(df_for_chart)} rows and {df_for_chart.shape[1]-1} features')

    # Calculate portfolio value over time, one strategy at a time
    for strat in strategy_list:
        result_key = f'{ticker}_{strat}'
        start_time = time.time()
        print(f'{strat}', end=" ")
        # NOTE(review): `target` is the literal 'Adj Close' here, while the column
        # kept above is derived from indicator_config.target - confirm they agree.
        backtested_data, model, score = strat_defs.backtest_strategy(
            data=df_for_chart,
            strategy=strat,
            target='Adj Close',
            ticker=ticker,
            config=backtest_config,
            initial_train_period=initial_train_period,
            random_state=random_state,
            n_jobs=n_jobs,
        )
        end_time = time.time()
        print(f'score = {score}, time = {end_time-start_time}')

        strat_bds[result_key] = backtested_data
        strat_mods[result_key] = model

In [None]:
# Build one "follow yesterday's most confident pick" strategy for every
# non-empty combination of probability-producing models.
benchmark_ticker = 'SPY'     # held whenever there is no sufficiently confident pick
min_proba_to_trade = 0.7     # yesterday's best proba_1 must reach this to leave the benchmark
benchmark_return_col = f'Daily_Return_{benchmark_ticker}'

proba_strats = [x for x in strategy_list if x not in ["Hold"]]

combos = gen_powerset(proba_strats)
combos.remove([])

if combos and benchmark_ticker not in to_test:
    # The benchmark's return column is only created while looping over to_test,
    # so fail here instead of with a KeyError deep inside the row-wise apply.
    raise KeyError(
        f"'{benchmark_ticker}' must be in to_test: its daily return is the fallback for every combination"
    )


def _return_of_yesterdays_pick(row):
    """Today's return of the ticker picked yesterday, or the benchmark's if there was no pick."""
    if pd.notnull(row['yesterday_proba_1max_col']):
        return row[row['yesterday_proba_1max_ticker']]
    return row[benchmark_return_col]


all_mod_results_dic = {}
for mods in combos:
    print(mods)
    df_to_build = strat_bds[to_test[0]+"_"+mods[0]][['Date']] # start with just date column

    # Get results for all tickers in to_test, in this subgroup of models, eg ['KNN', 'Logit']
    for ticker in to_test:

        # The first model supplies the ticker's daily return along with its probability
        df_prev = strat_bds[ticker+"_"+mods[0]][['Date','Daily_Return','proba_1']]
        df_prev = df_prev.rename(columns={'Daily_Return': f'Daily_Return_{ticker}'
                                          ,'proba_1': f'{ticker}_proba_1{mods[0]}'})
        df_to_build = df_to_build.merge(df_prev,on='Date')

        # The remaining models in this combination only add their probability column
        for model in mods[1:]:
            df = strat_bds[ticker+"_"+model][['Date','proba_1']].copy()
            df = df.rename(columns={'proba_1': f'{ticker}_proba_1{model}'})
            df_to_build = df_to_build.merge(df,on='Date')

    proba_cols = [col for col in df_to_build.columns if 'proba_1' in col]

    df_to_build['proba_1max'] = df_to_build[proba_cols].max(axis=1) # max value across all proba_1 cols

    # Column name that proba_1max is in (only for rows with at least one probability)
    mask = df_to_build[proba_cols].notna().any(axis=1)
    df_to_build.loc[mask, 'proba_1max_col'] = df_to_build.loc[mask, proba_cols].idxmax(axis=1, skipna=True)

    # Daily return column name of relevant ticker ('<ticker>_proba_1<model>' -> 'Daily_Return_<ticker>')
    df_to_build['proba_1max_ticker'] = "Daily_Return_"+df_to_build['proba_1max_col'].str.split('_').str[0]

    # Shift by one row so that today's position is decided by yesterday's probabilities
    df_to_build['yesterday_proba_1max'] = df_to_build['proba_1max'].shift(1)
    df_to_build['yesterday_proba_1max_col'] = df_to_build['proba_1max_col'].shift(1)
    df_to_build['yesterday_proba_1max_ticker'] = df_to_build['proba_1max_ticker'].shift(1)

    # Value of daily return column of relevant ticker
    df_to_build['yesterday_proba_1max_ticker_today_Daily_return'] = df_to_build.apply(
        _return_of_yesterdays_pick, axis=1
    )

    df_to_build['Strategy_Return'] = df_to_build['yesterday_proba_1max_ticker_today_Daily_return']

    # Not confident enough yesterday -> hold the benchmark today
    low_confidence = df_to_build['yesterday_proba_1max'] < min_proba_to_trade
    df_to_build.loc[low_confidence, 'Strategy_Return'] = df_to_build[benchmark_return_col]
    # The first row has no "yesterday", so it gets no strategy return
    df_to_build.loc[0, 'Strategy_Return'] = np.nan

    # df_to_build has results for this combination of models for all tickers
    all_mod_results_dic["_".join(mods)] = df_to_build

### Results

In [None]:
initial_capital = 10000 # scalar


def _portfolio_curve(results, start_row, capital):
    """Return the rows of ``results`` from ``start_row`` on, with a 'Portfolio_Value' column.

    The return at ``start_row`` is set to 0 so the curve starts at ``capital``;
    after that the value compounds 'Strategy_Return' day by day.
    """
    curve = results[start_row:].copy()
    # Label-based on purpose (unchanged behaviour); assumes a default 0..n-1
    # index so that label `start_row` is the first row of the slice - TODO confirm.
    curve.loc[start_row, 'Strategy_Return'] = 0
    curve['Portfolio_Value'] = (1 + curve['Strategy_Return']).cumprod() * capital
    return curve


# Plot Daily Portfolio Value
fig = go.Figure()

# Benchmark: buy-and-hold SPY over the prediction period
df = _portfolio_curve(strat_bds['SPY_Hold'], initial_train_period, initial_capital)
fig.add_trace(go.Scatter(x=df['Date'], y=df['Portfolio_Value'],
                         mode='lines', name='PV SPY_Hold'))

# Final portfolio value per curve, reused by the summary table cell
end_val = {'SPY':df['Portfolio_Value'].values[-1]}
for mods in combos:
    combo_name = "_".join(mods)
    df = _portfolio_curve(all_mod_results_dic[combo_name], initial_train_period, initial_capital)

    fig.add_trace(go.Scatter(x=df['Date'], y=df['Portfolio_Value'],
                             mode='lines', name=f"PV ({combo_name})"))
    end_val[combo_name] = df['Portfolio_Value'].values[-1]

fig.update_layout(title="Portfolio", xaxis_title="Date", yaxis_title="Portfolio value")
fig.show()

In [None]:
# Which ticker is the easiest to predict? Which am I correct most often on?
# For every ticker and individual strategy: share of post-training rows where
# the Signal equals the Target.
# Only individual strategies are scored: the combined frames in
# all_mod_results_dic carry no 'Target'/'Signal' columns.
dat_review = []
for ticker in to_test:
    for strat in strategy_list:
        after_train = strat_bds[ticker + "_" + strat][initial_train_period:]
        cc = after_train.dropna().reset_index(drop=True)
        # Mean of the boolean hits = fraction correct; unlike
        # value_counts(normalize=True)[True] it yields 0.0 (not a KeyError)
        # when a strategy never gets a row right.
        pct_right = (cc['Target']==cc['Signal']).mean()
        dat_review.append({'ticker': ticker, 'strategy': strat, 'pct_right': pct_right})

final_review = pd.DataFrame(dat_review, columns=['ticker', 'strategy', 'pct_right'])
final_review = final_review.pivot(index='ticker', columns='strategy', values='pct_right')
final_review = final_review.reset_index()
final_review.columns.name = None

# Best strategy per ticker, over whatever strategies were actually run
strategy_cols = list(strategy_list)
final_review['max_col'] = final_review[strategy_cols].idxmax(axis=1)
final_review['max'] = final_review[strategy_cols].max(axis=1)

# Edge of each model over the 'Hold' baseline
diff_cols = []
for strat in strategy_cols:
    if strat == 'Hold':
        continue
    final_review[f'{strat}_diff'] = final_review[strat] - final_review['Hold']
    diff_cols.append(f'{strat}_diff')

if diff_cols:
    final_review['diff_max_col'] = final_review[diff_cols].idxmax(axis=1)
    final_review['diff_max'] = final_review[diff_cols].max(axis=1)
    final_review['diff_mean'] = final_review[diff_cols].mean(axis=1)

    final_review = final_review.sort_values(by='diff_mean', ascending=False).reset_index(drop=True)

# final_review = final_review.loc[final_review['diff_mean']>0]
final_review.loc[final_review['ticker'].isin(["PAYC","EPAM","TDY","MKTX", "HII", "MTD", "SW", "GEV", "VLTO", "TPL", "SOLV", "ERIE", "NVR"])]
# final_review.head(20)

In [None]:
# Final portfolio value of every plotted curve, ranked by its margin over the
# SPY buy-and-hold curve (the 'ticker' column holds the end_val keys).
pl_df = (
    pd.Series(end_val, name='end_val')
    .rename_axis('ticker')
    .reset_index()
)
pl_df['vs_SPY'] = pl_df['end_val'].sub(end_val['SPY'])
pl_df.sort_values(by='vs_SPY', ascending=False)