In [2]:
import glob
import os
import time
from datetime import datetime, date

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import strat_defs # custom functions
import prep_data

# Pick the newest 'sp_df_*.csv' snapshot in the working directory and load it.
sp_df_files = glob.glob('sp_df_*.csv')
if not sp_df_files:
    # Without this guard max() fails with "max() arg is an empty sequence",
    # which gives no hint about which file is missing or where it was expected.
    raise FileNotFoundError(
        f"No file matching 'sp_df_*.csv' found in {os.getcwd()!r}"
    )
# NOTE(review): os.path.getctime is platform-dependent (creation time on Windows,
# last metadata change on Unix) - confirm this is the intended notion of "latest".
sp_df_latest = max(sp_df_files, key=os.path.getctime)
sp_df_raw = pd.read_csv(sp_df_latest, parse_dates=['Date added'])

# Project loader; returns the five raw inputs that prep_data.prep_data consumes below.
stocks_df, wiki_pageviews, ffr_raw, weather, gt_adjusted = prep_data.load_data()

In [3]:
# --- Run configuration -------------------------------------------------------
s_date = "2015-07-01"  # first date kept in every prepared data set
# Column-name prefixes that are dropped before modelling (see the backtest loop).
exclude_vars = ("Open","High","Low","Close","Adj Close","Volume")

# Passed to backtest_strategy as initial_train_period and reused later as a row offset.
# Original note: 2015-07-01 start, predicting in 2024.
initial_train_period = 2140

# --- Ticker universe ---------------------------------------------------------
# Symbols that are skipped outright.
these_dont_work = ['BF.B', 'BRK.B', 'GOOG', 'FOX', 'NWS']

# Keep symbols whose 'Date added' is on or before s_date and that are not skipped.
added_in_time = sp_df_raw['Date added'] <= s_date
not_skipped = ~sp_df_raw['Symbol'].isin(these_dont_work)
to_test = sp_df_raw.loc[added_in_time & not_skipped, 'Symbol'].tolist()

In [None]:
%%time
# Backtest the "Logit" strategy once per ticker, keeping both the backtested
# frame and the fitted model under the key '<ticker>_Logit'.
strat_bds, strat_mods = {}, {}
price_prefixes = tuple(exclude_vars)  # str.startswith accepts a tuple of prefixes

for ticker in to_test:
    print(f'\n\033[1m{ticker}\033[0m')

    prepd_data = prep_data.prep_data(
        stocks_df,
        wiki_pageviews,
        ffr_raw,
        weather,
        gt_adjusted,
        config=prep_data.IndicatorConfig(ticker=ticker),
        drop_tickers=True,
    )

    # Restrict to the study window and renumber rows from zero.
    in_window = prepd_data['Date'] >= s_date
    df_for_chart = prepd_data.loc[in_window].reset_index(drop=True)

    # Drop every column starting with an excluded prefix, except this ticker's
    # own adjusted close.
    kept_price_col = "Adj Close_"+ticker
    cols_to_drop = [
        col for col in df_for_chart.columns
        if col != kept_price_col and col.startswith(price_prefixes)
    ]
    # Then discard any remaining column that contains at least one NaN.
    df_for_chart = df_for_chart.drop(columns=cols_to_drop).dropna(axis='columns')

    print(f'Training on data set with {len(df_for_chart)} rows and {df_for_chart.shape[1]-1} features')
    print("Logit", end=" ")

    start_time = time.time()
    backtested_data, model, score = strat_defs.backtest_strategy(
        data=df_for_chart,
        strategy="Logit",
        target='Adj Close',
        ticker=ticker,
        config=strat_defs.BacktestConfig(),
        initial_train_period=initial_train_period,
        n_jobs=-1,
    )
    end_time = time.time()
    print(f'score = {score}, time = {end_time-start_time}')

    result_key = f'{ticker}_Logit'
    strat_bds[result_key] = backtested_data
    strat_mods[result_key] = model


[1mMMM[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5300324675324676, time = 8.565546989440918

[1mABT[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5385551948051948, time = 13.258584022521973

[1mABBV[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5430194805194806, time = 12.903130054473877

[1mACN[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5499188311688312, time = 13.17265772819519

[1mADBE[0m
Training on data set with 2465 rows and 28 features
Logit score = 0.5369318181818182, time = 15.713664054870605

[1mAES[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5271915584415584, time = 14.146892786026001

[1mAFL[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5625, time = 12.656495094299316

[1mA[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5430194805194806, time = 7.200598955154419

[1mAPD[0m
Tr



score = 0.5568181818181818, time = 12.591730117797852

[1mAEE[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5450487012987013, time = 12.784030199050903

[1mAEP[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.549512987012987, time = 12.682819128036499

[1mAXP[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5243506493506493, time = 15.768457889556885

[1mAIG[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5284090909090909, time = 12.516451835632324

[1mAMT[0m
Training on data set with 2465 rows and 28 features
Logit score = 0.5373376623376623, time = 13.711375951766968

[1mAMP[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5275974025974026, time = 13.795675039291382

[1mAME[0m
Training on data set with 2465 rows and 29 features
Logit score = 0.5373376623376623, time = 12.651792049407959

[1mAMGN[0m
Training on data set with 2465 rows and 29 features
Logit 

In [None]:
# Benchmark: run the "Hold" strategy on SPY over the same window and with the
# same initial training period as the per-ticker backtests.
spy_config = prep_data.IndicatorConfig(ticker="SPY")
prepd_data = prep_data.prep_data(
    stocks_df, wiki_pageviews, ffr_raw, weather, gt_adjusted,
    config=spy_config,
    drop_tickers=True,
)

# Keep rows from s_date onward and renumber them from zero.
df_for_chart = prepd_data[prepd_data['Date'] >= s_date].reset_index(drop=True)

# Columns containing any NaN are dropped only for the call; df_for_chart itself
# is left unchanged.
spy_inputs = df_for_chart.dropna(axis='columns')
spy_data, spy_model, spy_score = strat_defs.backtest_strategy(
    data=spy_inputs,
    strategy="Hold",
    target='Adj Close',
    ticker="SPY",
    config=strat_defs.BacktestConfig(),
    initial_train_period=initial_train_period,
    n_jobs=-1,
)

In [None]:
# Build one wide frame holding, per date, every ticker's daily return and its
# Logit 'proba_1', then derive a "pick the highest proba_1" strategy return that
# falls back to SPY when no ticker clears the 0.7 threshold.
df_to_build = strat_bds[to_test[0]+"_Logit"][['Date']] # start with just date column

# Get results for all tickers in to_test (inner join: only dates common to all survive)
for ticker in to_test:
    df = strat_bds[ticker+"_Logit"][['Date','Daily_Return','proba_1']]
    df = df.rename(columns={'Daily_Return': f'Daily_Return_{ticker}', 'proba_1': f'{ticker}_proba_1Logit'})
    df_to_build = df_to_build.merge(df,on='Date')

proba_cols = [col for col in df_to_build.columns if 'proba_1' in col]

# The fallback logic below reads 'Daily_Return_SPY', but the loop above only adds
# columns for tickers in to_test, so the benchmark return has to be joined in
# explicitly (otherwise the threshold fallback raises KeyError).
# NOTE(review): assumes spy_data exposes 'Daily_Return' like the Logit backtests do - confirm.
if 'Daily_Return_SPY' not in df_to_build.columns:
    spy_returns = spy_data[['Date','Daily_Return']].rename(columns={'Daily_Return': 'Daily_Return_SPY'})
    # Left join keeps the row count and order, so the positional slice below is unaffected;
    # validate= fails loudly if SPY has duplicate dates instead of silently adding rows.
    df_to_build = df_to_build.merge(spy_returns, on='Date', how='left', validate='many_to_one')

# filter to after training period (avoid all rows nan error)
# .copy() so the column assignments below write to an independent frame, not a slice view
df_to_build = df_to_build[initial_train_period:].copy()

df_to_build['proba_1max'] = df_to_build[proba_cols].max(axis=1) # max value across all proba_1 cols

# Column name that proba_1max is in. idxmax is only evaluated on rows that have at
# least one non-NaN probability; rows without any are left as NaN by index alignment.
has_signal = df_to_build[proba_cols].notna().any(axis=1)
df_to_build['proba_1max_col'] = df_to_build.loc[has_signal, proba_cols].idxmax(axis=1)

# Daily return column name of the relevant ticker. An explicit lookup table mirrors the
# rename in the loop above instead of re-parsing the ticker out of the column name.
return_col_for = {f'{ticker}_proba_1Logit': f'Daily_Return_{ticker}' for ticker in to_test}
df_to_build['proba_1max_ticker'] = df_to_build['proba_1max_col'].map(return_col_for)

# Daily return value of ticker with highest predicted probability of increase
# (SPY's return when no ticker has a probability on that date)
df_to_build['proba_1max_ticker_Daily_return'] = df_to_build.apply(
    lambda row: row[row['proba_1max_ticker']] if pd.notnull(row['proba_1max_col']) else row['Daily_Return_SPY'], axis=1
)

df_to_build['Strategy_Return'] = df_to_build['proba_1max_ticker_Daily_return']

# Not confident enough in any ticker: hold SPY for that day instead
df_to_build.loc[df_to_build['proba_1max'] < 0.7, 'Strategy_Return'] = df_to_build['Daily_Return_SPY']

In [None]:
initial_capital = 10000 # scalar

fig = go.Figure()

# SPY benchmark rows after the training period; copied so spy_data is not modified.
df = spy_data[initial_train_period:].copy()

# Both frames get the same treatment, in place: zero the return at index label
# initial_train_period, compound the returns into a portfolio value, and add a line.
for frame, trace_name in ((df, 'PV SPY_Hold'), (df_to_build, "PV (Logit)")):
    frame.loc[initial_train_period, 'Strategy_Return'] = 0
    growth = (1 + frame['Strategy_Return']).cumprod()
    frame['Portfolio_Value'] = growth * initial_capital
    fig.add_trace(
        go.Scatter(
            x=frame['Date'],
            y=frame['Portfolio_Value'],
            mode='lines',
            name=trace_name,
        )
    )

fig.update_layout(title="Portfolio")
fig.show()