In [2]:
# Auto reload local files
%load_ext autoreload
%autoreload 2
# Make files in src/ available to notebook
import sys
if 'src' not in sys.path:
    sys.path.insert(0, 'src')


import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from technical_signals import TechnicalSignals, percent_change
import datastore as ds

# Load the constituent list (presumably the SPY / S&P 500 members, judging by
# the file name -- TODO confirm). Its 'Symbol' column supplies the tickers that
# the data-loading loop iterates over. `header=0` treats the first CSV row as
# column names. The path is relative to the notebook's working directory.
spy_constituents = pd.read_csv('../../data/spy_constituents.csv', header=0)

def test_model(model, X_test, y_test):
    # Predict
    y_predicted = model.predict(X_test)
    # Error and % correct
    loss = np.mean(np.square(y_predicted - y_test))
    correct = ((y_predicted > 0) == (y_test > 0)).sum() / len(y_predicted) * 100
    up = (y_test > 0).sum() / len(y_test) * 100
    print(f'Correctly predicted direction of change {correct}% of the time')
    print(f'Up {up}% of the time')
    print('MSE Loss:', loss)
    print(f'Standard deviation: actual = {np.std(y_test)}, predicted = {np.std(y_predicted)}')
    
    return y_predicted[-1]
    
# Build one pooled dataset by stacking the per-ticker feature matrices, then
# hold out 10% of the rows for testing.
Xs, ys, Xy_dates = [], [], []
failed_tickers = []  # tickers skipped because loading / feature extraction failed
for ticker in spy_constituents['Symbol']:
    print('Ticker:', ticker)
    try:
        # Load cached candlesticks. To refresh the cache first, run:
        #   ds.download_daily_candlesticks([ticker], "2000-01-01", "2023-05-07")
        data = ds.get_daily_candlesticks([ticker], "2000-01-01", "2023-06-06")[ticker]
        # Get technical indicator features for use in forest model
        tsigs = TechnicalSignals(data, predict_window=7)
        X_ticker, y_ticker, Xy_date_ticker = tsigs.toXy()
    except Exception as exc:
        # Best effort: one bad ticker must not abort the run, but record and
        # report it instead of hiding it. Catching `Exception` (not a bare
        # `except:`) lets KeyboardInterrupt still stop this long loop.
        failed_tickers.append(ticker)
        print(f'  Skipping {ticker}: {exc!r}')
        continue
    # Per-ticker names are kept distinct from the pooled X / y / Xy_date below
    # so a partially run cell never leaves a single ticker's data in `X`.
    Xs.append(X_ticker)
    ys.append(y_ticker)
    Xy_dates.append(Xy_date_ticker)

if failed_tickers:
    print(f'Skipped {len(failed_tickers)} ticker(s): {failed_tickers}')
if not Xs:
    raise ValueError('No ticker produced any samples; see the skip messages above.')

X = np.concatenate(Xs)
y = np.concatenate(ys)
Xy_date = np.concatenate(Xy_dates)

# Fixed seed so the split (and every metric computed from it) is reproducible
# under Restart & Run All.
# NOTE(review): this is a random split, so test rows are interleaved in time
# and across tickers with training rows. If toXy() produces overlapping
# look-ahead windows this probably leaks information into the test set --
# consider a chronological split. It also means the test rows are NOT the last
# rows of Xy_date.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

Ticker: MMM
Ticker: AOS
Ticker: ABT
Ticker: ABBV
Ticker: ABMD
Ticker: ACN
Ticker: ATVI
Ticker: ADM
Ticker: ADBE
Ticker: AAP
Ticker: AMD
Ticker: AES
Ticker: AFL
Ticker: A
Ticker: APD
Ticker: AKAM
Ticker: ALK
Ticker: ALB
Ticker: ARE
Ticker: ALGN
Ticker: ALLE
Ticker: LNT
Ticker: ALL
Ticker: GOOGL
Ticker: GOOG
Ticker: MO
Ticker: AMZN
Ticker: AMCR
Ticker: AEE
Ticker: AAL
Ticker: AEP
Ticker: AXP
Ticker: AIG
Ticker: AMT
Ticker: AWK
Ticker: AMP
Ticker: ABC
Ticker: AME
Ticker: AMGN
Ticker: APH
Ticker: ADI
Ticker: ANSS
Ticker: ANTM
Ticker: AON
Ticker: APA
Ticker: AAPL
Ticker: AMAT
Ticker: APTV
Ticker: ANET
Ticker: AJG
Ticker: AIZ
Ticker: T
Ticker: ATO
Ticker: ADSK
Ticker: ADP
Ticker: AZO
Ticker: AVB
Ticker: AVY
Ticker: BKR
Ticker: BLL
Ticker: BAC
Ticker: BBWI
Ticker: BAX
Ticker: BDX
Ticker: BRK.B
Ticker: BBY
Ticker: BIO
Ticker: TECH
Ticker: BIIB
Ticker: BLK
Ticker: BK
Ticker: BA
Ticker: BKNG
Ticker: BWA
Ticker: BXP
Ticker: BSX
Ticker: BMY
Ticker: AVGO
Ticker: BR
Ticker: BRO
Ticker: BF.B
Ticker: 

In [5]:
# Try out xgboost and some of sklearn's models
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

print(X.shape)

# Candidate regressors, each with its library-default hyperparameters.
models = [
    ('XGBoost', xgb.XGBRegressor()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
]

for name, model in models:
    print('Training model:', name)
    # Fit and score the estimator selected by this iteration. Using the loop
    # variable matters: referring to any other name would either fail on a
    # fresh kernel or silently re-train one stale model under every label.
    model.fit(X_train, y_train)
    # Print predictions
    test_model(model, X_test, y_test)

(2474117, 24)
Training model: XGBoost
Correctly predicted direction of change 53.45415743779607% of the time
Up 50.62891048130245% of the time
MSE Loss: 0.9489332375071751
Standard deviation: actual = 1.0028448656227407, predicted = 0.2429320123814722
Training model: Decision Tree
Correctly predicted direction of change 53.25893651075938% of the time
Up 50.62891048130245% of the time
MSE Loss: 0.9493220565431066
Standard deviation: actual = 1.0028448656227407, predicted = 0.24506557316138333
Training model: Random Forest
Correctly predicted direction of change 53.157082114044584% of the time
Up 50.62891048130245% of the time
MSE Loss: 0.9498113328297667
Standard deviation: actual = 1.0028448656227407, predicted = 0.2439806717240923


## Results

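No significant results were obtained: each run above predicted the direction of change correctly about 53% of the time, against a base rate of about 50.6% upward moves. The RandomForestRegressor's success in predicting direction of change for single assets is reduced by adding data from other assets.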

Possible paths forward:

- Train CNN/RNN on individual stock price data
  - Is it possible to train on multiple stocks without implying that the last sample of one ticker is related to the first sample of the next?
- Split into industries and try again