## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

# Suppress ta warnings
import warnings
warnings.filterwarnings("ignore")

# Auto reload local files
%load_ext autoreload
%reload_ext autoreload
%autoreload 2
# Make files in src/ available to notebook
import sys
if 'src' not in sys.path:
    sys.path.insert(0, '../src')

In [2]:
# Read SPY csv, define config
spy_constituents = list(pd.read_csv('../../data/spy_constituents.csv', header=0)['Symbol'])
random.shuffle(spy_constituents)

tickers = spy_constituents
start_date = "2000-01-01"
end_date = "2025-01-01"
predict_window = 14

## Sync & Load Data, Create Indicators

In [3]:
# Load the data from db
from sklearn.model_selection import train_test_split

import datastore as ds
from technical_signals import TechnicalSignalSet

ds.download_daily_candlesticks(tickers, start_date, end_date)
candlesticks = ds.get_daily_candlesticks(tickers, start_date, end_date)

Xs = []
ys = []

for ticker in tickers:
    try:
        technical_sigs = TechnicalSignalSet(candlesticks[ticker], predict_window)
        X, y, Xy_date = technical_sigs.to_xy()
        Xs.append(X)
        ys.append(y)
    except Exception as ex:
        print(f"Exception on {ticker}:")
        print(ex)

X = np.concatenate(Xs, axis=0)
y = np.concatenate(ys, axis=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

Error downloading daily candlesticks for VIAC:
No data fetched for symbol VIAC using YahooDailyReader
Error downloading daily candlesticks for PBCT:
No data fetched for symbol PBCT using YahooDailyReader
Error downloading daily candlesticks for BLL:
No data fetched for symbol BLL using YahooDailyReader
Error downloading daily candlesticks for CERN:
No data fetched for symbol CERN using YahooDailyReader
Error downloading daily candlesticks for BRK.B:
'Date'
Error downloading daily candlesticks for DISCK:
No data fetched for symbol DISCK using YahooDailyReader
Error downloading daily candlesticks for INFO:
No data fetched for symbol INFO using YahooDailyReader
Error downloading daily candlesticks for XLNX:
No data fetched for symbol XLNX using YahooDailyReader
Error downloading daily candlesticks for BF.B:
'Date'
Error downloading daily candlesticks for WLTW:
No data fetched for symbol WLTW using YahooDailyReader
Error downloading daily candlesticks for KSU:
No data fetched for symbol KS

## Clean Data For Training

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import gc


def round_batch_size(sample_count, approximately, leeway=None):
    """
    Round batch size to a more suitable value. This helps to avoid a
    problem where the final batch has a lot of samples, but not enough for
    a full batch, leading to many samples being thrown out.

    approximately: int, leeway: int
      decide on a chunk size around a number, with specified leeway
      (leeway defaults to `approximately // 10`).
    """
    if leeway is None:
        leeway = approximately // 10
    
    # Get the number of leftover samples if we use the suggested batch size
    best_leftover = sample_count - np.floor(sample_count / approximately) * approximately

    # Brute-force search for the value that yeilds the fewest leftovers
    # within the given leeway range.
    best_chunk_count = approximately
    for offset in range(-leeway, leeway):
        chunk_size = approximately + offset
        leftover = sample_count - np.floor(sample_count / chunk_size) * chunk_size
        if leftover < best_leftover:
            best_leftover = leftover
            best_chunk_count = chunk_size
    return best_chunk_count
            

batch_size = round_batch_size(X_train.shape[0], 1024, leeway=200)
n_features = X_train.shape[1]

# Convert X, y to torch tensors
X_train_tensor = torch.from_numpy(X_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_train_tensor = torch.from_numpy(y_train.reshape(y_train.shape[0], 1)).float()
y_test_tensor = torch.from_numpy(y_test.reshape(y_test.shape[0], 1)).float()

print(X_train_tensor.shape)
print('Batch size:', batch_size)

# Generators
training_set = TensorDataset(X_train_tensor, y_train_tensor)
dataloader_train = DataLoader(training_set, shuffle=True, batch_size=batch_size)

validation_set = TensorDataset(X_test_tensor, y_test_tensor)
dataloader_test = DataLoader(validation_set, shuffle=True, batch_size=batch_size)

# Release duplicated memory
try:
    del X
    del y
    del Xs
    del ys
    #del X_train
    #del X_test
    #del y_train
    #del y_test
    del X_train_tensor
    del X_test_tensor
    del y_train_tensor
    del y_test_tensor
except:
    pass
gc.collect()

torch.Size([2059299, 58])
Batch size: 1023


0

## Create the NN model

In [5]:
n_outputs = 1

net = nn.Sequential(
    nn.Linear(n_features, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, n_outputs),
)

net = nn.Sequential(
    nn.Linear(n_features, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, n_outputs),
)

## Training

In [6]:
device = torch.device("cuda")

# Set device for model
net = net.to(device)

# Select optimizerand loss criteria
criterion = torch.nn.MSELoss() 
optimizer = torch.optim.Adam(net.parameters(), lr=0.001) 

# Training the model
for epoch in range(200):
    train_loss = 0.0
    
    # Training
    net.train()
    for local_batch, local_labels in dataloader_train:
        if local_batch.shape[0] != batch_size:
            print(f"Wrong train batch size. Skipping batch.\nThrowing away {local_batch.shape[0]} samples.")
            continue
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)
        
        # Forward pass: Compute predicted y by passing x to the model 
        y_pred = net(local_batch)
        # Compute and print loss 
        loss = criterion(y_pred, local_labels)
        # Zero gradients, perform a backward pass, update the weights. 
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step() 
        # Update loss
        train_loss += loss.item()

    # Validation
    net.eval()
    valid_loss = 0.0
    for data, labels in dataloader_test:
        if data.shape[0] != batch_size:
            continue
        data, labels = data.to(device), labels.to(device)
        
        target = net(data)
        loss = criterion(target,labels)
        valid_loss += loss.item()

    print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(dataloader_train)} \t\t Validation Loss: {valid_loss / len(dataloader_test)}')

Epoch 1 		 Training Loss: 0.9310950853311772 		 Validation Loss: 0.8938240488725049
Epoch 2 		 Training Loss: 0.8858433501731088 		 Validation Loss: 0.8692614970994847
Epoch 3 		 Training Loss: 0.8629090928107545 		 Validation Loss: 0.8515444243592876
Epoch 4 		 Training Loss: 0.8449403086054461 		 Validation Loss: 0.8386445564350912
Epoch 5 		 Training Loss: 0.8309071763676313 		 Validation Loss: 0.8224686865827867
Epoch 6 		 Training Loss: 0.8181151853114648 		 Validation Loss: 0.8128670432737896
Epoch 7 		 Training Loss: 0.8061985162082621 		 Validation Loss: 0.8043306044169835
Epoch 8 		 Training Loss: 0.7956835946454022 		 Validation Loss: 0.7997041254171303
Epoch 9 		 Training Loss: 0.7859410556820691 		 Validation Loss: 0.7916998474725655
Epoch 10 		 Training Loss: 0.7784018351317518 		 Validation Loss: 0.7882710402565343
Epoch 11 		 Training Loss: 0.7715815457312314 		 Validation Loss: 0.7816575454281909
Epoch 12 		 Training Loss: 0.7643593970780579 		 Validation Loss: 0.776689

Epoch 98 		 Training Loss: 0.631740749237612 		 Validation Loss: 0.6843317611409085
Epoch 99 		 Training Loss: 0.631308998300019 		 Validation Loss: 0.6874754979674306
Epoch 100 		 Training Loss: 0.6306774932178644 		 Validation Loss: 0.683356162160635
Epoch 101 		 Training Loss: 0.6306159685514368 		 Validation Loss: 0.6892720112310988
Epoch 102 		 Training Loss: 0.6299993561620565 		 Validation Loss: 0.6880030645323651
Epoch 103 		 Training Loss: 0.6296885261710935 		 Validation Loss: 0.6931235752999783
Epoch 104 		 Training Loss: 0.6291802481401636 		 Validation Loss: 0.6835181768983603
Epoch 105 		 Training Loss: 0.6287968494201619 		 Validation Loss: 0.6816858210201774
Epoch 106 		 Training Loss: 0.6287958545289227 		 Validation Loss: 0.6842825819871255
Epoch 107 		 Training Loss: 0.6280444303496939 		 Validation Loss: 0.6846056796078172
Epoch 108 		 Training Loss: 0.6278149485588074 		 Validation Loss: 0.6831971196723836
Epoch 109 		 Training Loss: 0.6269932119809628 		 Validatio

Epoch 194 		 Training Loss: 0.6051259284817752 		 Validation Loss: 0.6646934454994542
Epoch 195 		 Training Loss: 0.605181006667511 		 Validation Loss: 0.6669299458818776
Epoch 196 		 Training Loss: 0.6052908944183478 		 Validation Loss: 0.6655975761158126
Epoch 197 		 Training Loss: 0.6049639917817569 		 Validation Loss: 0.6665186794208628
Epoch 198 		 Training Loss: 0.6049323970414013 		 Validation Loss: 0.6686753782310656
Epoch 199 		 Training Loss: 0.6046203255831041 		 Validation Loss: 0.6667208030287709
Epoch 200 		 Training Loss: 0.6044722487543924 		 Validation Loss: 0.6648910867848566


## Backtest

In [7]:
"""
import backtest as bt
from strategy import PretrainedModelStrategy, SignalModelStrategy
from technical_signals import TechnicalSignalSet
from sklearn.svm import SVR

random.shuffle(spy_constituents)

# XXX temporary - need to rework concurrency to be suitable for CUDA
# (Must use `spawn` as opposed to `fork` based concurrency I believe - separate OS processes?)
net = net.to(torch.device('cpu'))

def predict(net):
    return lambda X:\
        net(torch.from_numpy(X).float().cpu()).detach().numpy()

def df_to_signal_set(df):
    return TechnicalSignalSet(df, predict_window=predict_window)

strategy = PretrainedModelStrategy(predict(net), df_to_signal_set, cutoff=0.95, bias=0.2)
#strategy = SignalModelStrategy(SVR(), lambda df: TechnicalSignalSet(df, predict_window=14), cutoff=1., bias=0.1)
bt.comprehensive_backtest(strategy, spy_constituents[:1], "2000-01-01", "2025-01-01", plot=True, train_test_ratio=0.8)
"""

'\nimport backtest as bt\nfrom strategy import PretrainedModelStrategy, SignalModelStrategy\nfrom technical_signals import TechnicalSignalSet\nfrom sklearn.svm import SVR\n\nrandom.shuffle(spy_constituents)\n\n# XXX temporary - need to rework concurrency to be suitable for CUDA\n# (Must use `spawn` as opposed to `fork` based concurrency I believe - separate OS processes?)\nnet = net.to(torch.device(\'cpu\'))\n\ndef predict(net):\n    return lambda X:        net(torch.from_numpy(X).float().cpu()).detach().numpy()\n\ndef df_to_signal_set(df):\n    return TechnicalSignalSet(df, predict_window=predict_window)\n\nstrategy = PretrainedModelStrategy(predict(net), df_to_signal_set, cutoff=0.95, bias=0.2)\n#strategy = SignalModelStrategy(SVR(), lambda df: TechnicalSignalSet(df, predict_window=14), cutoff=1., bias=0.1)\nbt.comprehensive_backtest(strategy, spy_constituents[:1], "2000-01-01", "2025-01-01", plot=True, train_test_ratio=0.8)\n'

## Generate and Store Predictions

In [10]:
from prediction import Prediction, predict_price_change
from predictive_model import PredictiveModel
from datetime import datetime

net = net.to(torch.device('cpu'))

def df_to_signal_set(df):
    return TechnicalSignalSet(df, predict_window=predict_window)

model = PredictiveModel(net, "TorchMATI", predict_window, datetime.now())

predictions = predict_price_change(model, df_to_signal_set, tickers)
ds.save_predictions([p for t, p in predictions.items()])

Exception on VIAC


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on PBCT


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on BRK.B


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on DISCK


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on INFO
Exception on XLNX


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on BF.B


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on WLTW


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on OGN


Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 50, in predict_price_change
    X, y, _ = signals.to_xy()
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/signals.py", line 53, in to_xy
    X = self.X_scaler.fit_transform(X)
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/sklearn/base.py", line 867, in fit_transform
    return self.fit(X, **fit_params).transform(X)
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 809, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 844, in partial_fit
    X = self._validate_data(
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.

Exception on KSU


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r

Exception on DISCA


Traceback (most recent call last):
  File "/home/jared/.local/share/virtualenvs/stock-prediction-MOS0QyR2/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'high'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/jared/workspace/market-diff/stock-prediction/notebooks/../src/prediction.py", line 49, in predict_price_change
    signals = mk_signal_set(candles[ticker])
  File "/tmp/ipykernel_252085/840965209.py", line 8, in df_to_signal_set
    r