## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

# Suppress ta warnings
import warnings
warnings.filterwarnings("ignore")

# Auto reload local files
%load_ext autoreload
%reload_ext autoreload
%autoreload 2
# Make files in src/ available to notebook
import sys
if 'src' not in sys.path:
    sys.path.insert(0, '../src')

In [2]:
# Read SPY csv, define config
spy_constituents = list(pd.read_csv('../../data/spy_constituents.csv', header=0)['Symbol'])
random.shuffle(spy_constituents)

tickers = spy_constituents
start_date = "2000-01-01"
end_date = "2025-01-01"
predict_window = 14

## Sync & Load Data, Create Indicators

In [3]:
# Load the data from db
from sklearn.model_selection import train_test_split

import datastore as ds
from technical_signals import TechnicalSignalSet

#ds.download_daily_candlesticks(tickers, start_date, end_date)
candlesticks = ds.get_daily_candlesticks(tickers, start_date, end_date)

Xs = []
ys = []

for ticker in tickers:
    try:
        technical_sigs = TechnicalSignalSet(candlesticks[ticker], predict_window)
        X, y, Xy_date = technical_sigs.to_xy()
        Xs.append(X)
        ys.append(y)
    except Exception as ex:
        print(f"Exception on {ticker}:")
        print(ex)

X = np.concatenate(Xs, axis=0)
y = np.concatenate(ys, axis=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

Exception on INFO:
'high'
Exception on BF.B:
'high'
Exception on PBCT:
'high'
Exception on VIAC:
'high'
Exception on KSU:
'high'
Exception on WLTW:
'high'
Exception on OGN:
Found array with 0 sample(s) (shape=(0, 58)) while a minimum of 1 is required by RobustScaler.
Exception on DISCA:
'high'
Exception on DISCK:
'high'
Exception on BRK.B:
'high'
Exception on XLNX:
'high'


## Clean Data For Training

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import gc


def round_batch_size(sample_count, approximately, leeway=None):
    """
    Round batch size to a more suitable value. This helps to avoid a
    problem where the final batch has a lot of samples, but not enough for
    a full batch, leading to many samples being thrown out.

    approximately: int, leeway: int
      decide on a chunk size around a number, with specified leeway
      (leeway defaults to `approximately // 10`).
    """
    if leeway is None:
        leeway = approximately // 10
    
    # Get the number of leftover samples if we use the suggested batch size
    best_leftover = sample_count - np.floor(sample_count / approximately) * approximately

    # Brute-force search for the value that yeilds the fewest leftovers
    # within the given leeway range.
    best_chunk_count = approximately
    for offset in range(-leeway, leeway):
        chunk_size = approximately + offset
        leftover = sample_count - np.floor(sample_count / chunk_size) * chunk_size
        if leftover < best_leftover:
            best_leftover = leftover
            best_chunk_count = chunk_size
    return best_chunk_count
            

batch_size = round_batch_size(X_train.shape[0], 1024, leeway=200)
n_features = X_train.shape[1]

# Convert X, y to torch tensors
X_train_tensor = torch.from_numpy(X_train).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_train_tensor = torch.from_numpy(y_train.reshape(y_train.shape[0], 1)).float()
y_test_tensor = torch.from_numpy(y_test.reshape(y_test.shape[0], 1)).float()

print(X_train_tensor.shape)
print('Batch size:', batch_size)

# Generators
training_set = TensorDataset(X_train_tensor, y_train_tensor)
dataloader_train = DataLoader(training_set, shuffle=True, batch_size=batch_size)

validation_set = TensorDataset(X_test_tensor, y_test_tensor)
dataloader_test = DataLoader(validation_set, shuffle=True, batch_size=batch_size)

# Release duplicated memory
try:
    del X
    del y
    del Xs
    del ys
    #del X_train
    #del X_test
    #del y_train
    #del y_test
    del X_train_tensor
    del X_test_tensor
    del y_train_tensor
    del y_test_tensor
except:
    pass
gc.collect()

torch.Size([2059741, 58])
Batch size: 913


0

## Create the NN model

In [5]:
n_outputs = 1

net = nn.Sequential(
    nn.Linear(n_features, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, n_outputs),
)

net = nn.Sequential(
    nn.Linear(n_features, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, n_outputs),
)

## Training

In [6]:
device = torch.device("cuda")

# Set device for model
net = net.to(device)

# Select optimizerand loss criteria
criterion = torch.nn.MSELoss() 
optimizer = torch.optim.Adam(net.parameters(), lr=0.001) 

# Training the model
for epoch in range(200):
    train_loss = 0.0
    
    # Training
    net.train()
    for local_batch, local_labels in dataloader_train:
        if local_batch.shape[0] != batch_size:
            print(f"Wrong train batch size. Skipping batch.\nThrowing away {local_batch.shape[0]} samples.")
            continue
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)
        
        # Forward pass: Compute predicted y by passing x to the model 
        y_pred = net(local_batch)
        # Compute and print loss 
        loss = criterion(y_pred, local_labels)
        # Zero gradients, perform a backward pass, update the weights. 
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step() 
        # Update loss
        train_loss += loss.item()

    # Validation
    net.eval()
    valid_loss = 0.0
    for data, labels in dataloader_test:
        if data.shape[0] != batch_size:
            continue
        data, labels = data.to(device), labels.to(device)
        
        target = net(data)
        loss = criterion(target,labels)
        valid_loss += loss.item()

    print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(dataloader_train)} \t\t Validation Loss: {valid_loss / len(dataloader_test)}')

Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 1 		 Training Loss: 0.905487361739154 		 Validation Loss: 0.890648316814605
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 2 		 Training Loss: 0.8630136547228061 		 Validation Loss: 0.8565230813634348
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 3 		 Training Loss: 0.8409128228964191 		 Validation Loss: 0.8354569428945443
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 4 		 Training Loss: 0.8238397436591973 		 Validation Loss: 0.8322162585429461
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 5 		 Training Loss: 0.8108888373841846 		 Validation Loss: 0.8167028320263107
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 6 		 Training Loss: 0.7976253585156267 		 Validation Loss: 0.8065330488273347
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 7 		 Training Loss: 0.78664519

Epoch 55 		 Training Loss: 0.6305479867087564 		 Validation Loss: 0.6765009801226308
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 56 		 Training Loss: 0.6297204129242717 		 Validation Loss: 0.6777838288075421
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 57 		 Training Loss: 0.6284439208685005 		 Validation Loss: 0.6770082579190988
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 58 		 Training Loss: 0.6283782139044184 		 Validation Loss: 0.675219465774369
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 59 		 Training Loss: 0.627055182729509 		 Validation Loss: 0.6731568707887869
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 60 		 Training Loss: 0.6255903462419269 		 Validation Loss: 0.672219385426358
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 61 		 Training Loss: 0.6248421853867896 		 Validation Loss: 0.6766640669796097
Wrong train 

Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 110 		 Training Loss: 0.5959522685594775 		 Validation Loss: 0.6526632772023935
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 111 		 Training Loss: 0.595579131137073 		 Validation Loss: 0.6552325059693173
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 112 		 Training Loss: 0.5954331194319237 		 Validation Loss: 0.6542397668637128
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 113 		 Training Loss: 0.5948071575133105 		 Validation Loss: 0.6537913729945027
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 114 		 Training Loss: 0.5938852052668528 		 Validation Loss: 0.6530044164315638
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 115 		 Training Loss: 0.5943718254434148 		 Validation Loss: 0.6534313060847886
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 116 		 Training L

Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 164 		 Training Loss: 0.5801159937824356 		 Validation Loss: 0.6431497473165808
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 165 		 Training Loss: 0.5798867264573533 		 Validation Loss: 0.6437022897351785
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 166 		 Training Loss: 0.580049793409678 		 Validation Loss: 0.6441137102020689
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 167 		 Training Loss: 0.5797355563095516 		 Validation Loss: 0.6478765540388951
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 168 		 Training Loss: 0.5799113126998989 		 Validation Loss: 0.6416987023980494
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 169 		 Training Loss: 0.5791419286959246 		 Validation Loss: 0.6426783163708994
Wrong train batch size. Skipping batch.
Throwing away 13 samples.
Epoch 170 		 Training L

## Backtest

In [7]:
"""
import backtest as bt
from strategy import PretrainedModelStrategy, SignalModelStrategy
from technical_signals import TechnicalSignalSet
from sklearn.svm import SVR

random.shuffle(spy_constituents)

# XXX temporary - need to rework concurrency to be suitable for CUDA
# (Must use `spawn` as opposed to `fork` based concurrency I believe - separate OS processes?)
net = net.to(torch.device('cpu'))

def predict(net):
    return lambda X:\
        net(torch.from_numpy(X).float().cpu()).detach().numpy()

def df_to_signal_set(df):
    return TechnicalSignalSet(df, predict_window=predict_window)

strategy = PretrainedModelStrategy(predict(net), df_to_signal_set, cutoff=0.95, bias=0.2)
#strategy = SignalModelStrategy(SVR(), lambda df: TechnicalSignalSet(df, predict_window=14), cutoff=1., bias=0.1)
bt.comprehensive_backtest(strategy, spy_constituents[:1], "2000-01-01", "2025-01-01", plot=True, train_test_ratio=0.8)
"""

'\nimport backtest as bt\nfrom strategy import PretrainedModelStrategy, SignalModelStrategy\nfrom technical_signals import TechnicalSignalSet\nfrom sklearn.svm import SVR\n\nrandom.shuffle(spy_constituents)\n\n# XXX temporary - need to rework concurrency to be suitable for CUDA\n# (Must use `spawn` as opposed to `fork` based concurrency I believe - separate OS processes?)\nnet = net.to(torch.device(\'cpu\'))\n\ndef predict(net):\n    return lambda X:        net(torch.from_numpy(X).float().cpu()).detach().numpy()\n\ndef df_to_signal_set(df):\n    return TechnicalSignalSet(df, predict_window=predict_window)\n\nstrategy = PretrainedModelStrategy(predict(net), df_to_signal_set, cutoff=0.95, bias=0.2)\n#strategy = SignalModelStrategy(SVR(), lambda df: TechnicalSignalSet(df, predict_window=14), cutoff=1., bias=0.1)\nbt.comprehensive_backtest(strategy, spy_constituents[:1], "2000-01-01", "2025-01-01", plot=True, train_test_ratio=0.8)\n'

## Generate and Store Predictions

In [10]:
from prediction import Prediction
from predict import predict_price_change
from predictive_model import PredictiveModel
from datetime import datetime

net = net.to(torch.device('cpu'))

def df_to_signal_set(df):
    return TechnicalSignalSet(df, predict_window=predict_window)

model = PredictiveModel(net, "TorchMATI", predict_window, datetime.now())

predictions = predict_price_change(model, df_to_signal_set, tickers[:10])
ds.save_predictions([p for t, p in predictions.items()])

## Save Model

In [9]:
model.save()