In [1]:
import os
import numpy as np 
import pandas as pd 
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime 
from pathlib import Path
from Preprocessing_functions import *

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize

In [3]:

ticker = "SPY"

# LOAD DF FOR MODEL BUILDING 
FILE_PATH = f"Data/{ticker}/df/"
print("DataFrames for model building: ", os.listdir(FILE_PATH))
idx = 0 if len(os.listdir(FILE_PATH)) < 3 else int(input("Select file index: "))
DF_NAME = os.listdir(FILE_PATH)[idx] 
FILE_PATH_NAME = FILE_PATH + DF_NAME

df_model = pd.read_parquet(FILE_PATH_NAME)
df_model = format_idx_date(df_model)
df_model.head()

DataFrames for model building:  ['df_SPY_k3_202402012133.parquet', 'Junk']


Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-15,0,1.363875,1.293933,0.380295,-0.192341,-0.070909,-1.483421,1.553229,0.0,176613900,...,-1.79,-2.56,-3.34,-3.17,-4.2,1.12,5.16,1.69,7.28,16.5
2015-01-16,1,0.112333,-1.43668,-0.124004,-1.531276,-1.550755,-0.09317,1.61882,0.0,211879600,...,-0.23,-0.5,-1.28,-1.85,-2.96,0.47,4.47,2.98,8.67,17.21
2015-01-20,2,1.103176,0.168379,0.384683,-0.156767,-0.945224,-0.324638,1.257971,0.0,130991100,...,0.6,-0.01,-0.29,0.17,-3.06,-1.74,5.44,3.64,9.06,17.61
2015-01-21,1,0.274116,-0.787356,-0.279167,-1.073137,-1.06439,-0.282747,1.332949,0.0,122942700,...,2.04,1.11,0.49,1.64,-2.7,-1.66,4.76,4.15,9.41,16.66
2015-01-22,1,0.812306,-1.036986,0.445576,-1.111879,-1.864436,-0.07407,1.903025,0.0,174356000,...,2.22,3.56,2.61,1.88,-0.72,-0.66,5.5,7.83,12.01,16.95


In [4]:
df_model = df_model.sort_index(ascending = False)
df_model.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-02-01,1,0.171265,-0.538555,0.362409,-0.744898,-0.711038,-0.204817,0.909389,0.0,59327438,...,-0.82,-0.03,-0.16,2.26,2.24,3.94,12.54,10.1,19.31,21.15
2024-01-31,0,1.17883,1.174737,-0.462425,-0.094143,-0.004142,-1.267686,1.271776,0.0,126011100,...,-0.93,-1.06,-0.52,2.24,1.9,2.16,12.55,9.15,18.65,18.42
2024-01-30,2,0.091732,-0.06727,-0.144523,-0.21608,-0.159148,-0.148489,0.307148,0.0,58618400,...,0.59,1.13,1.24,3.36,3.43,3.28,16.61,10.22,20.46,20.77
2024-01-29,1,0.114818,-0.725811,0.065653,-0.756566,-0.841595,-0.030524,0.864841,0.0,61322800,...,1.21,1.32,1.62,3.06,4.99,3.06,17.95,9.83,20.34,20.81
2024-01-26,2,0.215345,0.036916,-0.090158,-0.313788,-0.178814,-0.349607,0.527478,0.0,76606300,...,0.53,0.82,1.03,2.32,4.31,2.29,17.76,9.92,19.96,21.27


In [5]:
cols = list(df_model.columns)
model_feat = pd.DataFrame(cols)
df_model = min_max_scaling(df_model)
df_model.columns = cols
df_model.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-02-01,1,0.02961,0.407241,0.655745,0.885707,0.915083,0.970559,0.094273,0.0,0.080204,...,0.395427,0.542258,0.504104,0.607033,0.542382,0.646134,0.610603,0.458155,0.477058,0.442941
2024-01-31,0,0.203809,0.571033,0.605724,0.985555,0.999505,0.817779,0.137757,0.0,0.217139,...,0.391728,0.509032,0.493915,0.606557,0.536171,0.613208,0.610746,0.445413,0.469813,0.414307
2024-01-30,2,0.01586,0.452296,0.625003,0.966846,0.980993,0.978656,0.022009,0.0,0.078748,...,0.442838,0.579677,0.543731,0.633167,0.564121,0.633925,0.668763,0.459764,0.489682,0.438955
2024-01-29,1,0.019851,0.389339,0.637749,0.883916,0.899491,0.995612,0.088928,0.0,0.084302,...,0.463685,0.585806,0.554486,0.626039,0.59262,0.629856,0.687911,0.454533,0.488364,0.439375
2024-01-26,2,0.037231,0.462256,0.6283,0.951854,0.978645,0.949746,0.048447,0.0,0.115686,...,0.44082,0.569677,0.537787,0.608458,0.580197,0.615612,0.685196,0.45574,0.484193,0.4442


In [6]:
#### TO SHIFT THE TARGET 
df_model['labels'] = df_model['labels'].shift(1)
df_model = df_model.dropna()
df_model['labels'] = df_model['labels'].replace(0,2)
df_model.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-31,1.0,0.203809,0.571033,0.605724,0.985555,0.999505,0.817779,0.137757,0.0,0.217139,...,0.391728,0.509032,0.493915,0.606557,0.536171,0.613208,0.610746,0.445413,0.469813,0.414307
2024-01-30,2.0,0.01586,0.452296,0.625003,0.966846,0.980993,0.978656,0.022009,0.0,0.078748,...,0.442838,0.579677,0.543731,0.633167,0.564121,0.633925,0.668763,0.459764,0.489682,0.438955
2024-01-29,2.0,0.019851,0.389339,0.637749,0.883916,0.899491,0.995612,0.088928,0.0,0.084302,...,0.463685,0.585806,0.554486,0.626039,0.59262,0.629856,0.687911,0.454533,0.488364,0.439375
2024-01-26,1.0,0.037231,0.462256,0.6283,0.951854,0.978645,0.949746,0.048447,0.0,0.115686,...,0.44082,0.569677,0.537787,0.608458,0.580197,0.615612,0.685196,0.45574,0.484193,0.4442
2024-01-25,2.0,0.077655,0.449904,0.661129,0.977028,0.935045,0.991758,0.056907,0.0,0.107305,...,0.454943,0.580645,0.57713,0.610596,0.576361,0.621347,0.707488,0.450912,0.480022,0.44871


In [7]:


# ----------------------
# Step 1: Data Preparation
# ----------------------
X = df_model.drop(columns=["labels"])
y = df_model["labels"]

# Encode class labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
classes = le.classes_
print('Number of classes: ', classes)

# Binarize for ROC later
y_binarized = label_binarize(y_encoded, classes=np.arange(len(classes)))

# Train/val/test splits
X_temp, X_test, y_temp, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert to torch tensors (no batches!)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

Number of classes:  [1. 2.]


In [8]:

# ----------------------
# Step 2: Neural Net Definition
# ----------------------
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )
        
    def forward(self, x):
        return self.net(x)



class DeepNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.net(x)

input_dim = X.shape[1]
print('Input dims: ', input_dim)
output_dim = len(classes)
model = DeepNN(input_dim, output_dim)

Input dims:  21


In [9]:
best_val_loss = float("inf")
best_model_state = None
epochs = int(1e3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor)

    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        best_model_state = model.state_dict()  # Save best model

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


Epoch 10/1000, Loss: 0.6358, Val Loss: 0.6339
Epoch 20/1000, Loss: 0.5964, Val Loss: 0.6045
Epoch 30/1000, Loss: 0.5748, Val Loss: 0.6065
Epoch 40/1000, Loss: 0.5648, Val Loss: 0.6032
Epoch 50/1000, Loss: 0.5527, Val Loss: 0.6047
Epoch 60/1000, Loss: 0.5381, Val Loss: 0.6131
Epoch 70/1000, Loss: 0.5204, Val Loss: 0.6272
Epoch 80/1000, Loss: 0.4975, Val Loss: 0.6437
Epoch 90/1000, Loss: 0.4674, Val Loss: 0.6669
Epoch 100/1000, Loss: 0.4281, Val Loss: 0.6993
Epoch 110/1000, Loss: 0.3824, Val Loss: 0.7570
Epoch 120/1000, Loss: 0.3361, Val Loss: 0.8340
Epoch 130/1000, Loss: 0.2913, Val Loss: 0.9205
Epoch 140/1000, Loss: 0.2485, Val Loss: 1.0118
Epoch 150/1000, Loss: 0.2092, Val Loss: 1.1085
Epoch 160/1000, Loss: 0.1792, Val Loss: 1.2265
Epoch 170/1000, Loss: 0.1488, Val Loss: 1.3261
Epoch 180/1000, Loss: 0.1265, Val Loss: 1.4246
Epoch 190/1000, Loss: 0.1070, Val Loss: 1.5248
Epoch 200/1000, Loss: 0.0900, Val Loss: 1.6357
Epoch 210/1000, Loss: 0.0758, Val Loss: 1.7482
Epoch 220/1000, Loss: 

In [10]:
# Load best model before evaluation
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\n✅ Loaded best model with validation loss: {best_val_loss:.4f}")


✅ Loaded best model with validation loss: 0.6011


In [11]:

# ----------------------
# Step 4: Evaluation
# ----------------------
def evaluate_model(model, X_tensor, y_true, dataset_name="Test"):
    model.eval()
    with torch.no_grad():
        logits = model(X_tensor)
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1).numpy()
        probs_np = probs.numpy()
        
    print(f"\n--- {dataset_name} Performance ---")
    print("Accuracy:", accuracy_score(y_true, preds))
    print("Precision (macro):", precision_score(y_true, preds, average="macro"))
    print("Recall (macro):", recall_score(y_true, preds, average="macro"))
    print("F1 Score (macro):", f1_score(y_true, preds, average="macro"))
    
    try:
        y_binarized = label_binarize(y_true, classes=np.arange(len(classes)))
        roc_auc = roc_auc_score(y_binarized, probs_np, average="macro", multi_class="ovr")
        print("ROC AUC (macro):", roc_auc)
    except:
        print("ROC AUC not available.")

    print("Classification Report:")
    print(classification_report(y_true, preds))

evaluate_model(model, X_val_tensor, y_val, "Validation")


--- Validation Performance ---
Accuracy: 0.6087912087912087
Precision (macro): 0.509891456582633
Recall (macro): 0.5093181603995853
F1 Score (macro): 0.5091515151515151
ROC AUC not available.
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.27      0.29       131
           1       0.72      0.74      0.73       324

    accuracy                           0.61       455
   macro avg       0.51      0.51      0.51       455
weighted avg       0.60      0.61      0.60       455



In [12]:
evaluate_model(model, X_test_tensor, y_test, "Test")


--- Test Performance ---
Accuracy: 0.6228070175438597
Precision (macro): 0.500875857910002
Recall (macro): 0.5007163828537874
F1 Score (macro): 0.49707602339181284
ROC AUC not available.
Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.21      0.25       131
           1       0.71      0.79      0.75       325

    accuracy                           0.62       456
   macro avg       0.50      0.50      0.50       456
weighted avg       0.59      0.62      0.60       456



In [13]:
# Save
torch.save(model.state_dict(), "NN_models/simple_nn_multiclass.pt")

# Load later
# model = SimpleNN(input_dim, output_dim)
# model.load_state_dict(torch.load("simple_nn_multiclass.pt"))
# model.eval()

In [14]:
os.getcwd()

'c:\\Users\\User\\Documents\\ATS_Development\\Strat_1'