In [28]:
import torch
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset, Dataset
import pandas as pd
import numpy as np


In [29]:
def test_train_split(data, train_ratio = 0.8):
    train_size = int(len(data) * train_ratio)
    train_dataset = Subset(data, range(train_size))
    test_dataset  = Subset(data, range(train_size, len(data)))
    return train_dataset, test_dataset

In [30]:
class StockDataset(Dataset):
    def __init__(self, csv_file, window_size=30):
        self.window_size = window_size

        df = pd.read_csv(csv_file)
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"])
            df = df.sort_values(["ticker", "date"]).reset_index(drop=True)

        num_cols = ["open", "high", "low", "close", "volume"]

        # --- Build a dense ID map: PAD=0, OOV=1, tickers start at 2
        tickers = df["ticker"].astype(str).unique().tolist()
        PAD_ID, OOV_ID = 0, 1
        sym2id = {s: i+2 for i, s in enumerate(tickers)}
        self.PAD_ID, self.OOV_ID = PAD_ID, OOV_ID
        self.num_embeddings = len(sym2id) + 2

        df["ticker_id"] = df["ticker"].astype(str).map(lambda s: sym2id.get(s, OOV_ID)).astype("int64")

        # --- Normalize numeric features
        X_num = df[num_cols].astype("float32").copy()
        X_num = (X_num
                 .replace([float("inf"), -float("inf")], np.nan)
                 .fillna(method="ffill")
                 .fillna(method="bfill")
                 .fillna(0.0))
        self.mean = X_num.values.mean(axis=0)
        self.std  = X_num.values.std(axis=0)
        self.std[self.std < 1e-8] = 1e-8
        Xz = (X_num.values - self.mean) / self.std

        # --- Keep ticker ids SEPARATE (int64). Do NOT concatenate into float features.
        self.Xz = Xz.astype("float32")
        self.ticker_ids = df["ticker_id"].values.astype("int64")

        # --- Build per-ticker index ranges so windows don’t cross tickers
        self.rows_by_ticker = df.groupby("ticker", sort=False).indices
        self.sample_index = []  # (start_row, end_row] per window, per ticker
        for _, idxs in self.rows_by_ticker.items():
            start, end = idxs.min(), idxs.max()
            # windows entirely inside [start..end]
            for i in range(start, end - window_size + 1):
                self.sample_index.append(i)
        self.target_col_idx = num_cols.index("close")

    def __len__(self):
        return len(self.sample_index)

    def __getitem__(self, k):
        i = self.sample_index[k]
        w = self.window_size
        X = self.Xz[i:i+w, :]                                # (W, 5)
        y = self.Xz[i+w-1, self.target_col_idx].astype("float32")  # predict last step’s normalized close
        ticker_id = int(self.ticker_ids[i+w-1])              # take the same row as y

        return (
            torch.tensor(X, dtype=torch.float32),            # (W, 5)
            torch.tensor([y], dtype=torch.float32),          # (1,)
            torch.tensor(ticker_id, dtype=torch.long),       # ()
        )


In [31]:
stock_dataset = StockDataset("./data/S&P500/all_stocks_5yr.csv", 30)
batch_size = 64

train_dataset, test_dataset = test_train_split(stock_dataset)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
print(f"Total Samples: {len(stock_dataset)}   Train Samples: {len(train_loader)} Test Samples: {len(val_loader)}")


Total Samples: 603890   Train Samples: 7549 Test Samples: 7549


  .fillna(method="ffill")
  .fillna(method="bfill")


In [32]:
def display_results(predictions, stddev, mean, range=30):
    for prediction, expected in predictions:
        prediction = prediction * stddev + mean
        expected = expected * stddev + mean
        print(f"PREDICTION: {prediction}    EXPECTED: {expected}")


In [33]:
class LSTMReg(nn.Module):
    def __init__(self, in_feat, hidden_size=64, layers=2, num_tickers=1000, emb_dim=16):
        super().__init__()
        self.ticker_emb = nn.Embedding(num_tickers, emb_dim)
        self.lstm = nn.LSTM(input_size=in_feat + emb_dim, hidden_size=hidden_size, num_layers=layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, ticker_ids):
        # x: (B, W, in_feat), ticker_ids: (B,)
        emb = self.ticker_emb(ticker_ids)           # (B, emb_dim)
        emb = emb.unsqueeze(1).expand(-1, x.size(1), -1)  # (B, W, emb_dim)
        x = torch.cat([x, emb], dim=-1)             # concat ticker embedding to each timestep
        out, _ = self.lstm(x)
        last_out = out[:, -1, :]
        return self.fc(last_out)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMReg(in_feat=5, hidden_size=64, layers=1).to(device)


In [34]:
criterion = nn.SmoothL1Loss()
learning_rate = 0.001
weight_decay = 0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=3
)

In [35]:
def run_epoch(loader, model, criterion, optimizer=None, device="cpu"):
    is_train = optimizer is not None
    model.train(is_train)

    total_loss, n = 0.0, 0
    predictions = []

    ctx = torch.enable_grad() if is_train else torch.no_grad()
    with ctx:
        for X, y, ticker in loader:
            X = X.to(device, non_blocking=True)           # (B, W, F_num)
            y = y.to(device, non_blocking=True)           # (B, 1)
            ticker = ticker.to(device, non_blocking=True) # (B,) int64 for Embedding

           
            pred = model(X, ticker)

            loss = criterion(pred, y)

            if is_train:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            # Accounting
            bs = X.size(0)
            total_loss += loss.item() * bs
            n += bs

            # Collect predictions (detach -> cpu -> list/np)
            predictions.append(pred.detach().cpu())

            # If you want targets too:
            # targets.append(y.detach().cpu())

    # Stack to a single tensor (shape: (N, 1))
    predictions = torch.cat(predictions, dim=0)
    # targets = torch.cat(targets, dim=0) if targets else None

    return total_loss / max(n, 1), predictions


In [36]:
epochs = 20
best_val = float('inf')
patience, bad_epochs = 5, 0
best_state = None



for epoch in range(1, epochs+1):
    train_loss, _ = run_epoch(train_loader, model, criterion, optimizer, device)
    val_loss, _   = run_epoch(val_loader,   model, criterion, None,      device)

    if scheduler is not None:
        scheduler.step(val_loss)

    if val_loss < best_val - 1e-6:
        best_val = val_loss
        best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
        bad_epochs = 0
    else:
        bad_epochs += 1

    print(f"Epoch {epoch:02d} | train {train_loss:.4f} | val {val_loss:.4f}")

    if bad_epochs >= patience:
        print("Early stopping.")
        break

if best_state is not None:
    model.load_state_dict(best_state)
    model.to(device)

Epoch 01 | train 0.0130 | val 0.0132
Epoch 02 | train 0.0021 | val 0.0048
Epoch 03 | train 0.0010 | val 0.0029
Epoch 04 | train 0.0008 | val 0.0032
Epoch 05 | train 0.0013 | val 0.0029
Epoch 06 | train 0.0015 | val 0.0035
Epoch 07 | train 0.0004 | val 0.0025
Epoch 08 | train 0.0003 | val 0.0023
Epoch 09 | train 0.0006 | val 0.0056
Epoch 10 | train 0.0005 | val 0.0044
Epoch 11 | train 0.0011 | val 0.0030
Epoch 12 | train 0.0003 | val 0.0041
Epoch 13 | train 0.0003 | val 0.0024
Early stopping.


In [37]:
torch.save(model.state_dict(), "model.pt")


In [38]:
import pandas as pd
df = pd.read_csv("./data/TESTDATA/top_10_stock_data.csv")
feature_order = ["open", "high", "low", "close", "volume", "ticker"]
df = df[feature_order]
print(df.head())
test_dataset = StockDataset("./data/TESTDATA/top_10_stock_data.csv", 30)
test_loader = DataLoader(test_dataset)
batch_size = 64


        open       high        low      close     volume ticker
0  27.847500  27.860001  26.837500  27.332500  212818400   AAPL
1  27.072500  27.162500  26.352501  26.562500  257142000   AAPL
2  26.635000  26.857500  26.157499  26.565001  263188400   AAPL
3  26.799999  27.049999  26.674999  26.937500  160423600   AAPL
4  27.307501  28.037500  27.174999  27.972500  237458000   AAPL


  .fillna(method="ffill")
  .fillna(method="bfill")


In [39]:
state = torch.load("model.pt", weights_only=True)
test_model = LSTMReg(in_feat=5, hidden_size=64, layers=1).to(device)
test_model.load_state_dict(state)
test_model.eval()
with torch.no_grad():
    test_loss, predictions = run_epoch(test_loader, test_model, criterion, None, device)
print("Test loss:", test_loss)




Test loss: 0.0025987274087182844


In [40]:
display_results(predictions, test_dataset.std[3], test_dataset.mean[3])


ValueError: not enough values to unpack (expected 2, got 1)