In [121]:
import torch
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset, Dataset
import pandas as pd


In [122]:
def test_train_split(data, train_ratio = 0.8):
    train_size = int(len(data) * train_ratio)
    train_dataset = Subset(data, range(train_size))
    test_dataset  = Subset(data, range(train_size, len(data)))
    return train_dataset, test_dataset

In [123]:
class StockDataset(Dataset):
    def __init__(self, csv_file, window_size=30):
        self.data = pd.read_csv(csv_file)

        # Keep only needed numeric cols
        cols = ["open", "high", "low", "close", "volume"]
        df = self.data[cols].astype("float32")

        # Handle missing values
        df = df.replace([float("inf"), -float("inf")], pd.NA)
        df = df.fillna(method="ffill").fillna(method="bfill")  # forward/back fill
        df = df.fillna(0)                                      # final fallback

        self.window_size = window_size
        X = df.values

        # Standardize safely
        self.mean = X.mean(axis=0)
        self.std  = X.std(axis=0)
        eps = 1e-8
        self.std[self.std < eps] = eps
        self.features = (X - self.mean) / self.std

    def __len__(self):
        return len(self.features) - self.window_size

    def __getitem__(self, idx):
        X = self.features[idx:idx+self.window_size]        # (W, F)
        y = self.features[idx+self.window_size][3]         # scaled 'close'
        return torch.tensor(X, dtype=torch.float32), torch.tensor([y], dtype=torch.float32)  # (1,)


In [145]:
stock_dataset = StockDataset("./data/S&P500/all_stocks_5yr.csv", 30)
batch_size = 64

train_dataset, test_dataset = test_train_split(stock_dataset)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
print(f"Total Samples: {len(stock_dataset)}   Train Samples: {len(train_loader)} Test Samples: {len(val_loader)}")

Total Samples: 619010   Train Samples: 7738 Test Samples: 7738


  df = df.fillna(method="ffill").fillna(method="bfill")  # forward/back fill


In [None]:
def display_results(predictions, stddev, mean, range=30):
    for prediction, expected in predictions:
        prediction = prediction * stddev + mean
        expected = expected * stddev + mean
        print(f"PREDICTION: {prediction}    EXPECTED: {expected}")


In [125]:
class LSTMReg(nn.Module):
    def __init__(self, in_feat, hidden_size=64, layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=in_feat, hidden_size=hidden_size, num_layers=layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        last_out = out[:, -1, :]
        return self.fc(last_out)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMReg(in_feat=5, hidden_size=64, layers=1).to(device)


In [None]:
criterion = nn.SmoothL1Loss()
learning_rate = 0.001
weight_decay = 0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=3
)

In [191]:
def run_epoch(loader, model, criterion, optimizer=None, device="cpu"):
    is_train = optimizer is not None
    model.train(is_train)
    total_loss, n = 0.0, 0
    with torch.set_grad_enabled(is_train):
        predictions = []
        for X, y in loader:
            X, y = X.to(device), y.to(device)     
            pred = model(X)
            predictions.append((float(pred), float(y)))
            loss = criterion(pred, y)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  
                optimizer.step()

            bs = X.size(0)
            total_loss += loss.item() * bs
            n += bs
    return total_loss / n, predictions

In [None]:
epochs = 10
best_val = float('inf')
patience, bad_epochs = 5, 0
best_state = None



for epoch in range(1, epochs+1):
    train_loss, _ = run_epoch(train_loader, model, criterion, optimizer, device)
    val_loss, _   = run_epoch(val_loader,   model, criterion, None,      device)

    if scheduler is not None:
        scheduler.step(val_loss)

    if val_loss < best_val - 1e-6:
        best_val = val_loss
        best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
        bad_epochs = 0
    else:
        bad_epochs += 1

    print(f"Epoch {epoch:02d} | train {train_loss:.4f} | val {val_loss:.4f}")

    if bad_epochs >= patience:
        print("Early stopping.")
        break

if best_state is not None:
    model.load_state_dict(best_state)
    model.to(device)

Epoch 01 | train 0.0128 | val 0.0053
Epoch 02 | train 0.0028 | val 0.0033
Epoch 03 | train 0.0024 | val 0.0026
Epoch 04 | train 0.0015 | val 0.0021
Epoch 05 | train 0.0014 | val 0.0034
Epoch 06 | train 0.0014 | val 0.0029
Epoch 07 | train 0.0012 | val 0.0044
Epoch 08 | train 0.0023 | val 0.0026
Epoch 09 | train 0.0030 | val 0.0016
Epoch 10 | train 0.0011 | val 0.0029


In [132]:
torch.save(model.state_dict(), "model.pt")


In [195]:
import pandas as pd
df = pd.read_csv("./data/TESTDATA/top_10_stock_data.csv")
feature_order = ["open", "high", "low", "close", "volume"]
df = df[feature_order]
print(df.head())
test_dataset = StockDataset("./data/TESTDATA/top_10_stock_data.csv", 30)
test_loader = DataLoader(test_dataset)
batch_size = 64


        open       high        low      close     volume
0  27.847500  27.860001  26.837500  27.332500  212818400
1  27.072500  27.162500  26.352501  26.562500  257142000
2  26.635000  26.857500  26.157499  26.565001  263188400
3  26.799999  27.049999  26.674999  26.937500  160423600
4  27.307501  28.037500  27.174999  27.972500  237458000


  df = df.fillna(method="ffill").fillna(method="bfill")  # forward/back fill


In [None]:
state = torch.load("model.pt", weights_only=True)
test_model = LSTMReg(in_feat=5, hidden_size=64, layers=1).to(device)
test_model.load_state_dict(state)
test_model.eval()
with torch.no_grad():
    test_loss, predictions = run_epoch(test_loader, test_model, criterion, None, device)
print("Test loss:", test_loss)




Test loss: 0.0015559451837915477


In [None]:
display_results(predictions, test_dataset.std[2], test_dataset.mean[2])


PREDICTION: 30.882972717285156    EXPECTED: 32.25697326660156
PREDICTION: 31.13243865966797    EXPECTED: 32.481788635253906
PREDICTION: 31.306671142578125    EXPECTED: 32.41358947753906
PREDICTION: 31.431121826171875    EXPECTED: 32.678810119628906
PREDICTION: 31.570327758789062    EXPECTED: 33.562904357910156
PREDICTION: 32.127464294433594    EXPECTED: 33.353248596191406
PREDICTION: 32.34880828857422    EXPECTED: 32.49945831298828
PREDICTION: 31.90094757080078    EXPECTED: 32.91120147705078
PREDICTION: 31.820465087890625    EXPECTED: 32.416114807128906
PREDICTION: 31.70783233642578    EXPECTED: 32.57524108886719
PREDICTION: 31.739151000976562    EXPECTED: 32.64344024658203
PREDICTION: 31.741455078125    EXPECTED: 32.436309814453125
PREDICTION: 31.71141815185547    EXPECTED: 31.898277282714844
PREDICTION: 31.354263305664062    EXPECTED: 31.946273803710938
PREDICTION: 31.359283447265625    EXPECTED: 32.08268737792969
PREDICTION: 31.37476348876953    EXPECTED: 31.418350219726562
PREDICTI