In [2]:
import yahoo_fin.stock_info as si
import pandas as pd

In [51]:
start_date = "01/01/2011"
end_date = "30/12/2023"
window = 20

df = si.get_data("aapl", start_date=start_date, end_date=end_date)[["close"]]
train = pd.concat([df] + [df.shift(i) for i in range(1, window+1)], axis=1)[window:]
train.columns = ["target"] + ["lag_{}".format(i) for i in range(1, window+1)]
train = train.iloc[:, ::-1]
train

Unnamed: 0,lag_20,lag_19,lag_18,lag_17,lag_16,lag_15,lag_14,lag_13,lag_12,lag_11,...,lag_9,lag_8,lag_7,lag_6,lag_5,lag_4,lag_3,lag_2,lag_1,target
2011-02-01,11.770357,11.831786,11.928571,11.918929,12.004286,12.230357,12.201429,12.300714,12.345714,12.445714,...,12.101429,11.881429,11.668571,12.051786,12.192857,12.280357,12.257500,12.003571,12.118571,12.322500
2011-02-02,11.831786,11.928571,11.918929,12.004286,12.230357,12.201429,12.300714,12.345714,12.445714,12.166071,...,11.881429,11.668571,12.051786,12.192857,12.280357,12.257500,12.003571,12.118571,12.322500,12.297143
2011-02-03,11.928571,11.918929,12.004286,12.230357,12.201429,12.300714,12.345714,12.445714,12.166071,12.101429,...,11.668571,12.051786,12.192857,12.280357,12.257500,12.003571,12.118571,12.322500,12.297143,12.265714
2011-02-04,11.918929,12.004286,12.230357,12.201429,12.300714,12.345714,12.445714,12.166071,12.101429,11.881429,...,12.051786,12.192857,12.280357,12.257500,12.003571,12.118571,12.322500,12.297143,12.265714,12.375000
2011-02-07,12.004286,12.230357,12.201429,12.300714,12.345714,12.445714,12.166071,12.101429,11.881429,11.668571,...,12.192857,12.280357,12.257500,12.003571,12.118571,12.322500,12.297143,12.265714,12.375000,12.567143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,189.970001,189.789993,190.399994,189.369995,189.949997,191.240005,189.429993,193.419998,192.320007,194.270004,...,193.179993,194.710007,197.960007,198.110001,197.570007,195.889999,196.940002,194.830002,194.679993,193.600006
2023-12-26,189.789993,190.399994,189.369995,189.949997,191.240005,189.429993,193.419998,192.320007,194.270004,195.710007,...,194.710007,197.960007,198.110001,197.570007,195.889999,196.940002,194.830002,194.679993,193.600006,193.050003
2023-12-27,190.399994,189.369995,189.949997,191.240005,189.429993,193.419998,192.320007,194.270004,195.710007,193.179993,...,197.960007,198.110001,197.570007,195.889999,196.940002,194.830002,194.679993,193.600006,193.050003,193.149994
2023-12-28,189.369995,189.949997,191.240005,189.429993,193.419998,192.320007,194.270004,195.710007,193.179993,194.710007,...,198.110001,197.570007,195.889999,196.940002,194.830002,194.679993,193.600006,193.050003,193.149994,193.580002


In [52]:
from sklearn.preprocessing import MinMaxScaler


def get_X_y(train):
    y = train["target"]
    X = train.drop(["target"], axis=1)
    return X, y

def split_data(train, test_size=0.2):
    n = int(len(train) * (1 - test_size))
    X_train, y_train = get_X_y(train[:n])
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    
    X_test, y_test = get_X_y(train[n:])
    X_test = scaler.transform(X_test)
    
    return (X_train, y_train, X_test, y_test), scaler

(X_train, y_train, X_test, y_test), scaler = split_data(train)

In [53]:
from torch.utils.data.dataset import TensorDataset
from torch.utils.data import DataLoader
import torch

batch_size = 32

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32).unsqueeze(2), torch.tensor(y_train, dtype=torch.float32).flatten())
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32).unsqueeze(2), torch.tensor(y_test, dtype=torch.float32).flatten())

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32).unsqueeze(2), torch.tensor(y_train, dtype=torch.float32).flatten())
  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32).unsqueeze(2), torch.tensor(y_test, dtype=torch.float32).flatten())


In [54]:
assert train_dataset.tensors[0].shape[1] == window

In [55]:
import torch.nn as nn

window = 5
hidden_dim = 64
output_dim = 1
num_layers = 2


class StockLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(StockLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        
        return out


model = StockLSTM(input_dim=1, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)

loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

In [62]:
len(train_loader)

82

In [63]:
from tqdm import tqdm
import numpy as np

num_epochs = 100

history = np.zeros((num_epochs, len(train_loader)))

pbar = tqdm(enumerate(train_loader))

for t in range(num_epochs):
    for i, (X_batch, y_batch) in enumerate(train_loader):
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        if i % 10 == 0 and i !=0:
            pbar.set_description(f"Epoch {t}, MSE: {loss.item():.4f}")
        history[t, i] = loss.item()

Epoch 2, MSE: 1686.2128: : 0it [00:53, ?it/s]
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 0, MSE: 489.4341: : 0it [00:00, ?it/s]

In [46]:
model.eval()

losses = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_test_pred = model(X_batch)
        loss = loss_fn(y_test_pred, y_batch)
        losses.append(loss.item())

print(f"Test MSE: {np.mean(losses):.4f}")


Test MSE: 1730.1437


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
