In [33]:
import yfinance as yf
import datetime
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import torch
from src.model import Transformer
from torch.utils.data import DataLoader, Dataset, TensorDataset
import numpy as np
from src.CNN import CNN
import torch.nn.functional as F

In [2]:
ticker = "AAPL"
years = 40

In [3]:
end_date = datetime.date.today().strftime("%Y-%m-%d")
start_date = (datetime.date.today() - datetime.timedelta(days=years*365)).strftime("%Y-%m-%d")

In [4]:
data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


# EDA

### Get sample of data


In [5]:
data.head()

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1985-02-11,0.104851,0.10571,0.104851,0.104851,346953600
1985-02-12,0.102273,0.105281,0.102273,0.104851,226508800
1985-02-13,0.097545,0.102273,0.097545,0.102273,527027200
1985-02-14,0.094967,0.098405,0.094967,0.097545,426832000
1985-02-15,0.096256,0.096686,0.094108,0.094967,173622400


In [6]:
### Shape of data

In [7]:
data.shape

(10071, 5)

In [8]:
### Get summary of data and check for nulls

In [9]:
# check columns and data types
print(data.info())

# get summaries
print(data.describe())

# check for nulls
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10071 entries, 1985-02-11 to 2025-01-30
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, AAPL)   10071 non-null  float64
 1   (High, AAPL)    10071 non-null  float64
 2   (Low, AAPL)     10071 non-null  float64
 3   (Open, AAPL)    10071 non-null  float64
 4   (Volume, AAPL)  10071 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 472.1 KB
None
Price          Close          High           Low          Open        Volume
Ticker          AAPL          AAPL          AAPL          AAPL          AAPL
count   10071.000000  10071.000000  10071.000000  10071.000000  1.007100e+04
mean       26.326238     26.585971     26.039555     26.305333  3.355846e+08
std        52.353153     52.848583     51.793928     52.299141  3.426937e+08
min         0.049847      0.050706      0.049847      0.050277  2.396800e+06
25%         0.282182      0.287606      0

Get X and Y

In [10]:
data["Target"] = data["Close"].shift(-1) # target: next day closing price
data.dropna(inplace=True) # remove one row with nan target

In [11]:
input_vars = ['Close', 'High', 'Low', 'Open']
output_var = 'Target'
X = data[input_vars]
y = data[output_var]

In [12]:
X

Price,Close,High,Low,Open
Ticker,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1985-02-11,0.104851,0.105710,0.104851,0.104851
1985-02-12,0.102273,0.105281,0.102273,0.104851
1985-02-13,0.097545,0.102273,0.097545,0.102273
1985-02-14,0.094967,0.098405,0.094967,0.097545
1985-02-15,0.096256,0.096686,0.094108,0.094967
...,...,...,...,...
2025-01-23,223.660004,227.029999,222.300003,224.740005
2025-01-24,222.779999,225.630005,221.410004,224.779999
2025-01-27,229.860001,232.149994,223.979996,224.020004
2025-01-28,238.259995,240.190002,230.809998,230.850006


In [13]:
y

Date
1985-02-11      0.102273
1985-02-12      0.097545
1985-02-13      0.094967
1985-02-14      0.096256
1985-02-15      0.094967
                 ...    
2025-01-23    222.779999
2025-01-24    229.860001
2025-01-27    238.259995
2025-01-28    239.360001
2025-01-29    237.589996
Name: Target, Length: 10070, dtype: float64

In [14]:
data[input_vars] = scale(data[input_vars], axis=0)
data_in = data[input_vars]
data_out = data[output_var]

In [15]:
data_in

Price,Close,High,Low,Open
Ticker,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1985-02-11,-0.500860,-0.501065,-0.500737,-0.500981
1985-02-12,-0.500910,-0.501074,-0.500787,-0.500981
1985-02-13,-0.501000,-0.501131,-0.500878,-0.501030
1985-02-14,-0.501049,-0.501204,-0.500928,-0.501120
1985-02-15,-0.501025,-0.501236,-0.500945,-0.501170
...,...,...,...,...
2025-01-23,3.772734,3.796299,3.792793,3.797738
2025-01-24,3.755911,3.769787,3.775595,3.798504
2025-01-27,3.891256,3.893259,3.825256,3.783960
2025-01-28,4.051835,4.045516,3.957234,3.914662


In [16]:
data_out

Date
1985-02-11      0.102273
1985-02-12      0.097545
1985-02-13      0.094967
1985-02-14      0.096256
1985-02-15      0.094967
                 ...    
2025-01-23    222.779999
2025-01-24    229.860001
2025-01-27    238.259995
2025-01-28    239.360001
2025-01-29    237.589996
Name: Target, Length: 10070, dtype: float64

In [17]:
def create_time_series_windows(data, window_size=30):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data_in[i:i+window_size])  # past 30 days
        y.append(data_out[i+window_size])    # target: next day's close price
    return np.array(X), np.array(y)

data_values = data[['Open', 'High', 'Low', 'Close', 'Volume']].values 
X, y = create_time_series_windows(data_values, window_size=30)
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

  y.append(data_out[i+window_size])    # target: next day's close price


X shape: torch.Size([10040, 30, 4])
y shape: torch.Size([10040])


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# Manully build dataloaders

# Train the model

In [20]:
input_dim = 4
n_embedding = 64 # how many embeddings to represent each token with
n_layers = 6
block_size = 30 # how many tokens in each "block"
batch_size = 16
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
print(device)
lr = 1e-4
n_epochs = 100

mps


In [21]:
cnn = CNN(input_dim, block_size, conv_layers=4)
cnn.to(device)

CNN(
  (conv1): Conv1d(4, 64, kernel_size=(2,), stride=(1,), padding=(1,))
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (convs): Sequential(
    (0): Conv1d(64, 64, kernel_size=(2,), stride=(1,), padding=(1,))
    (1): Conv1d(64, 64, kernel_size=(2,), stride=(1,), padding=(1,))
    (2): Conv1d(64, 64, kernel_size=(2,), stride=(1,), padding=(1,))
  )
  (pools): Sequential(
    (0): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

In [22]:
# use MSE Loss for regression
criterion = torch.nn.MSELoss()

In [23]:
optimizer = torch.optim.Adam(cnn.parameters(), lr=lr)

In [26]:
# Training the model
for epoch in range(n_epochs):
    cnn.train()  # Set the model to training mode
    total_loss = 0

    for data, labels in train_loader:
        # Forward pass
        outputs = cnn(data.to(device))
        #print(outputs.shape)
        #print(labels.shape)
        loss = criterion(labels.to(device), outputs)
        #print("Loss",loss.item())

        # Backward pass
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}")

Epoch [1/100], Loss: 2.9602
Epoch [2/100], Loss: 3.0029
Epoch [3/100], Loss: 3.1438
Epoch [4/100], Loss: 2.8908
Epoch [5/100], Loss: 2.8886
Epoch [6/100], Loss: 3.0241
Epoch [7/100], Loss: 3.1156
Epoch [8/100], Loss: 3.0650
Epoch [9/100], Loss: 3.0603
Epoch [10/100], Loss: 3.0350
Epoch [11/100], Loss: 2.7848
Epoch [12/100], Loss: 2.9474
Epoch [13/100], Loss: 2.8200
Epoch [14/100], Loss: 2.8222
Epoch [15/100], Loss: 2.7686


KeyboardInterrupt: 

In [28]:
def eval_model(model, dataloader):
    model.eval()
    total_loss = 0
    n_batches = len(dataloader)
    
    with torch.no_grad():
        for data, labels in dataloader:
            data, labels = data.to(device), labels.to(device)
            output = model(data.unsqueeze(0))
            
            mse = (output, labels)
            total_loss += mse.item()
            
    avg_loss = total_loss/n_batches
    return avg_loss

In [34]:
eval_model(cnn, test_dataset)

  mse = F.mse_loss(output, labels)


2.9931560380255395