# Data Processing
## Import libraries

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler

## Load data

In [2]:
stock = "AAPL"
ticker = yf.Ticker(stock)
data = ticker.history(interval="1d", period="max")

print("Number of rows: ", len(data))
data.head(5)

Number of rows:  11074


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980-12-12 00:00:00-05:00,0.098834,0.099264,0.098834,0.098834,469033600,0.0,0.0
1980-12-15 00:00:00-05:00,0.094108,0.094108,0.093678,0.093678,175884800,0.0,0.0
1980-12-16 00:00:00-05:00,0.087232,0.087232,0.086802,0.086802,105728000,0.0,0.0
1980-12-17 00:00:00-05:00,0.088951,0.089381,0.088951,0.088951,86441600,0.0,0.0
1980-12-18 00:00:00-05:00,0.09153,0.091959,0.09153,0.09153,73449600,0.0,0.0


## Clean data

In [3]:
data = data.reset_index()
data = data.drop(["Date", "Dividends", "Stock Splits"], axis=1)
data = data[(data != 0).all(axis=1)]

## Preprocess data

In [4]:
data = data.pct_change()
data = data.dropna()
data = data.to_numpy()

print("Contains NaN:", np.isnan(data).any())
print("Contains Inf:", np.isinf(data).any())

Contains NaN: False
Contains Inf: False


In [5]:
scaler = MinMaxScaler((0, 1))
data = scaler.fit_transform(data)

In [6]:
n_columns = 5
n_windows = 64 
size = len(data)

X = np.zeros((size - n_windows, n_windows, n_columns))
y = np.zeros((size - n_windows, n_columns - 1))

for i in range(size - n_windows):
    X[i] = data[i : i + n_windows]
    y[i] = data[i + n_windows, 0: n_columns - 1]

## Setup Dataset

In [7]:
class StockDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype = torch.float32)
        self.y = torch.tensor(y, dtype = torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        x = self.X[index]
        y = self.y[index]
        return x, y

dataset = StockDataset(X, y)

## Setup DataLoader

In [8]:
p_split = 0.8
batch_size = 32

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [p_split, 1 - p_split])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Model

In [10]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [11]:
n_epochs = 10

model.train()
for epoch in range(n_epochs):
    epoch_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"EPOCH {epoch+1}/{n_epochs}: Loss - {epoch_loss/ len(train_loader):.4f}")

EPOCH 1/10: Loss - 0.2009
EPOCH 2/10: Loss - 0.0012
EPOCH 3/10: Loss - 0.0012
EPOCH 4/10: Loss - 0.0012
EPOCH 5/10: Loss - 0.0012
EPOCH 6/10: Loss - 0.0012
EPOCH 7/10: Loss - 0.0012
EPOCH 8/10: Loss - 0.0012
EPOCH 9/10: Loss - 0.0012
EPOCH 10/10: Loss - 0.0012
