<a href="https://colab.research.google.com/github/ibadrather/pytorch_learn/blob/main/Part%2014%20-%20Multivariate%20Timeseries%20Analysis%20using%20Pytorch%20and%20Pytorch%20Lightening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multivariate Time Series Forecasting with LSTM using PyTorch and PyTorch Lightning 

Source: https://www.youtube.com/watch?v=ODEGJ_kh2aA

In [4]:
!pip install --quiet pytorch-lightning
!pip install --quiet tqdm

[K     |████████████████████████████████| 585 kB 7.3 MB/s 
[K     |████████████████████████████████| 596 kB 63.5 MB/s 
[K     |████████████████████████████████| 140 kB 66.7 MB/s 
[K     |████████████████████████████████| 419 kB 69.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 50.3 MB/s 
[K     |████████████████████████████████| 144 kB 60.9 MB/s 
[K     |████████████████████████████████| 94 kB 3.6 MB/s 
[K     |████████████████████████████████| 271 kB 77.5 MB/s 
[?25h

In [3]:
!nvidia-smi

Sun Jun 19 05:57:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict

### Styling Settings

In [6]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10

tqdm.pandas()


In [7]:
# Random Seed Pytorch Lightning
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [8]:
# Mouting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/

data_path = "/content/drive/MyDrive/Colab Notebooks/binance_btc_usd_dataset_processed.csv"
features_df = pd.read_csv(data_path)
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
1,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
2,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
3,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
4,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
...,...,...,...,...,...,...,...,...,...
1439168,5,18,24,6,20643.04,20643.05,20617.26,-23.83,20619.22
1439169,5,18,24,6,20619.22,20638.02,20613.27,0.11,20619.33
1439170,5,18,24,6,20619.34,20645.13,20597.02,-22.31,20597.02
1439171,5,18,24,6,20597.03,20599.29,20568.90,2.26,20599.28


### Train-Test Split

In [10]:
split_ratio = 0.9
train_size = int(len(features_df) * split_ratio)
print(train_size)

1295255


In [11]:
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape

((1295255, 9), (143917, 9))

In [12]:
# Normalising the Data
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [13]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index = train_df.index,
    columns = train_df.columns
)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
1,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
2,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
3,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
4,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237


#### Cutting DataFrame in Multiple Sequences

In [14]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=3):
  sequences = []
  data_size = len(input_data)

  for i in tqdm(range(data_size - sequence_length)):
    sequence = input_data[i:i+sequence_length]
    label_position = i + sequence_length
    label = input_data.iloc[label_position][target_column]

    sequences.append((sequence, label))

  return sequences

#### Creating Training and Testing Sequences

In [15]:
SEQUENCE_LENGTH = 120

In [17]:
train_sequences = create_sequences(train_df, "close", sequence_length=SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", sequence_length=SEQUENCE_LENGTH)

  0%|          | 0/1295135 [00:00<?, ?it/s]

  0%|          | 0/143797 [00:00<?, ?it/s]

In [21]:
# Let's check first sequence label and data
print("Label: ", train_sequences[0][1])
print("")
print("Sequence: ",train_sequences[0][0])
print("Sequence Shape: ",train_sequences[0][0].shape)

Label:  -0.7968720708976027

Sequence:       day_of_week  day_of_month  week_of_year     month      open      high  \
0            1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
1            1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
2            1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
3            1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
4            1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
..           ...           ...           ...       ...       ...       ...   
115          1.0     -0.533333      0.346154  0.454545 -0.796720 -0.798670   
116          1.0     -0.533333      0.346154  0.454545 -0.796688 -0.798673   
117          1.0     -0.533333      0.346154  0.454545 -0.796778 -0.798762   
118          1.0     -0.533333      0.346154  0.454545 -0.796825 -0.798810   
119          1.0     -0.533333      0.346154  0.454545 -0.796849 -0.798709   

          low  close_ch

In [22]:
len(train_sequences), len(test_sequences)

(1295135, 143797)

## Creating PyTorch Datasets

In [43]:
class BTCDataset(Dataset):
  def __init__(self, sequences):
    self.sequences = sequences

  def __len__(self):
    return len(self.sequences)
  
  def __getitem__(self, idx):
    sequence, label = self.sequences[idx]
    return dict(
        sequence = torch.Tensor(sequence.to_numpy()),
        label = torch.tensor(label).float()
    )

In [44]:
class BTCPriceDataModule(pl.LightningDataModule):
  def __init__(
      self, train_sequences, test_sequences, batch_size = 8
  ):
    super().__init__()
    self.train_sequences = train_sequences
    self.test_sequences = test_sequences
    self.batch_size = batch_size
  
  def setup(self):
    self.train_dataset = BTCDataset(self.train_sequences)
    self.test_dataset = BTCDataset(self.test_sequences)
  
  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers = 2
    )

  def val_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        shuffle = False,
        num_workers = 1
    )
  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        shuffle = False,
        num_workers = 1
    )

### Model Parameters

In [45]:
N_EPOCHS = 8
BATCH_SIZE = 64

In [46]:
data_module = BTCPriceDataModule(train_sequences, test_sequences, batch_size = BATCH_SIZE)
data_module.setup()

In [47]:
train_dataset = BTCDataset(train_sequences)

In [52]:
# Testing our dataloader
a = iter(train_dataset)
b = next(a)
print("Sequence Shape: ", b["sequence"].shape)
print("Label: {} and Label Shape: {}".format(b["label"], b["label"].shape) )

Sequence Shape:  torch.Size([120, 9])
Label: -0.796872079372406 and Label Shape: torch.Size([])


## Model

In [54]:
class PricePredictionModel(nn.Module):
  def __init__(self, n_features, n_hidden=128, n_layers=2):
    super().__init__()

    self.n_hidden = n_hidden

    self.lstm = nn.LSTM(
        input_size = n_features,
        hidden_size = n_hidden,
        batch_first = True,
        num_layers = n_layers, # Stack LSTMs
        dropout = 0.2
    )

    self.regressor = nn.Linear(n_hidden, 1)

  def forward(self, x):
    self.lstm.flatten_parameters()  # For distrubuted training

    _, (hidden, _) = self.lstm(x)
    # We want the output from the last layer to go into the final
    # regressor linear layer
    out = hidden[-1] 

    return self.regressor(out)

In [None]:
class BTCPricePredictor(pl.LightningModule):

  def __init__(self, n_features: int):
    self.model = PricePredictionModel(n_features)
    self.criterion = nn.MSELoss()

  def forward(self, x, labels=None):
    output = self.model(x)
    
    loss = 0

    if labels is not None:
      loss
    
    return loss, output