<a href="https://colab.research.google.com/github/ibadrather/pytorch_learn/blob/main/Part%2014%20-%20Multivariate%20Timeseries%20Analysis%20using%20Pytorch%20and%20Pytorch%20Lightening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multivariate Time Series Forecasting with LSTM using PyTorch and PyTorch Lightning 

Source: https://www.youtube.com/watch?v=ODEGJ_kh2aA

In [48]:
!pip install --quiet pytorch-lightning
!pip install --quiet tqdm

In [49]:
# Version-pinned variant of the install cell, kept for reference only.
# NOTE(review): presumably the versions the source tutorial was written against — confirm before use.
# !pip install --quiet pytorch-lightning#==1.2.5
# !pip install --quiet tqdm#==4.59.0

In [50]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Sun Jun 19 09:07:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |   1650MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [51]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

from collections import defaultdict

### Styling Settings

In [52]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10

tqdm.pandas()


In [53]:
# Fix the global random seed through PyTorch Lightning so repeated runs are comparable
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [54]:
# Mounting Google Drive (Colab only) so the dataset CSV under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/
# The file read below is a processed copy of that data stored on the mounted Drive.

data_path = "/content/drive/MyDrive/Colab Notebooks/binance_btc_usd_dataset_processed.csv"
features_df = pd.read_csv(data_path)
# Display the frame (pandas truncates the rendering to the first and last rows)
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
1,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
2,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
3,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
4,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
...,...,...,...,...,...,...,...,...,...
1439168,5,18,24,6,20643.04,20643.05,20617.26,-23.83,20619.22
1439169,5,18,24,6,20619.22,20638.02,20613.27,0.11,20619.33
1439170,5,18,24,6,20619.34,20645.13,20597.02,-22.31,20597.02
1439171,5,18,24,6,20597.03,20599.29,20568.90,2.26,20599.28


### Train-Test Split

In [56]:
# Fraction of rows, counted from the start of the frame, reserved for training.
split_ratio = 0.9
train_size = int(split_ratio * len(features_df))
print(train_size)

1295255


In [57]:
# Chronological split: the first `train_size` rows train the model, the tail tests it.
# NOTE(review): the test slice starts at `train_size+1`, so the row at position
# `train_size` ends up in neither frame. Any later code that re-slices `features_df`
# for the test period must use the same boundary to stay aligned with `test_df`.
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape

((1295255, 9), (143917, 9))

In [58]:
# Normalising the Data
# The scaler is fitted on the training rows only and maps every column into [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_df)

In [59]:
# Replace the training frame with its scaled values, keeping row labels and column names.
train_df = pd.DataFrame(
    data = scaler.transform(train_df),
    columns = train_df.columns,
    index = train_df.index,
)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
1,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
2,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
3,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
4,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237


In [62]:
# Scale the test frame with the scaler fitted on the training rows, keeping labels and names.
test_df = pd.DataFrame(
    data = scaler.transform(test_df),
    columns = test_df.columns,
    index = test_df.index,
)
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
1295256,1.0,0.733333,-0.730769,-0.818182,0.040265,0.040479,0.044923,0.387595,0.040717
1295257,1.0,0.733333,-0.730769,-0.818182,0.040717,0.044925,0.044679,0.414015,0.043946
1295258,1.0,0.733333,-0.730769,-0.818182,0.043946,0.043048,0.047134,0.37238,0.042799
1295259,1.0,0.733333,-0.730769,-0.818182,0.042799,0.041982,0.045363,0.365851,0.040966
1295260,1.0,0.733333,-0.730769,-0.818182,0.040967,0.041557,0.04548,0.394792,0.042175


#### Cutting DataFrame in Multiple Sequences

In [63]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=3):
  """Cut a frame into overlapping (window, next-value) pairs.

  For every start position ``i`` the window is the ``sequence_length`` rows
  beginning at ``i`` and the label is the ``target_column`` value of the row
  immediately after the window.

  Args:
    input_data: frame whose rows are consecutive observations.
    target_column: name of the column that supplies the labels.
    sequence_length: number of rows in each window.

  Returns:
    A list of ``(window_dataframe, label)`` tuples; empty when the frame has
    no row left after a full window.
  """
  data_size = len(input_data)

  # Extract the label column once. Looking it up as `iloc[row][column]` inside
  # the loop would build a complete row Series on every iteration.
  target_values = input_data[target_column].to_numpy()

  sequences = []
  for start in tqdm(range(data_size - sequence_length)):
    label_position = start + sequence_length
    # .iloc makes the positional (not label-based) slicing explicit.
    window = input_data.iloc[start:label_position]
    sequences.append((window, target_values[label_position]))

  return sequences

#### Creating Training and Testing Sequences

In [64]:
# Number of consecutive rows that make up one input window.
SEQUENCE_LENGTH = 120

In [None]:
# Window both splits with the same length; the "close" column supplies the labels.
train_sequences = create_sequences(
    input_data = train_df, target_column = "close", sequence_length = SEQUENCE_LENGTH
)
test_sequences = create_sequences(
    input_data = test_df, target_column = "close", sequence_length = SEQUENCE_LENGTH
)

  0%|          | 0/1295135 [00:00<?, ?it/s]

In [None]:
# Let's check first sequence label and data
first_window, first_target = train_sequences[0]
print("Label: ", first_target)
print("")
print("Sequence: ", first_window)
print("Sequence Shape: ", first_window.shape)

In [None]:
# Number of (window, label) pairs available for training and for testing.
len(train_sequences), len(test_sequences)

## Creating PyTorch Datasets

In [None]:
class BTCDataset(Dataset):
  """Dataset over a list of ``(window_dataframe, label)`` pairs.

  Each item is a dict with a ``sequence`` tensor built from the window's
  values and a float ``label`` tensor.
  """

  def __init__(self, sequences):
    self.sequences = sequences

  def __len__(self):
    return len(self.sequences)

  def __getitem__(self, idx):
    window, target = self.sequences[idx]
    # DataFrame -> ndarray -> tensor for the features.
    features = torch.Tensor(window.to_numpy())
    # Scalar label, cast to float32.
    target_tensor = torch.tensor(target).float()
    return {"sequence": features, "label": target_tensor}

In [None]:
class BTCPriceDataModule(pl.LightningDataModule):
  """Lightning data module serving the windowed BTC sequences.

  The training loader uses ``batch_size`` samples per batch. The validation
  and test loaders both read the test sequences one sample at a time.
  No loader shuffles, so samples keep their original order.
  """

  def __init__(
      self, train_sequences, test_sequences, batch_size = 8
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_sequences = train_sequences
    self.test_sequences = test_sequences

  def setup(self, stage=None):
    # Wrap the raw (window, label) lists in Dataset objects.
    self.train_dataset = BTCDataset(self.train_sequences)
    self.test_dataset = BTCDataset(self.test_sequences)

  def _ordered_loader(self, dataset, batch_size, num_workers):
    # Shared factory: every loader keeps shuffle switched off.
    return DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = False,
        num_workers = num_workers
    )

  def train_dataloader(self):
    return self._ordered_loader(self.train_dataset, self.batch_size, 2)

  def val_dataloader(self):
    return self._ordered_loader(self.test_dataset, 1, 1)

  def test_dataloader(self):
    return self._ordered_loader(self.test_dataset, 1, 1)

### Model Parameters

In [None]:
# Upper bound on training epochs (passed to the Trainer as max_epochs).
N_EPOCHS = 8
# Batch size handed to the data module for its training loader.
BATCH_SIZE = 64

In [None]:
# Build the data module and create its datasets right away so the loaders can be inspected.
data_module = BTCPriceDataModule(
    train_sequences = train_sequences,
    test_sequences = test_sequences,
    batch_size = BATCH_SIZE,
)
data_module.setup()

In [None]:
# Stand-alone dataset over the training windows, used to look at a single sample.
train_dataset = BTCDataset(train_sequences)

In [None]:
# Inspect the first sample of the dataset (one window, not a batch from the DataLoader)
a = iter(train_dataset)
b = next(a)
sample_label = b["label"]
print("Sequence Shape: ", b["sequence"].shape)
print("Label: {} and Label Shape: {}".format(sample_label, sample_label.shape))

## Model

In [None]:
class PricePredictionModel(nn.Module):
  """Stacked LSTM followed by a linear layer that emits one value per sequence.

  Args:
    n_features: number of input features per time step.
    n_hidden: size of the LSTM hidden state.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability between stacked LSTM layers. Forced to 0
      when ``n_layers`` is 1, because there is no inter-layer connection to
      apply it to and PyTorch emits a warning in that case.
  """

  def __init__(self, n_features, n_hidden=256, n_layers=2, dropout=0.2):
    super().__init__()

    self.n_hidden = n_hidden

    self.lstm = nn.LSTM(
        input_size = n_features,
        hidden_size = n_hidden,
        batch_first = True,
        num_layers = n_layers, # Stack LSTMs
        dropout = dropout if n_layers > 1 else 0.0
    )

    self.regressor = nn.Linear(n_hidden, 1)

  def forward(self, x):
    """Run ``x`` through the LSTM and regress from the last layer's final hidden state.

    The LSTM is configured with ``batch_first=True``, so a batched input is
    laid out as (batch, sequence, feature). The result has a trailing
    dimension of size 1.
    """
    self.lstm.flatten_parameters()  # For distributed training

    _, (hidden, _) = self.lstm(x)
    # We want the output from the last layer to go into the final
    # regressor linear layer
    out = hidden[-1]

    return self.regressor(out)

In [None]:
class BTCPricePredictor(pl.LightningModule):
  """Lightning module that trains ``PricePredictionModel`` with an MSE loss."""

  def __init__(self, n_features: int):
    super().__init__()
    self.model = PricePredictionModel(n_features)
    self.criterion = nn.MSELoss()

  def forward(self, x, labels=None):
    """Return ``(loss, output)``; the loss is 0 when no labels are given."""
    output = self.model(x)

    if labels is None:
      return 0, output

    # Give the labels a trailing axis before comparing them with the model output.
    targets = labels.unsqueeze(dim=1)
    return self.criterion(output, targets), output

  def _run_step(self, batch, metric_name):
    # Shared body of the train/val/test steps: compute the loss and log it under `metric_name`.
    loss, _ = self.forward(batch["sequence"], batch["label"])
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    return self._run_step(batch, "train_loss")

  def validation_step(self, batch, batch_idx):
    return self._run_step(batch, "val_loss")

  def test_step(self, batch, batch_idx):
    return self._run_step(batch, "test_loss")

  def configure_optimizers(self):
    # Adam over the wrapped model's parameters with a fixed learning rate.
    return optim.Adam(self.model.parameters(), lr=0.001)

In [None]:
# Feature count = size of the second axis of one sample's sequence tensor.
n_features = b["sequence"].size(1)

model = BTCPricePredictor(n_features)

In [None]:
# Sanity check: print the tensor shapes of the first training batch only.
for item in data_module.train_dataloader():
  print(item["sequence"].shape)
  print(item["label"].shape)
  break  # one batch is enough

In [None]:
# Start TensorBoard inside the notebook, reading from the ./lightning_logs directory
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# Keep a single checkpoint: the one with the lowest monitored "val_loss".
checkpoint_callback = ModelCheckpoint(
    monitor = "val_loss",
    mode = "min",
    save_top_k = 1,
    dirpath = "checkpoints",
    filename = "best-checkpoint",
    verbose = True,
)

# TensorBoard logs go under lightning_logs/btc-price.
logger = TensorBoardLogger("lightning_logs", name = "btc-price")

# Early stopping watches the same metric with a patience of 2.
early_stopping_callback = EarlyStopping(patience = 2, monitor = "val_loss")

In [None]:
# The ModelCheckpoint instance is registered through `callbacks`: handing it to the
# `checkpoint_callback` Trainer argument is deprecated and rejected by later
# Lightning releases, while the `callbacks` list is accepted throughout.
# NOTE(review): `gpus` and `progress_bar_refresh_rate` are version-dependent Trainer
# arguments as well — confirm them against the installed pytorch-lightning version.
trainer = pl.Trainer(
    logger = logger,
    callbacks = [checkpoint_callback, early_stopping_callback],
    max_epochs = N_EPOCHS,
    gpus = 1,
    progress_bar_refresh_rate = 30
)

In [None]:
trainer.fit(model, data_module)

### Testing the Trained Model

In [32]:
# Load the best checkpoint recorded by the ModelCheckpoint callback. An empty path
# makes load_from_checkpoint fail with FileNotFoundError. `best_model_path` is empty
# when no training ran in this session, so fall back to the file the callback is
# configured to write (dirpath "checkpoints", filename "best-checkpoint").
checkpoint_path = checkpoint_callback.best_model_path or "checkpoints/best-checkpoint.ckpt"

trained_model = BTCPricePredictor.load_from_checkpoint(
    checkpoint_path,
    n_features = n_features   # 9 in this case
)

FileNotFoundError: ignored

In [None]:
# Freezing the model for faster predictions (it is only used for inference from here on)
trained_model.freeze()

In [None]:
# Run the trained model over every test window, collecting prediction and true label.
test_dataset = BTCDataset(test_sequences)

predictions = []
labels = []

for item in tqdm(test_dataset):
  sequence = item["sequence"]
  label = item["label"]

  # A dataset item is a single 2-D window, while the LSTM is configured with
  # batch_first=True; add a batch axis of size 1 so the input matches training.
  _, output = trained_model(sequence.unsqueeze(dim=0))
  predictions.append(output.item())
  labels.append(label.item())

In [None]:
# Compare the number of predictions with the number of test rows.
len(predictions), len(test_df)

In [None]:
# Expected number of windows: the first SEQUENCE_LENGTH rows have no full window before them.
len(test_df) - SEQUENCE_LENGTH

### Inverse scaling: the values are still normalised by MinMaxScaler, so they must be mapped back to prices

In [None]:
# The values are normalised using MinMaxScaler, so this frame still holds scaled numbers
test_df.head()

In [None]:
# Print the fitted scaler's per-column min_ and scale_ arrays; they are reused to undo the scaling
print(scaler.min_)
print(scaler.scale_)

In [None]:
# Build a scaler that undoes the scaling of the "close" column only, by copying that
# column's fitted min_ and scale_ onto a fresh MinMaxScaler *instance*. Without the
# parentheses the attributes would be written onto the MinMaxScaler class itself, and
# inverse_transform could not be called on it.
close_position = train_df.columns.get_loc("close")
descaler = MinMaxScaler()
descaler.min_, descaler.scale_ = scaler.min_[close_position], scaler.scale_[close_position]

In [None]:
def descale(descaler, values):
  """Map a flat sequence of scaled values back through ``descaler``.

  The values are lifted to a single column, because the scaler works only
  with 2D data, passed to ``descaler.inverse_transform`` and flattened again.
  """
  column = np.expand_dims(np.array(values), axis=1)
  restored = descaler.inverse_transform(column)
  return restored.flatten()

In [None]:
# Map both series back to price units with the same helper
# (the labels must go through `descale`, not through the scaler object itself).
predictions_descaled = descale(descaler, predictions)
labels_descaled = descale(descaler, labels)

In [None]:
# Spot-check the first three descaled predictions against the first three descaled labels.
print(predictions_descaled[:3])
print(labels_descaled[:3])

### Plotting Prediction vs Ground Truth

In [None]:
# Unscaled rows of the test period, selected through the row labels that test_df kept
# from features_df, so the two frames cover exactly the same rows.
test_data = features_df.loc[test_df.index]
len(test_data), len(test_df)

In [None]:
# Skip the first SEQUENCE_LENGTH rows: the first label is the row right after the first window.
test_sequences_data = test_data.iloc[SEQUENCE_LENGTH:]
len(test_sequences_data), len(test_sequences)

In [None]:
# Preview the unscaled rows that line up with the predictions.
test_sequences_data.head()

In [None]:
# The frame displayed when the CSV was loaded has no "date" column, so reading
# `.date` unconditionally fails. Use real dates when the column exists and the
# row index otherwise.
if "date" in test_sequences_data.columns:
  x_values = pd.to_datetime(test_sequences_data["date"])
  x_label = "date"
else:
  x_values = test_sequences_data.index
  x_label = "row index"

fig, ax = plt.subplots()
ax.plot(x_values, predictions_descaled, "-", label = "Predicted", color = "g")
ax.plot(x_values, labels_descaled, "--", label = "Real", color = "b")
ax.set_title("BTC close price: prediction vs ground truth")
ax.set_xlabel(x_label)
ax.set_ylabel("close")
ax.tick_params(axis = "x", rotation = 45)
ax.legend();