<a href="https://colab.research.google.com/github/ibadrather/pytorch_learn/blob/main/Part%2014%20-%20Multivariate%20Timeseries%20Analysis%20using%20Pytorch%20and%20Pytorch%20Lightening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet pytorch-lightning
!pip install --quiet tqdm

[K     |████████████████████████████████| 585 kB 8.4 MB/s 
[K     |████████████████████████████████| 596 kB 33.0 MB/s 
[K     |████████████████████████████████| 419 kB 51.9 MB/s 
[K     |████████████████████████████████| 140 kB 54.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.9 MB/s 
[K     |████████████████████████████████| 144 kB 60.8 MB/s 
[K     |████████████████████████████████| 271 kB 56.4 MB/s 
[K     |████████████████████████████████| 94 kB 573 kB/s 
[?25h

In [2]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

### Styling Settings

In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10

tqdm.pandas()


In [4]:
# Random Seed Pytorch Lightning
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [5]:
# Mouting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/

data_path = "/content/drive/MyDrive/Colab Notebooks/binance_btc_usd_dataset_processed.csv"
features_df = pd.read_csv(data_path)
features_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
1,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
2,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
3,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
4,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
...,...,...,...,...,...,...,...,...,...
1439168,5,18,24,6,20643.04,20643.05,20617.26,-23.83,20619.22
1439169,5,18,24,6,20619.22,20638.02,20613.27,0.11,20619.33
1439170,5,18,24,6,20619.34,20645.13,20597.02,-22.31,20597.02
1439171,5,18,24,6,20597.03,20599.29,20568.90,2.26,20599.28


### Train-Test Split

In [7]:
split_ratio = 0.9
train_size = int(len(features_df) * split_ratio)
print(train_size)

1295255


In [8]:
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape

((1295255, 9), (143917, 9))

In [9]:
# Normalising the Data
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [10]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index = train_df.index,
    columns = train_df.columns
)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
1,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
2,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
3,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
4,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237


#### Cutting DataFrame in Multiple Sequences

In [11]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=3):
  sequences = []
  data_size = len(input_data)

  for i in tqdm(range(data_size - sequence_length)):
    sequence = input_data[i:i+sequence_length]
    label_position = i + sequence_length
    label = input_data.iloc[label_position][target_column]

    sequences.append((sequence, label))

  return sequences

#### Creating Training and Testing Sequences

In [12]:
SEQUENCE_LENGTH = 60

In [13]:
train_sequences = create_sequences(train_df, "close", sequence_length=SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", sequence_length=SEQUENCE_LENGTH)

  0%|          | 0/1295195 [00:00<?, ?it/s]

  0%|          | 0/143857 [00:00<?, ?it/s]

In [14]:
# Let's check first sequence label and data
print("Label: ", train_sequences[0][1])
print("")
print("Sequence: ",train_sequences[0][0])

Label:  -0.8072370347589881

Sequence:      day_of_week  day_of_month  week_of_year     month      open      high  \
0           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
1           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
2           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
3           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
4           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
5           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
6           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
7           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
8           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
9           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
10          1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
11          1.0     -0.533333      0

In [15]:
len(train_sequences), len(test_sequences)

(1295195, 143857)