<a href="https://colab.research.google.com/github/ibadrather/pytorch_learn/blob/main/Part%2014%20-%20Multivariate%20Timeseries%20Analysis%20using%20Pytorch%20and%20Pytorch%20Lightening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install --quiet pytorch-lightning
!pip install --quiet tqdm

In [16]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

### Styling Settings

In [17]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#93D30C", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 14, 10

tqdm.pandas()


In [18]:
# Random Seed Pytorch Lightning
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [19]:
# Mouting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Dataset Source: https://www.cryptodatadownload.com/data/binance/

data_path = "/content/drive/MyDrive/Colab Notebooks/Binance_BTCUSDT_minute.csv"
df = pd.read_csv(data_path, parse_dates=["date"])
df

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
0,1655511420000,2022-06-18 00:17:00,BTC/USDT,20599.28,20626.34,20599.28,20613.06,20.35142,4.195369e+05,429
1,1655511360000,2022-06-18 00:16:00,BTC/USDT,20597.03,20599.29,20568.90,20599.28,64.92620,1.336134e+06,957
2,1655511300000,2022-06-18 00:15:00,BTC/USDT,20619.34,20645.13,20597.02,20597.02,56.02534,1.155457e+06,856
3,1655511240000,2022-06-18 00:14:00,BTC/USDT,20619.22,20638.02,20613.27,20619.33,67.57083,1.393749e+06,758
4,1655511180000,2022-06-18 00:13:00,BTC/USDT,20643.04,20643.05,20617.26,20619.22,58.09482,1.198367e+06,821
...,...,...,...,...,...,...,...,...,...,...
1439168,1567965660000,2019-09-08 18:01:00,BTC/USDT,10000.00,10000.00,10000.00,10000.00,0.00000,0.000000e+00,0
1439169,1567965600000,2019-09-08 18:00:00,BTC/USDT,10000.00,10000.00,10000.00,10000.00,0.00000,0.000000e+00,0
1439170,1567965540000,2019-09-08 17:59:00,BTC/USDT,10000.00,10000.00,10000.00,10000.00,0.00100,1.000000e+01,1
1439171,1567965480000,2019-09-08 17:58:00,BTC/USDT,10000.00,10000.00,10000.00,10000.00,0.00000,0.000000e+00,0


In [21]:
df = df.sort_values(by=["date"]).reset_index()
df.head()

Unnamed: 0,index,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount
0,1439172,1567965420000,2019-09-08 17:57:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1
1,1439171,1567965480000,2019-09-08 17:58:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0
2,1439170,1567965540000,2019-09-08 17:59:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1
3,1439169,1567965600000,2019-09-08 18:00:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0
4,1439168,1567965660000,2019-09-08 18:01:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0


In [22]:
df.columns

Index(['index', 'unix', 'date', 'symbol', 'open', 'high', 'low', 'close',
       'Volume BTC', 'Volume USDT', 'tradecount'],
      dtype='object')

# Pre-Processing Data

In [23]:
# Adding a previously_closed data column to our dataframe
df["prev_close"] = df.shift(1)["close"]
df.head()

Unnamed: 0,index,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close
0,1439172,1567965420000,2019-09-08 17:57:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1,
1,1439171,1567965480000,2019-09-08 17:58:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0
2,1439170,1567965540000,2019-09-08 17:59:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1,10000.0
3,1439169,1567965600000,2019-09-08 18:00:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0
4,1439168,1567965660000,2019-09-08 18:01:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0


In [24]:
# Adding another column that will be a difference from previous close
# this will have the change in close price data per minute
df["close_change"] = df.progress_apply(
    lambda row: 0 if np.isnan(row.prev_close) else row.close - row.prev_close,
    axis=1
)
df.head()

  0%|          | 0/1439173 [00:00<?, ?it/s]

Unnamed: 0,index,unix,date,symbol,open,high,low,close,Volume BTC,Volume USDT,tradecount,prev_close,close_change
0,1439172,1567965420000,2019-09-08 17:57:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1,,0.0
1,1439171,1567965480000,2019-09-08 17:58:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0,0.0
2,1439170,1567965540000,2019-09-08 17:59:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.001,10.0,1,10000.0,0.0
3,1439169,1567965600000,2019-09-08 18:00:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0,0.0
4,1439168,1567965660000,2019-09-08 18:01:00,BTC/USDT,10000.0,10000.0,10000.0,10000.0,0.0,0.0,0,10000.0,0.0


## Converting Dataframe in Features

In [27]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
  row_data = dict(
      day_of_week = row.date.dayofweek,
      day_of_month = row.date.day,
      week_of_year = row.date.week,
      month = row.date.month,
      open = row.open,
      high = row.high,
      low = row.low,
      close_change = row.close_change,
      close = row.close
      )
  rows.append(row_data)

# Convert this into Pandas DataFrame
features_df = pd.DataFrame(rows)
features_df

  0%|          | 0/1439173 [00:00<?, ?it/s]

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
1,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
2,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
3,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
4,6,8,36,9,10000.00,10000.00,10000.00,0.00,10000.00
...,...,...,...,...,...,...,...,...,...
1439168,5,18,24,6,20643.04,20643.05,20617.26,-23.83,20619.22
1439169,5,18,24,6,20619.22,20638.02,20613.27,0.11,20619.33
1439170,5,18,24,6,20619.34,20645.13,20597.02,-22.31,20597.02
1439171,5,18,24,6,20597.03,20599.29,20568.90,2.26,20599.28


In [54]:
features_df.to_csv("binance_btc_usd_dataset_processed.csv", index=False)

### Train-Test Split

In [33]:
split_ratio = 0.9
train_size = int(len(features_df) * split_ratio)
print(train_size)

1295255


In [34]:
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape

((1295255, 9), (143917, 9))

In [35]:
# Normalising the Data
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [36]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index = train_df.index,
    columns = train_df.columns
)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
1,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
2,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
3,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237
4,1.0,-0.533333,0.346154,0.454545,-0.807237,-0.809233,-0.804245,0.38329,-0.807237


#### Cutting DataFrame in Multiple Sequences

In [39]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=3):
  sequences = []
  data_size = len(input_data)

  for i in tqdm(range(data_size - sequence_length)):
    sequence = input_data[i:i+sequence_length]
    label_position = i + sequence_length
    label = input_data.iloc[label_position][target_column]

    sequences.append((sequence, label))

  return sequences

#### Creating Training and Testing Sequences

In [50]:
SEQUENCE_LENGTH = 60

In [51]:
train_sequences = create_sequences(train_df, "close", sequence_length=SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", sequence_length=SEQUENCE_LENGTH)

  0%|          | 0/1295195 [00:00<?, ?it/s]

  0%|          | 0/143857 [00:00<?, ?it/s]

AttributeError: ignored

In [52]:
# Let's check first sequence label and data
print("Label: ", train_sequences[0][1])
print("")
print("Sequence: ",train_sequences[0][0])

Label:  -0.8072370347589881

Sequence:      day_of_week  day_of_month  week_of_year     month      open      high  \
0           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
1           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
2           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
3           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
4           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
5           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
6           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
7           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
8           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
9           1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
10          1.0     -0.533333      0.346154  0.454545 -0.807237 -0.809233   
11          1.0     -0.533333      0

In [53]:
len(train_sequences), len(test_sequences)

(1295195, 143857)