# WorkBook

In this workbook we interactively develop the key components of the project. Once they work, they will be refactored and moved into pure Python modules.

## Data Loading

We first need to load the data.

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Location of the input CSV file (a relative path: Data/data.csv)
dirpath, filename = Path('Data'), 'data.csv'
path = dirpath.joinpath(filename)

In [3]:
# Read the whole CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer=path)

In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(22695, 2)

In [5]:
# Preview the first rows to see the columns and the sampling of the timestamps
data.head()

Unnamed: 0,timestamp,value
0,2013-12-02 21:15:00,73.967322
1,2013-12-02 21:20:00,74.935882
2,2013-12-02 21:25:00,76.124162
3,2013-12-02 21:30:00,78.140707
4,2013-12-02 21:35:00,79.329836


In [6]:
# Every column except the first one (the timestamp) holds one time series,
# so the number of time series is the column count minus one.
nb_ts = len(data.columns) - 1

# TOCHECK: what if the data has no timestamp column?
# NOTE: nb_ts is used further down, in the shape assertions on X and y.

In [7]:
# N is the total length of the dataset, i.e. its number of rows
N = len(data)

In [8]:
# The data has to be a numpy array to fit into a torch tensor later, so keep
# everything after the first column (the timestamp) as a 2-D array.
# NOTE: this rebinds `data` from a DataFrame to an ndarray, so this cell cannot
# be re-run on its own; re-run the loading cell first.
data = data.iloc[:, 1:].to_numpy()

# TOCHECK: what should we do with the timestamp (when there is one)? Here we drop it, which is probably wrong...

In [9]:
# Inspect the resulting array (numpy abbreviates the middle rows in its repr)
data

array([[73.96732207],
       [74.935882  ],
       [76.12416182],
       ...,
       [97.13546835],
       [98.05685212],
       [96.90386085]])

In [10]:
# X holds the inputs and y the targets, with f(X) = y.
# The goal of the neural network is to model f().
X, y = [], []

In [11]:
# w   : size of the window taken on the time series to predict the output (i.e. the size of each item of X)
# p_w : number of steps in the future we want to predict (i.e. the size of each item of y)
w, p_w = 10, 2

In [12]:
# Each item needs w input steps followed by p_w target steps, so the last valid
# start index is N - (w + p_w); counting from index 0 gives one more item.
nitems = N - (w + p_w) + 1

In [13]:
# Let's now populate X and y.
# Both lists are rebuilt from scratch (rather than appended to), so re-running
# this cell cannot accumulate duplicate windows.
# X[i] is the window data[i:i+w]; y[i] is the p_w steps that immediately follow it.
X = [data[i:i + w] for i in range(nitems)]
y = [data[i + w:i + w + p_w] for i in range(nitems)]

In [14]:
# Sanity check: one input window was built per expected item
assert len(X) == nitems

In [15]:
# Train/validation split. It cannot be random: the validation set has to
# follow the training set in time, so we split at a single index.
train_size = 0.8  # fraction of the items used for training

# Index of the first validation item
idxvalid = int(train_size * nitems)

In [16]:
# Let's move the data to a torch tensor
import numpy as np
import torch

# Collapse each list of windows into a single ndarray before handing it to torch:
# building a tensor directly from a Python list of ndarrays converts the items
# one by one and is extremely slow (recent PyTorch versions warn about it).
# The result is float32, as before.
X = torch.as_tensor(np.asarray(X), dtype=torch.float32)
y = torch.as_tensor(np.asarray(y), dtype=torch.float32)

In [17]:
# X must be (nitems, w, nb_ts) and y must be (nitems, p_w, nb_ts).
# Comparing the whole shape also checks the number of dimensions, and the
# message shows the actual shape when a check fails.
expected_X_shape = (nitems, w, nb_ts)
expected_y_shape = (nitems, p_w, nb_ts)
assert tuple(X.shape) == expected_X_shape, f"X has shape {tuple(X.shape)}, expected {expected_X_shape}"
assert tuple(y.shape) == expected_y_shape, f"y has shape {tuple(y.shape)}, expected {expected_y_shape}"

In [18]:
# Import objects from pytorch for dataset and dataloader
from torch.utils.data import DataLoader, TensorDataset

In [19]:
# Items before idxvalid go to training, items from idxvalid onwards to validation
train_ds, valid_ds = (
    TensorDataset(X[:idxvalid], y[:idxvalid]),
    TensorDataset(X[idxvalid:], y[idxvalid:]),
)

# TOCHECK: should we move to an IterableDataset() instead? See https://pytorch.org/docs/stable/data.html

In [20]:
# Now create the DataLoaders

# Batch size shared by both loaders
bs = 64

# Neither loader shuffles, so batches are drawn in the order of the datasets
loader_options = dict(batch_size=bs, shuffle=False)
train_dl = DataLoader(train_ds, **loader_options)
valid_dl = DataLoader(valid_ds, **loader_options)

In [21]:
# Fetch the first batch of the training loader to inspect it
a = next(iter(train_dl))

# TOCHECK: why is a a list and not a tensor??
# Answer: train_ds is a TensorDataset built from two tensors (X and y), so every
# sample is an (X, y) pair. The DataLoader's default collate function batches each
# position of the pair separately and returns them as a list:
# a[0] is the batch of inputs and a[1] the batch of targets.

In [22]:
# Shape of the input batch: (batch size, w, nb_ts)
a[0].shape

torch.Size([64, 10, 1])

In [23]:
# Shape of the target batch: (batch size, p_w, nb_ts)
a[1].shape

torch.Size([64, 2, 1])

In [27]:
# Let's define some functions
import torch.nn as nn

def conv(ninputs, nfilters, kernel_size, padding=1):
    """Build a 1-D convolution layer with a bias term.

    Args:
        ninputs: number of input channels.
        nfilters: number of output channels (filters).
        kernel_size: size of the convolution kernel.
        padding: zero-padding added to both sides of the input. Defaults to 1,
            the value that used to be hard-coded. With the default stride and
            dilation this keeps the sequence length unchanged only when
            kernel_size is 3; pass kernel_size // 2 to keep it unchanged for
            other odd kernel sizes.

    Returns:
        The configured nn.Conv1d layer.
    """
    return nn.Conv1d(ninputs, nfilters, kernel_size, padding=padding, bias=True)

def maxpool(filter_size):
    """Build a 1-D max-pooling layer whose window size is filter_size."""
    pooling_layer = nn.MaxPool1d(kernel_size=filter_size)
    return pooling_layer

def activation():
    """Build a new ReLU activation layer."""
    relu_layer = nn.ReLU(inplace=False)
    return relu_layer