In [1]:
# Get code from WorkBook.ipynb

from pathlib import Path
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

def conv(ninputs, nfilters, kernel_size, padding=1):
    """ Returns a 1D convolution layer (with bias)
        Args:
            - ninputs: number of input channels
            - nfilters: number of filters (i.e. output channels)
            - kernel_size: size of the convolution kernel
            - padding: zero-padding added to both ends of the sequence (default to 1,
            which keeps the sequence length unchanged when kernel_size is 3)
    """
    return nn.Conv1d(ninputs, nfilters, kernel_size, padding=padding, bias=True)

def maxpool(filter_size, padding):
    """ Returns a 1D max pooling layer
        Args:
            - filter_size: size of the pooling window (no stride is passed, so
            nn.MaxPool1d uses its default stride)
            - padding: padding added to both ends of the sequence
    """
    pooling_layer = nn.MaxPool1d(kernel_size=filter_size, padding=padding)
    return pooling_layer

def activation():
    """ Returns the activation layer used by the network (a ReLU) """
    relu = nn.ReLU()
    return relu

def dataloading(path, filename, filetype="CSV"):
    """ Returns a pandas dataframe from a data file
        Args:
            - path: a Path() object (or a string) pointing to the directory
            containing the data
            - filename: a file name in CSV format located in the directory pointed
            to by path
            - filetype: specify the type of file to read from (at the moment, the
            only possibility is "CSV"; the check is case-insensitive)
        Raises:
            - ValueError: if filetype is anything other than CSV
    """

    # Fail loudly instead of returning False: a False return value would only
    # blow up later, far away from the real cause.
    if str(filetype).upper() != "CSV":
        raise ValueError(
            'Only CSV file are supported at the moment (got filetype=%r).' % (filetype,)
        )

    # Path(path) is a no-op for Path objects and lets plain strings work too
    file = Path(path) / filename
    return pd.read_csv(file)

def createxandy(data, window, p_window, firstcolastimestamp=True):
    """ Returns X and y as float32 torch tensors to be used in our training and predictions
        Args:
            - data: loaded data in a pandas dataframe format
            - window: the length of each X item
            - p_window: the length of the prediction window (i.e. how many timesteps in the
            future we want to predict)
            - firstcolastimestamp: whether our first column consists of timestamp (default to True)
        Returns:
            - X: tensor of shape (n_samples, window, n_variables)
            - y: tensor of shape (n_samples, p_window, n_variables), each item
            immediately following the corresponding X item
            where n_samples = len(data) + 1 - window - p_window (0 when the data
            is too short to build a single sample)
        Raises:
            - ValueError: if window or p_window is smaller than 1
    """

    if window < 1 or p_window < 1:
        raise ValueError(
            "window and p_window must be >= 1 (got window=%r, p_window=%r)"
            % (window, p_window)
        )

    if firstcolastimestamp:
        values = data.iloc[:, 1:].values
    else:
        values = data.values

    # Convert the whole series once; building a tensor from a Python list of
    # numpy arrays (one per window) is much slower.
    series = torch.tensor(values, dtype=torch.float32)

    # Total number of (X, y) pairs is N + 1 - window - p_window
    n_samples = series.shape[0] + 1 - window - p_window

    if n_samples <= 0:
        # Not enough rows for a single sample: return empty tensors that still
        # carry the expected trailing dimensions.
        trailing = tuple(series.shape[1:])
        return (series.new_empty((0, window) + trailing),
                series.new_empty((0, p_window) + trailing))

    X = torch.stack([series[i:i + window] for i in range(n_samples)])
    y = torch.stack([series[i + window:i + window + p_window] for i in range(n_samples)])

    return X, y

def createdatasets(X, y, split=(0.7, 0.3, 0.0)):
    """ Returns a list of Dataset objects (for training, validation and testing)
        Args:
            - X: tensor containing independent variables
            - y: tensor containing dependent variables
            - split: sequence of 3 numbers containing the split between training,
            validation and testing datasets (the total should add to 1.0)
        Returns:
            - [train_ds, valid_ds, test_ds] where valid_ds (resp. test_ds) is
            False when no validation (resp. testing) share is requested
        Raises:
            - ValueError: if the split does not add up to 1.0
    """

    tolerance = 1e-10

    # abs() matters here: without it any split summing to LESS than 1.0
    # (e.g. [0.1, 0.1, 0.1]) would be silently accepted.
    if abs(sum(split) - 1.0) >= tolerance:
        raise ValueError("split must add up to 1.0 (got %r)" % (list(split),))

    train_size, valid_size, _ = split

    n_samples = len(X)
    trainidx = int(n_samples * train_size)
    valididx = int(n_samples * (train_size + valid_size))

    train_ds = TensorDataset(X[:trainidx], y[:trainidx])

    if valid_size == 0.0:
        valid_ds = False
    else:
        valid_ds = TensorDataset(X[trainidx:valididx], y[trainidx:valididx])

    # Compare with a tolerance: train_size + valid_size may miss 1.0 by a
    # rounding error even when no testing share was requested.
    if abs(train_size + valid_size - 1.0) < tolerance:
        test_ds = False
    else:
        test_ds = TensorDataset(X[valididx:], y[valididx:])

    return [train_ds, valid_ds, test_ds]

def createdataloaders(datasets, bs=64, shuffle_train=False):
    """ Create the dataloaders for all the datasets and returns a list of
        dataloaders and/or False when no dataloaders can be created for a given
        dataset (for example when dataset is empty)
        Args:
            - datasets: a list of Dataset objects in this order: training,
            validation, testing (False can be used for a missing validation
            or testing dataset)
            - bs: batch size (64 by default)
            - shuffle_train: whether to reshuffle the training samples at every
            epoch (False by default)
        Returns:
            - [train_dl, valid_dl, test_dl]
    """

    # TODO: ensure datasets is of the correct object type

    train_ds, valid_ds, test_ds = datasets[0], datasets[1], datasets[2]

    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=shuffle_train)

    # Validation / testing loaders are never shuffled: shuffling brings nothing
    # to evaluation and would make the order of the predictions differ from the
    # order of the samples in the dataset.
    valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False) if valid_ds else False
    test_dl = DataLoader(test_ds, batch_size=bs, shuffle=False) if test_ds else False

    return [train_dl, valid_dl, test_dl]

In [2]:
# Layer hyper-parameters
n_filters, kernel_size, padding = 32, 3, 1

# Location of the data and size of the input / prediction windows
path, filename = Path('Data'), 'data2.csv'
window, p_window = 10, 3

# Load the file, cut it into (X, y) windows, split it and wrap it in dataloaders
data = dataloading(path, filename)
X, y = createxandy(data, window, p_window, firstcolastimestamp=True)
datasets = createdatasets(X, y, split=[0.7,0.3,0.0])
dataloaders = createdataloaders(datasets, bs=6)

In [3]:
# get the first batch of the training dataloader (dataloaders[0])
Xb, yb = next(iter(dataloaders[0]))

In [4]:
# Let's define the layers one by one
# 2 is the number of variables in the time series (used as the number of input
# channels of the first convolution)
# Note: the pooling layers reuse kernel_size as the size of their pooling window

conv1 = conv(2, n_filters, kernel_size)
pool1 = maxpool(kernel_size, padding)
conv2 = conv(n_filters, n_filters, kernel_size)
pool2 = maxpool(kernel_size, padding)

In [5]:
# Shape of the first batch: (batch size, window, number of variables)
Xb.shape

torch.Size([6, 10, 2])

In [8]:
# Now let's run the first batch through the different layers.
# We need to permute the dimensions since we consider the number of variables in the
# time series to be the number of channels: (batch, window, variables) becomes
# (batch, variables, window)
# TODO: try something different by only considering one channel. In this case, we need to "add"
# a dimension to the time series (through torch.unsqueeze())
resconv1 = conv1(Xb.permute(0,2,1))
resconv1.shape

torch.Size([6, 32, 10])

In [9]:
# Max pooling over the output of the first convolution. No stride is passed to
# nn.MaxPool1d in maxpool(), so the stride defaults to the size of the pooling window
respool1 = pool1(resconv1)
respool1.shape

torch.Size([6, 32, 4])

In [10]:
# Second convolution, applied to the pooled output of the first one
resconv2 = conv2(respool1)
resconv2.shape

torch.Size([6, 32, 4])

In [11]:
# Max pooling over the output of the second convolution; display the shape of
# the pooled result (respool2), not the shape of its input (resconv2).
# With a pooling window of 3, a stride of 3 and a padding of 1, a length of 4
# becomes floor((4 + 2*1 - 3) / 3) + 1 = 2.
# NOTE(review): the following cells reshape resconv2, so respool2 is not used
# downstream - TODO confirm whether that is intended.
respool2 = pool2(resconv2)
respool2.shape

torch.Size([6, 32, 2])

In [13]:
# Flatten everything but the batch dimension
# NOTE(review): this flattens resconv2 (the output of conv2), not respool2
res = resconv2.view(resconv2.shape[0], -1)

In [14]:
# Shape after flattening: (batch size, flattened features)
res.shape

torch.Size([6, 128])

In [15]:
# 2 is the number of variables in the time series here
# NOTE(review): view() only splits the flattened features of each sample into 2
# equal, consecutive groups; nothing in this code establishes that each group
# corresponds to one input variable - TODO confirm
res = resconv2.view(resconv2.shape[0],2,-1)

In [16]:
# Shape after the reshape: (batch size, 2 groups, features per group)
res.shape

torch.Size([6, 2, 64])

In [22]:
# Let's try a linear layer now
# 64 is the size of the last dimension of res (see res.shape above), i.e. the
# 128 flattened features split into 2 groups; it happens to equal n_filters * 2
# The layer outputs p_window values (one per timestep to predict)

linear = nn.Linear(64, p_window)

In [23]:
# nn.Linear only acts on the last dimension; the leading dimensions are kept as is
reslin = linear(res)

In [24]:
# Shape of the linear layer output: (batch size, 2 groups, p_window)
reslin.shape

torch.Size([6, 2, 3])

In [25]:
# Does it work? At least we have the correct dimensions
# yb is (batch size, p_window, number of variables), so the last two dimensions
# of reslin have to be swapped before comparing the two
yb.shape

torch.Size([6, 3, 2])

In [26]:
# Sanity check on shapes only (not on values): once its last two dimensions are
# swapped, the output of the linear layer must have the same shape as the targets
assert reslin.permute(0,2,1).shape == yb.shape