In [1]:
import sys

# Make the parent directory importable so the local `serieset` package can be
# found when the notebook runs from its own folder. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
if "../" not in sys.path:
    sys.path.append("../")

### import packages

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from serieset import TimeSeriesDataset

### load dataset file

In [3]:
# Read the ETTh1 CSV from the local data directory into a DataFrame.
csv_path = "./data/ETTh1.csv"
data = pd.read_csv(csv_path)

### process raw dataset: add group_id column (required)

In [4]:
# Tag every row with the same constant label so the whole file forms one
# group; `assign` rebinds `data` to a frame carrying the extra column.
data = data.assign(group_id="group_id")

In [5]:
# Preview the first rows, then report the first and last timestamps.
# NOTE(review): the CSV was read without date parsing, so min/max presumably
# compare strings; that matches time order for this zero-padded format - confirm.
print(data.head())
dates = data['date']
print(f"minimum date: {dates.min()}")
print(f"maximum date: {dates.max()}")

                  date   HUFL   HULL   MUFL   MULL   LUFL   LULL         OT  \
0  2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.340  30.531000   
1  2016-07-01 01:00:00  5.693  2.076  1.492  0.426  4.142  1.371  27.787001   
2  2016-07-01 02:00:00  5.157  1.741  1.279  0.355  3.777  1.218  27.787001   
3  2016-07-01 03:00:00  5.090  1.942  1.279  0.391  3.807  1.279  25.044001   
4  2016-07-01 04:00:00  5.358  1.942  1.492  0.462  3.868  1.279  21.948000   

   group_id  
0  group_id  
1  group_id  
2  group_id  
3  group_id  
4  group_id  
minimum date: 2016-07-01 00:00:00
maximum date: 2018-06-26 19:00:00


### here are some arguments for TimeSeriesDataset

In [6]:
# Keyword arguments forwarded to TimeSeriesDataset (everything except the
# DataFrame itself, which is passed separately).
params = dict(
    target_col='OT',              # column used for 'x' and 'y'
    features=["HUFL", "HULL"],    # covariate columns returned as 'x_feats'
    group_id='group_id',          # column naming the series each row belongs to
    date_col='date',              # timestamp column
    inp_len=36,                   # number of input steps per sample
    pred_len=12,                  # number of target steps per sample
    # alternative: split at an explicit timestamp instead of 'last'
    # train_val_split_date='2018-01-01 00:00:00',
    train_val_split_date='last',
    mode='train',
)

### create TimeSeriesDataset instance

In [7]:
# Build the dataset: the frame goes in as `data`, and every other setting
# comes from the `params` dictionary.
dataset_kwargs = {"data": data, **params}
torch_dataset = TimeSeriesDataset(**dataset_kwargs)

### each TimeSeriesDataset sample is a dictionary with keys ['x', 'x_feats', 'y', 'group_id', 'sample_id']

In [8]:
# Fetch the first sample and list the fields it exposes.
first_sample = torch_dataset[0]
print(f"output of dataset is: {first_sample.keys()}")

output of dataset is: dict_keys(['x', 'x_feats', 'y', 'group_id', 'sample_id'])


### 'x' is input time series

In [9]:
# Shape of the input series of the first sample.
x_shape = torch_dataset[0]['x'].shape
print(f"shape of x: {x_shape}")

shape of x: torch.Size([36])


In [10]:
# check correctness: the first sample's x should equal the first inp_len
# values of the OT column (cast to float32 for the comparison)
x_actual = torch_dataset[0]['x'].numpy()
x_expected = data["OT"][:params['inp_len']].astype(np.float32)
not_equal = (x_actual != x_expected).sum()  # count of mismatching positions
print(f"first sample not equal: {not_equal}")

first sample not equal: 0


### 'x_feats' is input covariates

In [11]:
# Shape of the covariate block of the first sample.
x_feats_shape = torch_dataset[0]['x_feats'].shape
print(f"shape of x_feats: {x_feats_shape}")

shape of x_feats: torch.Size([36, 2])


In [12]:
# check correctness: the first sample's covariates should equal the first
# inp_len rows of the feature columns (cast to float32 for the comparison)
feats_actual = torch_dataset[0]['x_feats'].numpy()
feats_expected = data[params['features']][:params['inp_len']].astype(np.float32)
# Comparing against a DataFrame gives a DataFrame, so .sum() reports one
# mismatch count per feature column rather than a single number.
not_equal = (feats_actual != feats_expected).sum()
print(f"first sample features not equal: {not_equal}")

first sample features not equal: HUFL    0
HULL    0
dtype: int64


### 'y' is target time series

In [13]:
# Shape of the target series of the first sample.
y_shape = torch_dataset[0]['y'].shape
print(f"shape of y: {y_shape}")

shape of y: torch.Size([12])


In [14]:
# check correctness: the first sample's y should equal the pred_len values of
# the OT column that immediately follow its input window
window_start = params['inp_len']
window_stop = window_start + params['pred_len']
y_actual = torch_dataset[0]['y'].numpy()
y_expected = data["OT"][window_start:window_stop].astype(np.float32)
not_equal = (y_actual != y_expected).sum()  # count of mismatching positions
print(f"first sample not equal: {not_equal}")

first sample not equal: 0


### 'group_id' is time series group id

In [15]:
# Group identifier attached to the first sample.
first_group_id = torch_dataset[0]['group_id']
print(f"group id: {first_group_id}")

group id: 0


### each training / validation sample has a 'sample_id'

In [16]:
# Sample identifier attached to the first sample.
first_sample_id = torch_dataset[0]['sample_id']
print(f"sample id: {first_sample_id}")

sample id: 0


### furthermore, you can inspect the sample index manually

In [17]:
# Display the dataset's sample index table. In the output below each row lists
# index_start, index_end, group_id, predict_start_date and sample_id, and
# index_end - index_start + 1 is 48, i.e. inp_len + pred_len.
torch_dataset.index

Unnamed: 0,index_start,index_end,group_id,predict_start_date,sample_id
0,0,47,0,2016-07-02 12:00:00,0
1,1,48,0,2016-07-02 13:00:00,1
2,2,49,0,2016-07-02 14:00:00,2
3,3,50,0,2016-07-02 15:00:00,3
4,4,51,0,2016-07-02 16:00:00,4
...,...,...,...,...,...
17367,17367,17414,0,2018-06-26 03:00:00,17367
17368,17368,17415,0,2018-06-26 04:00:00,17368
17369,17369,17416,0,2018-06-26 05:00:00,17369
17370,17370,17417,0,2018-06-26 06:00:00,17370


### dataloader

In [18]:
# Batch the dataset: 32 samples per batch, drawn in shuffled order.
# A seeded generator drives the shuffle so that "Restart & Run All" produces
# the same sequence of batches every time; without it the order depends on
# torch's global RNG state and the outputs below cannot be reproduced.
dl = DataLoader(
    torch_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)

In [19]:
# Pull a single batch from a fresh iterator and display it. The loader was
# created with shuffle=True, so the samples in the batch come in shuffled order.
next(iter(dl))

{'x': tensor([[ 9.7780,  9.9890, 10.2710,  ...,  7.0350,  7.6680,  8.7230],
         [18.6420, 18.9230, 15.5470,  ..., 16.4610, 15.6870, 12.2400],
         [ 3.1660,  3.2360,  3.3060,  ..., 11.3260, 11.8180, 11.1850],
         ...,
         [34.2590, 35.3140, 34.8220,  ..., 35.5960, 35.1030, 34.8920],
         [ 9.0040,  9.2150,  9.2150,  ...,  8.3010,  8.7230,  9.4970],
         [20.4710, 19.3450, 19.0640,  ..., 20.5410, 21.1740, 21.3850]]),
 'x_feats': tensor([[[11.1860, -0.6030],
          [12.5920, -0.2010],
          [12.7260, -0.4690],
          ...,
          [11.1190,  0.4020],
          [10.3820,  0.2680],
          [11.2530,  0.1340]],
 
         [[ 4.6220,  0.5360],
          [ 6.2290, -0.4020],
          [ 6.6980, -0.4690],
          ...,
          [ 6.7650,  0.0000],
          [ 7.1670,  0.8040],
          [ 6.8320,  0.0000]],
 
         [[ 7.3680, -1.2060],
          [ 6.9660, -1.3400],
          [ 6.2960, -1.4740],
          ...,
          [13.3290,  2.8800],
          [

In [20]:
# Print only the first batch the loader yields; nothing is printed if the
# loader is empty.
first_batch = next(iter(dl), None)
if first_batch is not None:
    print(first_batch)

{'x': tensor([[24.1990, 24.0590, 25.2550,  ..., 21.7370, 21.8780, 21.8080],
        [16.2500, 17.3050, 17.4460,  ..., 19.2750, 18.0090, 17.5160],
        [10.6930, 10.2710, 10.0600,  ...,  7.8090,  7.7380,  8.5820],
        ...,
        [12.5220, 14.2800, 13.4360,  ...,  7.6680,  7.7380,  7.5970],
        [ 4.0800,  3.7280,  3.7280,  ...,  3.4470,  3.9390,  3.0950],
        [17.5160, 17.4460, 17.2350,  ..., 15.9690, 15.7580, 15.6870]]), 'x_feats': tensor([[[ -6.4300,   4.3540],
         [ -4.2200,   5.0230],
         [ -3.4160,   4.0860],
         ...,
         [ 19.1560,   1.0720],
         [ 19.1560,   2.4780],
         [ 18.2190,   2.4110]],

        [[ -8.3720,   5.6930],
         [-10.7170,   6.0280],
         [-13.5300,   5.4920],
         ...,
         [  6.7650,   0.4690],
         [  7.5020,   0.2680],
         [  7.1000,   0.0000]],

        [[ 12.2570,   4.7560],
         [ 11.4540,   3.3490],
         [ 10.8510,   3.2820],
         ...,
         [  2.0090,   3.5500],
      

In [21]:
# Display the mapping from integer group ids to the original group_id labels.
# NOTE(review): presumably the key matches the 'group_id' value returned in
# each sample (0 above) - confirm against the serieset implementation.
torch_dataset.group_id_map

{0: 'group_id'}