In [1]:
import sys
# Append the parent directory to the module search path.
# NOTE(review): presumably this lets the local `serieset` package be imported without installing it — confirm.
sys.path.append("../")

### import packages

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

from serieset import TimeSeriesDataset

### load dataset file

In [3]:
# Read the ETTh1 CSV into a DataFrame; the relative path is resolved against the notebook's working directory.
data = pd.read_csv("./data/ETTh1.csv")

### process raw dataset: add group_id column (required)

In [4]:
# Assigning a scalar fills every row with the same label, so the whole file forms a single group.
data["group_id"] = "group_id"

In [5]:
# Preview the first rows, then report the smallest and largest values of the 'date' column.
print(data.head())
dates = data['date']
print(f"minimum date: {dates.min()}")
print(f"maximum date: {dates.max()}")

                  date   HUFL   HULL   MUFL   MULL   LUFL   LULL         OT  \
0  2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.340  30.531000   
1  2016-07-01 01:00:00  5.693  2.076  1.492  0.426  4.142  1.371  27.787001   
2  2016-07-01 02:00:00  5.157  1.741  1.279  0.355  3.777  1.218  27.787001   
3  2016-07-01 03:00:00  5.090  1.942  1.279  0.391  3.807  1.279  25.044001   
4  2016-07-01 04:00:00  5.358  1.942  1.492  0.462  3.868  1.279  21.948000   

   group_id  
0  group_id  
1  group_id  
2  group_id  
3  group_id  
4  group_id  
minimum date: 2016-07-01 00:00:00
maximum date: 2018-06-26 19:00:00


### here are some arguments for TimeSeriesDataset

In [6]:
# Keyword arguments that are unpacked into TimeSeriesDataset alongside the DataFrame.
params = dict(
    target_col='OT',            # column that supplies 'x' and 'y'
    features=["HUFL", "HULL"],  # columns that supply 'x_feats'
    group_id='group_id',        # column holding the series group label
    date_col='date',            # timestamp column
    inp_len=36,                 # number of time steps in each input window
    pred_len=12,                # number of time steps in each target window
    # NOTE(review): presumably the boundary between training and validation samples — confirm.
    train_val_split_date='2018-01-01 00:00:00',
    mode='train',
)

### create TimeSeriesDataset instance

In [7]:
# Instantiate the dataset: the frame is passed explicitly, every other option is unpacked from `params`.
torch_dataset = TimeSeriesDataset(data=data, **params)

### each item of TimeSeriesDataset is a dictionary with keys ['x', 'x_feats', 'y', 'group_id', 'sample_id']

In [8]:
# Indexing the dataset returns one sample as a dict; list the fields it carries.
print(f"output of dataset is: {torch_dataset[0].keys()}")

output of dataset is: dict_keys(['x', 'x_feats', 'y', 'group_id', 'sample_id'])


### 'x' is input time series

In [9]:
# The length printed here matches params['inp_len'] (36).
print(f"shape of x: {torch_dataset[0]['x'].shape}")

shape of x: torch.Size([36])


In [10]:
# check correctness: the first sample's 'x' must equal the first `inp_len` values of the
# target column, cast to float32 like the tensor
expected_x = data["OT"].iloc[:params['inp_len']].astype(np.float32)
not_equal = (torch_dataset[0]['x'].numpy() != expected_x).sum()
print(f"first sample not equal: {not_equal}")
# Fail loudly on a mismatch instead of relying on the reader to inspect the printed count.
assert not_equal == 0, "first sample 'x' differs from the raw 'OT' column"

first sample not equal: 0


### 'x_feats' holds the input covariates

In [11]:
# The shape printed here matches (params['inp_len'], len(params['features'])), i.e. (36, 2).
print(f"shape of x_feats: {torch_dataset[0]['x_feats'].shape}")

shape of x_feats: torch.Size([36, 2])


In [12]:
# check correctness: the first sample's 'x_feats' must equal the first `inp_len` rows of the
# feature columns, cast to float32 like the tensor
expected_feats = data[params['features']].iloc[:params['inp_len']].astype(np.float32)
not_equal = (torch_dataset[0]['x_feats'].numpy() != expected_feats).sum()
print(f"first sample features not equal: {not_equal}")
# Comparing against a DataFrame yields one mismatch count per feature column, so every
# entry has to be zero. Fail loudly instead of relying on the reader to inspect the output.
assert (not_equal == 0).all(), "first sample 'x_feats' differs from the raw feature columns"

first sample features not equal: HUFL    0
HULL    0
dtype: int64


### 'y' is target time series

In [13]:
# The length printed here matches params['pred_len'] (12).
print(f"shape of y: {torch_dataset[0]['y'].shape}")

shape of y: torch.Size([12])


In [14]:
# check correctness: the first sample's 'y' must equal the `pred_len` target values that
# directly follow the input window, cast to float32 like the tensor
window_end = params['inp_len'] + params['pred_len']
expected_y = data["OT"].iloc[params['inp_len']:window_end].astype(np.float32)
not_equal = (torch_dataset[0]['y'].numpy() != expected_y).sum()
print(f"first sample not equal: {not_equal}")
# Fail loudly on a mismatch instead of relying on the reader to inspect the printed count.
assert not_equal == 0, "first sample 'y' differs from the raw 'OT' column"

first sample not equal: 0


### 'group_id' is time series group id

In [15]:
# The sample carries an integer code rather than the original label;
# torch_dataset.group_id_map translates the code back to the label.
print(f"group id: {torch_dataset[0]['group_id']}")

group id: 0


### each training / validation sample has a 'sample_id'

In [16]:
# torch_dataset.index has a 'sample_id' column as well.
# NOTE(review): presumably this id identifies the matching row of that table — confirm.
print(f"sample id: {torch_dataset[0]['sample_id']}")

sample id: 0


### furthermore, you can inspect the sample index manually

In [17]:
# Table with one row per sample: index_start / index_end, group code, predict_start_date, sample_id.
# In the output shown, index_end - index_start + 1 == inp_len + pred_len (48), and
# predict_start_date is inp_len hours after the first timestamp of the window.
torch_dataset.index

Unnamed: 0,index_start,index_end,group_id,predict_start_date,sample_id
0,0,47,0,2016-07-02 12:00:00,0
1,1,48,0,2016-07-02 13:00:00,1
2,2,49,0,2016-07-02 14:00:00,2
3,3,50,0,2016-07-02 15:00:00,3
4,4,51,0,2016-07-02 16:00:00,4
...,...,...,...,...,...
13135,13135,13182,0,2017-12-31 19:00:00,13135
13136,13136,13183,0,2017-12-31 20:00:00,13136
13137,13137,13184,0,2017-12-31 21:00:00,13137
13138,13138,13185,0,2017-12-31 22:00:00,13138


### dataloader

In [18]:
# Shuffle with an explicitly seeded generator so that "Restart & Run All" draws the same
# batches on every run; an unseeded shuffle makes the outputs below irreproducible.
SHUFFLE_SEED = 0
dl = DataLoader(
    torch_dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

In [22]:
# Draw a single batch; in the output shown, 'x' has 2 rows (batch_size) of 36 values each.
next(iter(dl))

{'x': tensor([[14.3510, 14.5620, 14.1400, 14.3510, 13.9990, 13.7880, 13.5070, 13.3660,
          13.0850, 14.3510, 13.6470, 14.0690, 13.6470, 12.9440, 12.4510, 12.3810,
          12.1700, 12.1000, 11.8180, 11.5370, 11.3960, 11.6070, 11.1850, 11.1850,
          10.9740, 11.0440, 11.0440, 10.8330, 11.1850, 11.3960, 11.3960, 11.0440,
          11.1850, 11.3260, 11.2560, 11.5370],
         [36.0180, 36.2290, 36.5100, 36.7210, 36.5800, 37.7060, 37.7060, 37.6360,
          37.0030, 38.5500, 38.0580, 37.0030, 37.0030, 29.8270, 32.8520, 36.1580,
          35.8070, 37.7760, 39.5350, 38.2690, 34.7510, 33.7670, 32.8520, 33.1330,
          34.3290, 35.3850, 34.5400, 32.7110, 33.9070, 34.6810, 35.6660, 34.4700,
          35.3140, 33.8370, 35.8770, 35.5250]]),
 'x_feats': tensor([[[12.8600,  1.2730],
          [11.8550,  0.4690],
          [11.1190, -0.2010],
          [11.7880,  0.0670],
          [10.7840,  0.6700],
          [ 8.6400,  0.6700],
          [ 8.1720,  0.0670],
          [ 8.2390,  0

In [26]:
# A DataLoader can also be consumed with a plain loop; print only the first batch, then stop.
for itr in dl:
    print(itr)
    break

{'x': tensor([[ 3.2360,  2.9550,  0.9850,  0.7740,  0.0000,  0.9850,  0.0000,  0.6330,
          0.0000, -0.6330,  2.1810,  2.4620,  3.0950,  5.2760,  5.3460,  4.2910,
          5.3460,  4.7130,  5.9790,  4.5020,  6.6130,  6.8240,  6.9640,  4.9950,
          5.6980,  6.6130,  6.4020,  3.7280,  3.1660,  5.0650,  5.2760,  3.7280,
          5.3460,  6.8240,  7.0350,  6.6130],
        [ 9.4260,  9.3560,  8.5120,  8.8640,  8.3010,  9.0040,  5.0650,  7.1050,
          5.5570,  5.9090,  4.2210,  4.6430,  6.1200,  7.3860,  3.1660,  3.2360,
          3.3060,  3.2360,  4.0100,  4.0100,  6.7530,  6.7530,  6.7530,  6.7530,
          6.7530,  6.7530,  6.7530,  6.7530,  6.7530,  6.7530,  6.7530,  6.7530,
          6.7530,  6.7530,  6.7530,  6.7530]]), 'x_feats': tensor([[[ 7.1670, -1.6080],
         [ 6.7650, -1.7410],
         [ 6.6310, -1.5410],
         [ 6.7650, -1.7410],
         [ 7.2340, -1.6740],
         [ 9.2430, -1.4070],
         [11.3870, -0.9380],
         [10.5160, -0.7370],
         

In [21]:
# Mapping from the integer group code stored in each sample to the original group_id label.
torch_dataset.group_id_map

{0: 'group_id'}