In [2]:
import sys
# Append the parent directory to the module search path. Presumably this is
# what lets the local `serieset` package be imported below without installing
# it — confirm against the repository layout.
sys.path.append("../")

### import packages

In [28]:
import pandas as pd
import numpy as np
from serieset import TimeSeriesDataset

### load dataset file

In [4]:
# Read the raw ETTh1 table; the path is relative to the notebook's working directory.
etth1_csv_path = "./data/ETTh1.csv"
data = pd.read_csv(etth1_csv_path)

### process raw dataset: add group_id column (required)

In [5]:
# Tag every row with one constant group label; the column name is the one
# later passed to TimeSeriesDataset as `group_id`.
data = data.assign(group_id="ETTh1")

In [10]:
# Preview the first rows, then report the time span covered by the file.
# NOTE(review): `date` was loaded without date parsing, so min/max here compare
# the raw column values — presumably ISO-formatted strings, which sort
# chronologically; confirm if the file format changes.
print(data.head())
date_values = data['date']
print(f"minimum date: {date_values.min()}")
print(f"maximum date: {date_values.max()}")

                  date   HUFL   HULL   MUFL   MULL   LUFL   LULL         OT  \
0  2016-07-01 00:00:00  5.827  2.009  1.599  0.462  4.203  1.340  30.531000   
1  2016-07-01 01:00:00  5.693  2.076  1.492  0.426  4.142  1.371  27.787001   
2  2016-07-01 02:00:00  5.157  1.741  1.279  0.355  3.777  1.218  27.787001   
3  2016-07-01 03:00:00  5.090  1.942  1.279  0.391  3.807  1.279  25.044001   
4  2016-07-01 04:00:00  5.358  1.942  1.492  0.462  3.868  1.279  21.948000   

  group_id  
0    ETTh1  
1    ETTh1  
2    ETTh1  
3    ETTh1  
4    ETTh1  
minimum date: 2016-07-01 00:00:00
maximum date: 2018-06-26 19:00:00


### here are some arguments for TimeSeriesDataset

In [13]:
# Keyword arguments forwarded to TimeSeriesDataset (unpacked with ** when the
# dataset is built). Key order is kept as originally written.
params = dict(
    target_col='OT',                             # series compared against sample['x'] / sample['y'] below
    features=["HUFL", "HULL"],                   # covariate columns compared against sample['x_feats'] below
    group_id='group_id',                         # name of the column holding the series group label
    date_col='date',                             # name of the timestamp column
    inp_len=36,                                  # number of input steps per sample
    pred_len=12,                                 # number of target steps per sample
    train_val_split_date='2018-01-01 00:00:00',  # presumably the train/validation boundary — confirm in serieset
    mode='train',
)

### create TimeSeriesDataset instance

In [14]:
# Build the dataset from the raw frame plus the options collected in `params`.
torch_dataset = TimeSeriesDataset(data=data, **params)

### TimeSeriesDataset returns a dictionary of ['x', 'x_feats', 'y', 'group_id', 'sample_id']

In [15]:
# Index the dataset once and list the fields the returned sample exposes.
sample_keys = torch_dataset[0].keys()
print(f"output of dataset is: {sample_keys}")

output of dataset is: dict_keys(['x', 'x_feats', 'y', 'group_id', 'sample_id'])


### 'x' is input time series

In [19]:
# Shape of the input series of the first sample.
x_shape = torch_dataset[0]['x'].shape
print(f"shape of x: {x_shape}")

shape of x: torch.Size([36])


In [37]:
# check correctness: count positions where the first sample's input differs
# from the first `inp_len` raw OT values (cast to float32 before comparing)
expected_x = data["OT"][:params['inp_len']].astype(np.float32)
actual_x = torch_dataset[0]['x'].numpy()
not_equal = (actual_x != expected_x).sum()
print(f"first sample not equal: {not_equal}")

first sample not equal: 0


### 'x_feats' is input covariates

In [41]:
# Shape of the covariate block of the first sample.
x_feats_shape = torch_dataset[0]['x_feats'].shape
print(f"shape of x_feats: {x_feats_shape}")

shape of x_feats: torch.Size([36, 2])


In [42]:
# check correctness: compare the first sample's covariates with the first
# `inp_len` rows of the raw feature columns (cast to float32). The comparison
# against a DataFrame is summed per column, so `not_equal` holds one mismatch
# count per feature rather than a single scalar.
expected_feats = data[params['features']][:params['inp_len']].astype(np.float32)
actual_feats = torch_dataset[0]['x_feats'].numpy()
not_equal = (actual_feats != expected_feats).sum()
print(f"first sample features not equal: {not_equal}")

first sample features not equal: HUFL    0
HULL    0
dtype: int64


### 'y' is target time series

In [43]:
# Shape of the target series of the first sample.
y_shape = torch_dataset[0]['y'].shape
print(f"shape of y: {y_shape}")

shape of y: torch.Size([12])


In [44]:
# check correctness: the first sample's target should equal the `pred_len`
# raw OT values that immediately follow the first `inp_len` values
y_start = params['inp_len']
y_stop = y_start + params['pred_len']
expected_y = data["OT"][y_start:y_stop].astype(np.float32)
actual_y = torch_dataset[0]['y'].numpy()
not_equal = (actual_y != expected_y).sum()
print(f"first sample not equal: {not_equal}")

first sample not equal: 0


### 'group_id' is time series group id

In [46]:
# Group id carried by the first sample. The recorded output shows 0 rather than
# the "ETTh1" label assigned earlier — presumably an integer encoding applied
# by the dataset; confirm in serieset.
first_group_id = torch_dataset[0]['group_id']
print(f"group id: {first_group_id}")

group id: 0


### each training / validation sample has a 'sample_id'

In [47]:
# Identifier attached to the first sample.
first_sample_id = torch_dataset[0]['sample_id']
print(f"sample id: {first_sample_id}")

sample id: 0


### furthermore, you can inspect the dataset's sample index manually

In [48]:
# Bare last expression so the notebook renders the table. As the recorded
# output shows, each row carries index_start, index_end, group_id,
# predict_start_date and sample_id for one sample.
torch_dataset.index

Unnamed: 0,index_start,index_end,group_id,predict_start_date,sample_id
0,0,47,0,2016-07-02 12:00:00,0
1,1,48,0,2016-07-02 13:00:00,1
2,2,49,0,2016-07-02 14:00:00,2
3,3,50,0,2016-07-02 15:00:00,3
4,4,51,0,2016-07-02 16:00:00,4
...,...,...,...,...,...
13135,13135,13182,0,2017-12-31 19:00:00,13135
13136,13136,13183,0,2017-12-31 20:00:00,13136
13137,13137,13184,0,2017-12-31 21:00:00,13137
13138,13138,13185,0,2017-12-31 22:00:00,13138
