In [1]:
# Append the absolute path of the parent directory to sys.path so that packages located
# there can be imported from this notebook (the later `dltime` imports presumably rely on it).
import os
import sys
sys.path.append(os.path.abspath('..'))

In [2]:
# Show the last sys.path entry, i.e. the directory appended by the setup cell when the
# cells are run in order.
print(sys.path[-1])

/data/shizhaoshu/project/dltime-torch


In [4]:
import torch
import numpy as np
from tqdm import tqdm
from sktime.datasets import load_UCR_UEA_dataset
from dltime.data.tsc_dataset_names import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import Dataset, DataLoader

In [5]:
# Display the list of dataset names bound to this variable. It is not defined in this
# notebook; presumably it comes from the star import of dltime.data.tsc_dataset_names (confirm).
univariate_equal_length_for_train

['Adiac',
 'ArrowHead',
 'BME',
 'ChlorineConcentration',
 'CricketX',
 'CricketY',
 'CricketZ',
 'Crop',
 'ECG200',
 'ECG5000',
 'Wafer']

In [6]:
# Load the first dataset listed in `multivariate_equal_length` with no train/test split
# selected. `x` is a DataFrame whose cells hold one series per dimension; `y` holds the labels.
x, y = load_UCR_UEA_dataset(
    multivariate_equal_length[0],
    split=None,
    return_X_y=True,
    extract_path="./ucr_uea_archive",
)

In [5]:
def uniform_scaling(data, max_len):
    """
    Resample a sequence to exactly ``max_len`` samples by index mapping.

    Output position ``j`` takes the element of ``data`` found at index
    ``int(j * len(data) / max_len)``, so elements are repeated when the
    sequence is stretched and skipped when it is shrunk.

    :param data: sequence supporting ``len()`` and integer ``[]`` access
    :param max_len: number of samples in the result
    :return: list with ``max_len`` elements taken from ``data``
    """
    source_len = len(data)
    resampled = []
    for position in range(max_len):
        source_index = int(position * source_len / max_len)
        resampled.append(data[source_index])
    return resampled

In [6]:
def process_data(X, min_len, normalise=None):
    """
    Convert a nested DataFrame of time series into a single numpy array.

    Each row of ``X`` is one case and each cell holds the series of one
    dimension. For every case, missing values are linearly interpolated,
    dimensions shorter than the longest dimension of that case are stretched
    with ``uniform_scaling``, and the case is laid out as (length, dims).

    :param X: DataFrame whose cells are pandas Series (one per dimension)
    :param min_len: currently unused; kept so existing callers keep working
        (the truncation step that would use it is disabled)
    :param normalise: ``"standard"``, ``"minmax"`` or ``None``; when set, a
        scaler is fit on each case separately, column by column
    :return: array stacking all cases, shaped (n_cases, length, dims) when
        every case has the same length
    """
    cases = []
    for i in tqdm(range(len(X))):
        # Take the i-th case; copy so the interpolation below never touches X.
        row = X.iloc[i, :].copy(deep=True)

        # 1. Longest dimension of this case is the target length for all its dimensions.
        target_len = max(len(series) for series in row)

        # 2. Bring every dimension of the case to the same length.
        dims = []
        for series in row:
            # 2.1 Fill missing values by linear interpolation (both directions).
            if series.isnull().any():
                series = series.interpolate(method='linear', limit_direction='both')

            # 2.2 Stretch dimensions that are shorter than the longest one.
            if len(series) < target_len:
                series = uniform_scaling(series, target_len)
            dims.append(series)
        case = np.array(np.transpose(dims))

        # 3. Optional normalisation; the two modes are mutually exclusive.
        if normalise == "standard":
            case = StandardScaler().fit(case).transform(case)
        elif normalise == "minmax":
            case = MinMaxScaler().fit(case).transform(case)

        cases.append(case)
    return np.array(cases)

In [7]:
# Convert the loaded DataFrame to a numpy array; `normalise` is left at its default (None),
# so no scaling is applied.
x_ = process_data(x, min_len=None)

100%|██████████| 575/575 [00:00<00:00, 838.86it/s]


In [8]:
# Shape of the converted array: (number of cases, series length, number of dimensions).
x_.shape

(575, 144, 9)

In [9]:
def dataframe2ndarray(X):
    """Convert one case (not the whole dataset) into a (length, dims) array.

    ``X`` is iterated dimension by dimension. Missing values in a dimension
    are filled by linear interpolation, and any dimension shorter than the
    longest one is stretched with ``uniform_scaling`` before stacking.
    """
    # 1. The longest dimension defines the common length.
    target_len = max([len(series) for series in X])

    # 2. Bring every dimension to that length.
    columns = []
    for series in X:
        # 2.1 Interpolate missing values, if there are any.
        has_gaps = series.isnull().any()
        if has_gaps:
            series = series.interpolate(method='linear', limit_direction='both')

        # 2.2 Stretch dimensions that are too short; keep the others as they are.
        columns.append(series if len(series) >= target_len else uniform_scaling(series, target_len))

    stacked = np.transpose(columns)
    return np.array(stacked)

def get_max_seq_len(data_df):
    """Return the longest series length found anywhere in a dataset.

    Every row (case) and every cell (dimension) of ``data_df`` is inspected.

    :param data_df: DataFrame whose cells are sized sequences (e.g. pandas Series)
    :return: the maximum ``len`` over all cells, or 0 when there are no cells
    """
    max_seq_len = 0
    for i in range(len(data_df)):
        row = data_df.iloc[i, :]
        # default=0 keeps a row without columns from raising on an empty max().
        max_seq_len = max(max_seq_len, max((len(x) for x in row), default=0))
    # Return only after all rows were scanned; returning inside the loop would
    # report the length of the first row alone.
    return max_seq_len


In [10]:
class tsMinMaxNormlizer:
    """Min-max normaliser for datasets stored as a DataFrame of series.

    ``fit`` collects one global minimum and maximum per dimension (column)
    over all cases; ``transform`` rescales a single (seq_len, dim) array so
    that each dimension's fitted range maps onto ``scale``.
    """

    def __init__(self, scale=(0, 1)):
        # (low, high) bounds of the output range.
        self.scale = scale

    def fit(self, X):
        """Compute per-dimension minimum and maximum.

        :param X: DataFrame whose cells are series; one column per dimension
        :return: ``self``, so calls can be chained
        """
        self.data_max_ = []
        self.data_min_ = []
        for dim in X.columns:
            values = np.concatenate([np.asarray(series, dtype=float) for series in X[dim]])
            # NaN-aware reductions: the built-in max()/min() give results that
            # depend on where a NaN happens to sit in the data.
            self.data_max_.append(np.nanmax(values))
            self.data_min_.append(np.nanmin(values))
        return self

    def transform(self, x):
        """Rescale one case.

        :param x: numpy array of shape (seq_len, dim)
        :return: array of the same shape with each dimension mapped to ``scale``
        """
        low, high = self.scale[0], self.scale[1]
        result = []
        for i in range(x.shape[-1]):
            span = self.data_max_[i] - self.data_min_[i]
            if span == 0:
                # Constant dimension: avoid 0/0 and map it to the lower bound.
                span = 1.0
            _x = (x[:, i] - self.data_min_[i]) / span
            _x = low + _x * (high - low)
            result.append(_x[:, np.newaxis])

        return np.concatenate(result, axis=-1)

In [11]:
class UCR_UEADataset(Dataset):
    """Torch dataset over one problem of the UCR/UEA archive.

    Each item is a dict with:
      * ``"input"``: float tensor of shape (max_len, dims) made of a leading
        all-ones [CLS] row, the min-max normalised series, and zero padding;
      * ``"padding_mask"``: long tensor of length max_len, 0 for the [CLS] row
        and the real time steps, 1 for padded steps;
      * ``"label"`` (only when ``return_y`` is true): the integer class id.

    NOTE(review): ``normalize`` and ``mask`` are validated/stored but not used
    by the code in this class; min-max scaling to (0.05, 0.95) is always applied.
    """

    def __init__(self, name, split=None, extract_path="ucr_uea_archive", max_len=256, return_y=True, mask=False,
                 normalize=None):
        assert split in ["train", "test", None]
        assert normalize in ["standard", "minmax", None]

        super().__init__()
        self.return_y = return_y
        self.mask = mask
        self.normalize = normalize

        self.data, self.label = load_UCR_UEA_dataset(
            name, split=split, return_X_y=True, extract_path=extract_path
        )

        # Leave room for the [CLS] row on top of the longest series.
        self.max_len = max(max_len, get_max_seq_len(self.data) + 1)
        self.normalizer = tsMinMaxNormlizer(scale=(0.05, 0.95))
        self.normalizer.fit(self.data)

        # Label handling: keep the raw names and derive integer ids from them.
        self.label = np.array(self.label)
        self.label2y, self.y2label, self.y = self._encode_labels(self.label)

    @staticmethod
    def _encode_labels(labels):
        """Build the name->id mapping, the id->name list and the id of every sample."""
        label2y = {label_name: i for i, label_name in enumerate(np.unique(labels))}
        # Index i holds the name of class i (the keys, not the integer values).
        y2label = list(label2y.keys())
        y = [label2y[label] for label in labels]
        return label2y, y2label, y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        X = self.data.iloc[idx].copy(deep=True)
        X = dataframe2ndarray(X)    # one case -> (length, dims) numpy array

        # Normalise each dimension with the statistics fitted on the whole dataset.
        X = self.normalizer.transform(X)
        padding_mask = [0] + [0] * X.shape[0] + [1] * (self.max_len - X.shape[0] - 1)

        cls = np.ones((1, X.shape[-1])) # [CLS]
        pad = np.zeros((self.max_len - X.shape[0] - 1, X.shape[-1])) # [PAD]
        X = np.concatenate([cls, X, pad], axis=0)
        item = {"input": torch.from_numpy(X).float(), "padding_mask": torch.LongTensor(padding_mask)}

        if self.return_y:
            item["label"] = torch.tensor(self.y[idx]).long()

        return item


In [12]:
# Build the dataset for the first name in `multivariate_equal_length`, with every other
# constructor argument left at its default (split=None, max_len=256, return_y=True).
dataset = UCR_UEADataset(multivariate_equal_length[0])

In [13]:
# Inspect the padding mask of the first item: 0 for the [CLS] row and real time steps,
# 1 for the padded tail.
dataset[0]['padding_mask']

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
# Size of the first item's input tensor: (max_len, number of dimensions).
dataset[0]['input'].size()

torch.Size([256, 9])

In [15]:
# Serve the dataset in shuffled batches of 32 items.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [16]:
from dltime.models.ts_transformer import TSTransformerEncoderClassifier
from dltime.classifiers import FCN

In [17]:
# Size the classifier from the dataset: feature width from the first item's input,
# sequence length from dataset.max_len, one output class per entry of dataset.y2label.
model = TSTransformerEncoderClassifier(
    feat_dim=dataset[0]['input'].shape[-1],
    max_len=dataset.max_len,
    d_model=128,
    n_heads=4,
    num_layers=2,
    dim_feedforward=512,
    num_classes=len(dataset.y2label),
)

In [18]:
# Smoke test of the forward pass: print each batch's input size and padding mask, then feed
# both to the model. `outputs` is overwritten on every iteration and not used in this cell.
for i, item in enumerate(dataloader):
    print(item['input'].size())
    print(item['padding_mask'])
    outputs = model(item['input'], item['padding_mask'])

torch.Size([32, 256, 9])
tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])
