In [1]:
import joblib
import sys
import torch
import math
import numpy as np
# from tqdm.notebook import tqdm
from tqdm import tqdm
import os
import pandas as pd
from sktime.datasets import load_from_tsfile_to_dataframe

import warnings
warnings.filterwarnings("ignore")

### Prepare the UEA classification datasets

In [2]:
# Names of the UEA multivariate classification datasets to convert. Each name
# is used both as the sub-directory and as the file-name prefix below
# `dataset_root`.
dataset_names = [
    'ArticularyWordRecognition', 'AtrialFibrillation', 'BasicMotions',
    'CharacterTrajectories', 'Cricket', 'DuckDuckGeese',
    'ERing', 'EigenWorms', 'Epilepsy',
    'EthanolConcentration', 'FaceDetection', 'FingerMovements',
    'HandMovementDirection', 'Handwriting', 'Heartbeat',
    # 'InsectWingbeat',  (commented out, so this dataset is skipped)
    'JapaneseVowels', 'LSST', 'Libras',
    'MotorImagery', 'NATOPS', 'PEMS-SF',
    'PenDigits', 'PhonemeSpectra', 'RacketSports',
    'SelfRegulationSCP1', 'SelfRegulationSCP2', 'SpokenArabicDigits',
    'StandWalkJump', 'UWaveGestureLibrary',
]


# Change the dataset root to the path of your dataset
dataset_root = '/data/wanghaokai/VQShape-main/Multivariate_ts'  # where the .ts files are read from
save_root = '/data/wanghaokai/VQShape-main/uea'                 # where the per-series CSV files are written

def interpolate_uts(x, new_len):
    """
    Resample one univariate series to ``new_len`` points by linear interpolation.

    :param x: NumPy array holding the series; it is viewed as a single
        (1, 1, length) signal before interpolation.
    :param new_len: target size handed to ``torch.nn.functional.interpolate``.
    :return: torch tensor with all singleton dimensions squeezed away.
    """
    signal = torch.from_numpy(x).view(1, 1, -1)
    resampled = torch.nn.functional.interpolate(signal, new_len, mode='linear')
    return resampled.squeeze()

def dataframe_to_list(df):
    """
    Flatten a pandas DataFrame into a single list, walking it row by row.

    :param df: pandas DataFrame.
    :return: List with every cell of the DataFrame in row-major order
             (all columns of the first row, then the second row, ...).
    """
    return [cell for _, row in df.iterrows() for cell in row.tolist()]

def load_single(dataset_root, dataset, flag=None):
    """
    Load one split of a UEA dataset as normalised, fixed-length univariate series.

    Every cell of the loaded DataFrame is flattened, linearly resampled to 512
    points, cast to float32 and standardised with its own mean and variance.

    :param dataset_root: directory that contains one sub-directory per dataset.
    :param dataset: dataset name; the file read is
        ``{dataset_root}/{dataset}/{dataset}_{flag}.ts``.
    :param flag: split suffix used in the file name (the export loop below
        passes 'TRAIN' and 'TEST'). When omitted, the notebook-level variable
        ``flag`` is used, which is how this function originally obtained it.
    :return: NumPy array of shape (number of series, 512).
    :raises ValueError: if ``flag`` is neither passed nor defined at notebook level.
    """
    if flag is None:
        # Legacy call style: the split used to be read from a global variable.
        flag = globals().get('flag')
        if flag is None:
            raise ValueError("load_single() needs a split name, e.g. flag='TRAIN'")

    df, _ = load_from_tsfile_to_dataframe(f"{dataset_root}/{dataset}/{dataset}_{flag}.ts")
    data_list = dataframe_to_list(df)
    del df  # release the nested DataFrame before building the tensors
    data_list = [interpolate_uts(x.values.flatten(), new_len=512).float() for x in data_list]
    # Per-series standardisation; 1e-5 keeps the division finite for constant series.
    data_list = [(x - x.mean()) / (x.var() + 1e-5).sqrt() for x in data_list]
    return torch.stack(data_list, dim=0).numpy()


# Load the UEA datasets and write each univariate TS into a csv file
for flag in ['TRAIN', 'TEST']:
    for dataset in dataset_names:
        print(dataset, end=": ")
        save_dir = f"{save_root}/{flag}/{dataset}"
        os.makedirs(save_dir, exist_ok=True)
        # Pass the split explicitly instead of relying on the loop variable leaking in.
        data = load_single(dataset_root, dataset, flag=flag)
        print(data.shape)

        # One headerless, single-column CSV per series, named by its row index.
        for i, x in enumerate(tqdm(data)):
            pd.DataFrame(x).to_csv(f"{save_dir}/{i}.csv", index=False, header=False)

ArticularyWordRecognition: (2475, 512)


100%|██████████| 2475/2475 [00:04<00:00, 501.13it/s]


AtrialFibrillation: (30, 512)


100%|██████████| 30/30 [00:00<00:00, 497.36it/s]


BasicMotions: (240, 512)


100%|██████████| 240/240 [00:00<00:00, 508.33it/s]


CharacterTrajectories: (4266, 512)


100%|██████████| 4266/4266 [00:05<00:00, 752.99it/s] 


Cricket: (648, 512)


100%|██████████| 648/648 [00:00<00:00, 770.76it/s]


DuckDuckGeese: (67250, 512)


100%|██████████| 67250/67250 [01:39<00:00, 674.59it/s] 


ERing: (120, 512)


100%|██████████| 120/120 [00:00<00:00, 978.57it/s] 


EigenWorms: (768, 512)


100%|██████████| 768/768 [00:01<00:00, 576.14it/s]


Epilepsy: (411, 512)


100%|██████████| 411/411 [00:00<00:00, 989.22it/s] 


EthanolConcentration: (783, 512)


100%|██████████| 783/783 [00:00<00:00, 1121.92it/s]


FaceDetection: (848160, 512)


100%|██████████| 848160/848160 [22:00<00:00, 642.27it/s] 


FingerMovements: (8848, 512)


100%|██████████| 8848/8848 [00:16<00:00, 529.23it/s]


HandMovementDirection: (1600, 512)


100%|██████████| 1600/1600 [00:02<00:00, 564.69it/s]


Handwriting: (450, 512)


100%|██████████| 450/450 [00:00<00:00, 996.60it/s] 


Heartbeat: (12444, 512)


100%|██████████| 12444/12444 [00:18<00:00, 668.33it/s]


JapaneseVowels: (3240, 512)


100%|██████████| 3240/3240 [00:03<00:00, 1075.41it/s]


LSST: (14754, 512)


100%|██████████| 14754/14754 [00:24<00:00, 596.71it/s] 


Libras: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 948.80it/s] 


MotorImagery: (17792, 512)


100%|██████████| 17792/17792 [00:25<00:00, 703.54it/s] 


NATOPS: (4320, 512)


100%|██████████| 4320/4320 [00:08<00:00, 514.17it/s]


PEMS-SF: (257121, 512)


100%|██████████| 257121/257121 [05:22<00:00, 796.91it/s] 


PenDigits: (14988, 512)


100%|██████████| 14988/14988 [00:20<00:00, 744.87it/s] 


PhonemeSpectra: (36465, 512)


100%|██████████| 36465/36465 [00:46<00:00, 789.70it/s] 


RacketSports: (906, 512)


100%|██████████| 906/906 [00:00<00:00, 1000.21it/s]


SelfRegulationSCP1: (1608, 512)


100%|██████████| 1608/1608 [00:01<00:00, 1009.45it/s]


SelfRegulationSCP2: (1400, 512)


100%|██████████| 1400/1400 [00:01<00:00, 1025.34it/s]


SpokenArabicDigits: (85787, 512)


100%|██████████| 85787/85787 [01:41<00:00, 844.62it/s] 


StandWalkJump: (48, 512)


100%|██████████| 48/48 [00:00<00:00, 432.42it/s]


UWaveGestureLibrary: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 810.44it/s] 


ArticularyWordRecognition: (2700, 512)


100%|██████████| 2700/2700 [00:02<00:00, 1025.56it/s]


AtrialFibrillation: (30, 512)


100%|██████████| 30/30 [00:00<00:00, 789.49it/s]


BasicMotions: (240, 512)


100%|██████████| 240/240 [00:00<00:00, 814.19it/s] 


CharacterTrajectories: (4308, 512)


100%|██████████| 4308/4308 [00:04<00:00, 968.05it/s] 


Cricket: (432, 512)


100%|██████████| 432/432 [00:00<00:00, 1074.66it/s]


DuckDuckGeese: (67250, 512)


100%|██████████| 67250/67250 [01:10<00:00, 960.28it/s] 


ERing: (1080, 512)


100%|██████████| 1080/1080 [00:01<00:00, 1039.19it/s]


EigenWorms: (786, 512)


100%|██████████| 786/786 [00:00<00:00, 1043.45it/s]


Epilepsy: (414, 512)


100%|██████████| 414/414 [00:00<00:00, 1064.10it/s]


EthanolConcentration: (789, 512)


100%|██████████| 789/789 [00:00<00:00, 1022.21it/s]


FaceDetection: (507456, 512)


100%|██████████| 507456/507456 [13:22<00:00, 632.61it/s] 


FingerMovements: (2800, 512)


100%|██████████| 2800/2800 [00:06<00:00, 456.68it/s]


HandMovementDirection: (740, 512)


100%|██████████| 740/740 [00:01<00:00, 465.43it/s]


Handwriting: (2550, 512)


100%|██████████| 2550/2550 [00:05<00:00, 464.86it/s]


Heartbeat: (12505, 512)


100%|██████████| 12505/12505 [00:18<00:00, 672.11it/s] 


JapaneseVowels: (4440, 512)


100%|██████████| 4440/4440 [00:08<00:00, 493.98it/s]


LSST: (14796, 512)


100%|██████████| 14796/14796 [00:19<00:00, 757.18it/s] 


Libras: (360, 512)


100%|██████████| 360/360 [00:00<00:00, 1036.16it/s]


MotorImagery: (6400, 512)


100%|██████████| 6400/6400 [00:07<00:00, 903.60it/s] 


NATOPS: (4320, 512)


100%|██████████| 4320/4320 [00:04<00:00, 871.33it/s] 


PEMS-SF: (166599, 512)


100%|██████████| 166599/166599 [04:05<00:00, 677.84it/s] 


PenDigits: (6996, 512)


100%|██████████| 6996/6996 [00:07<00:00, 880.45it/s] 


PhonemeSpectra: (36883, 512)


100%|██████████| 36883/36883 [00:51<00:00, 715.33it/s] 


RacketSports: (912, 512)


100%|██████████| 912/912 [00:01<00:00, 508.93it/s]


SelfRegulationSCP1: (1758, 512)


100%|██████████| 1758/1758 [00:01<00:00, 949.31it/s] 


SelfRegulationSCP2: (1260, 512)


100%|██████████| 1260/1260 [00:01<00:00, 740.16it/s] 


SpokenArabicDigits: (28587, 512)


100%|██████████| 28587/28587 [00:37<00:00, 772.06it/s] 


StandWalkJump: (60, 512)


100%|██████████| 60/60 [00:00<00:00, 451.93it/s]


UWaveGestureLibrary: (960, 512)


100%|██████████| 960/960 [00:01<00:00, 800.29it/s]


### Prepare the Forecasting datasets

In [7]:
import pandas as pd
import glob

# List everything found under the autoformer forecasting directory.
autoformer_pattern = "../data/PILE/forecasting/autoformer/*"
glob.glob(autoformer_pattern)

['../data/PILE/forecasting/autoformer/weather.csv',
 '../data/PILE/forecasting/autoformer/ETTm2.csv',
 '../data/PILE/forecasting/autoformer/ETTh1.csv',
 '../data/PILE/forecasting/autoformer/electricity.csv',
 '../data/PILE/forecasting/autoformer/national_illness.csv',
 '../data/PILE/forecasting/autoformer/traffic.csv',
 '../data/PILE/forecasting/autoformer/exchange_rate.csv',
 '../data/PILE/forecasting/autoformer/ETTh2.csv',
 '../data/PILE/forecasting/autoformer/ETTm1.csv']

In [22]:
from tqdm import tqdm
import os
import numpy as np

# Cut the tail (test) part of the traffic data into sliding windows and write
# every window of every channel to its own headerless CSV file.
dataset = 'traffic'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/traffic/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting/TEST/{dataset}'
test_size = 0.2  # fraction of rows, taken from the end, that forms the test split
step_size = 48   # distance between consecutive window starts
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[int(data.shape[0]*(1-test_size)):, :]

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Only start a window where `l` full rows remain. NumPy slicing clips at the
    # array end, so a later start would produce a window shorter than `l` while
    # the file name still claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


512: 100%|██████████| 862/862 [00:50<00:00, 17.04it/s]


In [18]:
from tqdm import tqdm
import os
import numpy as np
import pandas as pd

# Cut the tail part of ETTh2 into sliding windows and write every window of
# every channel to its own headerless CSV file.
dataset = 'ETTh2'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/ETT-small/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting/TEST/{dataset}'

step_size = 24  # distance between consecutive window starts
seq_lengths = [512] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
# Skip the first 11520 rows and keep everything after them.
# NOTE(review): looks like 12 + 4 "months" of hourly train/val data -- confirm.
data = data[12 * 30 * 24 + 4 * 30 * 24:, :]
print(data.shape)

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Only start a window where `l` full rows remain. NumPy slicing clips at the
    # array end, so a later start would produce a window shorter than `l` while
    # the file name still claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


(5900, 7)


512: 100%|██████████| 7/7 [00:01<00:00,  3.59it/s]


In [19]:
from tqdm import tqdm
import os
import numpy as np

# Cut the tail (test) part of the national_illness data into sliding windows of
# several lengths and write every window of every channel to its own CSV file.
dataset = 'national_illness'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/illness/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting/TEST/{dataset}'
test_size = 0.2  # fraction of rows, taken from the end, that forms the test split
step_size = 12   # distance between consecutive window starts
seq_lengths = [24, 36, 48, 60]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[int(data.shape[0]*(1-test_size)):, :]

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # The valid starts depend on the window length, so they are computed per
    # `l`. NumPy slicing clips at the array end; a start closer than `l` rows
    # to the end would give a shorter window under a file name that says `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


24: 100%|██████████| 7/7 [00:00<00:00, 119.90it/s]
36: 100%|██████████| 7/7 [00:00<00:00, 128.66it/s]
48: 100%|██████████| 7/7 [00:00<00:00, 118.76it/s]
60: 100%|██████████| 7/7 [00:00<00:00, 116.63it/s]


### Prepare the UCR classification datasets


In [11]:
from sktime.datasets import load_from_ucr_tsv_to_dataframe
import os
from tqdm import tqdm
import numpy as np


def df_to_feature(df, labels):
    """
    Turn a nested DataFrame and its labels into dense NumPy arrays.

    :param df: DataFrame whose cells each expose ``.values`` (one series per
        cell); every row is one sample, every column one channel.
    :param labels: class labels, one per row of ``df``.
    :return: ``(features, labels)`` where ``features`` stacks the flattened
        cells as (samples, channels, length) and ``labels`` holds the int8
        category codes of the labels as a (samples, 1) array.
    """
    features = np.stack(
        [
            np.stack([cell.values.flatten() for cell in row.tolist()], axis=0)
            for _, row in df.iterrows()
        ],
        axis=0,
    )

    # Encode the labels as integer category codes.
    codes = pd.Series(labels, dtype="category").cat.codes
    labels = pd.DataFrame(codes, dtype=np.int8).values

    return features, labels


# Export every series of each UCR test split to its own headerless CSV file.
root = "../data/timeseries_lib/UCR_2018"
# Every non-hidden entry of the archive directory is treated as a dataset name.
datasets = sorted(name for name in os.listdir(root) if not name.startswith("."))

count = 0  # running total of CSV files written

for d in tqdm(datasets):
    save_dir = f'../data/VQShape/ucr/TEST/{d}'
    os.makedirs(save_dir, exist_ok=True)

    features, labels = df_to_feature(*load_from_ucr_tsv_to_dataframe(f"{root}/{d}/{d}_TEST.tsv"))

    for i, sample in enumerate(features):
        x = sample.flatten()
        pd.DataFrame(x).to_csv(f"{save_dir}/{i}.csv", index=False, header=False)
        count += 1

count

100%|██████████| 128/128 [01:01<00:00,  2.09it/s]


130603

In [2]:
import pandas as pd
import torch
import torch.nn as nn
# import plotly.express as px
# import plotly.graph_objects as go  # for visual comparison of multiple series
from sklearn.preprocessing import MinMaxScaler

# Moving-average module
class moving_avg(nn.Module):
    """Moving-average block: extracts the trend component of a time series.

    The input is edge-padded by repeating its first and last time step and then
    average-pooled along the time axis.

    Args:
        kernel_size: width of the averaging window.
        stride: stride of the pooling; with ``stride=1`` the output has the
            same sequence length as the input.
    """
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        """Smooth ``x`` of layout (batch, seq_len, channels) along seq_len."""
        # A stride-1 pool shortens the sequence by kernel_size - 1 steps, so
        # exactly that many steps are padded in total. For odd kernels the
        # padding is symmetric; for even kernels the extra step goes to the
        # end (symmetric (k-1)//2 padding would return seq_len - 1 steps).
        pad_front = (self.kernel_size - 1) // 2
        pad_end = self.kernel_size - 1 - pad_front
        front = x[:, 0:1, :].repeat(1, pad_front, 1)
        end = x[:, -1:, :].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)
        # AvgPool1d pools over the last axis and expects (batch, channels, seq_len).
        x = x.permute(0, 2, 1)
        x = self.avg(x)
        x = x.permute(0, 2, 1)
        return x

# Series-decomposition module
class series_decomp(nn.Module):
    """Series-decomposition block: splits a series into a residual and a trend.

    The trend is the stride-1 moving average of the input; the residual is
    what remains after subtracting that trend.
    """
    def __init__(self, kernel_size):
        super().__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(residual, trend)`` for the input ``x``."""
        trend = self.moving_avg(x)
        return x - trend, trend

In [5]:
from tqdm import tqdm
import os
import numpy as np

# Cut the head (train) part of the traffic data into sliding windows and write
# every window of every channel to its own headerless CSV file.
dataset = 'traffic'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/traffic/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting_96/TRAIN/{dataset}'
test_size = 0.2  # fraction of rows at the end that is held out; the rest is used here
step_size = 24   # distance between consecutive window starts
seq_lengths = [96] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:int(data.shape[0]*(1-test_size)), :]

# kernel_size = 13  # tune to the data's trend (e.g. 7 for a daily cycle in electricity data, 15 for a weekly cycle)
# decomp = series_decomp(kernel_size=kernel_size)

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Only start a window where `l` full rows remain. NumPy slicing clips at the
    # array end, so a later start would produce a window shorter than `l` while
    # the file name still claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


96: 100%|██████████| 862/862 [04:46<00:00,  3.00it/s]


In [6]:
from tqdm import tqdm
import os
import numpy as np

# Cut the head (train) part of the weather data into sliding windows (stride 24)
# and write every window of every channel to its own headerless CSV file.
dataset = 'weather'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/weather/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting_96/TRAIN/{dataset}'
test_size = 0.2  # fraction of rows at the end that is held out; the rest is used here
step_size = 24   # distance between consecutive window starts
seq_lengths = [96] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:int(data.shape[0]*(1-test_size)), :]

# kernel_size = 13  # tune to the data's trend (e.g. 7 for a daily cycle in electricity data, 15 for a weekly cycle)
# decomp = series_decomp(kernel_size=kernel_size)

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Only start a window where `l` full rows remain. NumPy slicing clips at the
    # array end, so a later start would produce a window shorter than `l` while
    # the file name still claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


96: 100%|██████████| 21/21 [00:21<00:00,  1.04s/it]


In [7]:
from tqdm import tqdm
import os
import numpy as np

# Cut the head (train) part of the weather data into non-overlapping windows
# (stride 96 == window length) and write every window of every channel to its
# own headerless CSV file. The target directory is the same one the stride-24
# weather export writes to.
dataset = 'weather'
df = pd.read_csv(f'/data/wanghaokai/Time-Series-Library/dataset/weather/{dataset}.csv')
save_dir = f'/data/wanghaokai/RQ-VAE-Project/forecasting_96/TRAIN/{dataset}'
test_size = 0.2  # fraction of rows at the end that is held out; the rest is used here
step_size = 96   # distance between consecutive window starts
seq_lengths = [96] # [96, 192, 336, 512]
data = df.values[:, 1:]  # drop the first column (presumably the timestamp -- confirm)
data = data[:int(data.shape[0]*(1-test_size)), :]

# kernel_size = 13  # tune to the data's trend (e.g. 7 for a daily cycle in electricity data, 15 for a weekly cycle)
# decomp = series_decomp(kernel_size=kernel_size)

os.makedirs(save_dir, exist_ok=True)

for l in seq_lengths:
    # Only start a window where `l` full rows remain. NumPy slicing clips at the
    # array end, so a later start would produce a window shorter than `l` while
    # the file name still claims length `l`.
    start_idx = np.arange(0, data.shape[0] - l + 1, step_size)
    for c in tqdm(range(data.shape[1]), desc=f'{l}'):
        for start in start_idx:
            x = data[start:start+l, c]
            pd.DataFrame(x).to_csv(f"{save_dir}/{c}_{start}_{l}.csv", index=False, header=False)


96: 100%|██████████| 21/21 [00:04<00:00,  4.62it/s]
