In [1]:
import tsdb
import pickle
import logging
import numpy as np
import pandas as pd


In [2]:
# NOTE(review): `force` is not read by any cell visible here — presumably a
# switch to force re-processing of existing outputs; confirm or remove.
force = False

In [3]:
def normalize(s: np.ndarray) -> list[float]:
    """Z-normalize a series: subtract its mean and divide by its std.

    Args:
        s: Numeric NumPy array holding one series.

    Returns:
        The normalized values as a plain Python list of floats.

    Side effects:
        Prints a diagnostic line when the normalized result contains NaN or
        when the standard deviation is below 1e-5. The (possibly degenerate)
        values are still returned in that case.
    """
    std = np.std(s)
    normalized = (s - s.mean()) / std
    # NaN never compares equal to anything, so it has to be detected with
    # np.isnan rather than by looking a value up in the resulting list.
    if np.isnan(normalized).any() or std < 0.00001:
        print('NaN or std: ', std)
    return normalized.tolist()


def save(data: list[list[float]],
         to: str):
    """Pickle a dataset of series to the file ``to``.

    Args:
        data: Non-empty list of series, each a non-empty list of floats.
            Only the first series and its first value are type-checked.
        to: Destination file path; an existing file is overwritten.

    Raises:
        AssertionError: If ``data`` does not have the expected nested-list
            shape. The check runs before the file is opened, so nothing is
            written for rejected input.
    """
    assert isinstance(data, list), "data must be a list of series"
    assert data, "data must contain at least one series"
    assert isinstance(data[0], list), "each series must be a list"
    assert data[0], "series must not be empty"
    assert isinstance(data[0][0], float), "series values must be floats"
    with open(to, "wb") as f:
        pickle.dump(data, f)
        

def split(data: list[np.ndarray],
          train: float = .9) -> tuple[list[np.ndarray],
                                      list[np.ndarray]]:
    """Cut ``data`` into a leading train part and a trailing test part.

    The cut index is ``int(len(data) * train)``. Element order is kept and
    nothing is shuffled.

    Args:
        data: Sequence of series to divide.
        train: Fraction of ``data`` that goes into the first part.

    Returns:
        ``(train_part, test_part)`` as two slices of ``data``.
    """
    cut = int(len(data) * train)
    head = data[:cut]
    tail = data[cut:]
    return head, tail

## UCR Time Series Classification Repo

In [7]:
# Names of the datasets to process; each is combined with `path` and a split
# suffix to form the raw file name.
datasets = [
    "ACSF1",
    "CinCECGTorso",
    "HouseTwenty",
    "Mallat",
    "MixedShapesRegularTrain",
    "Phoneme",
    "PigArtPressure",
    "PigCVP",
    "Rock",
    "SemgHandGenderCh2",
    "SemgHandMovementCh2",
    "SemgHandSubjectCh2",
]

# Directory holding the raw .tsv files.
path = "../data/raw/UCR_TSC_raw"

In [5]:
def load_UCI_DS(s: str, path: str, split: str) -> pd.DataFrame:
    """Read one headerless, tab-separated split file of a dataset.

    Args:
        s: Dataset name.
        path: Directory containing the file.
        split: Split suffix used in the file name (the visible callers pass
            'TRAIN' or 'TEST').

    Returns:
        The content of ``{path}/{s}_{split}.tsv`` as a DataFrame with
        integer column labels.
    """
    return pd.read_csv(f"{path}/{s}_{split}.tsv", sep='\t', header=None)

In [21]:
# Load the TRAIN file of the first dataset and cast every column to float.
# Dropping column 0 and converting to lists is done per dataset in the
# processing loop further down.
train = load_UCI_DS(datasets[0], path, 'TRAIN').astype(float)


In [24]:
# Load every downloaded UCR dataset, merge its TRAIN and TEST files,
# z-normalize each series and re-split the merged pool into a train and a
# validation pickle.
# NOTE: per the original author the chosen datasets are already z-normalized;
# normalize() is applied to every series regardless.


def _load_normalized(dataset_name: str, part: str) -> list[list[float]]:
    """Load one split file of a dataset and return its normalized series."""
    frame = load_UCI_DS(dataset_name, path, part).astype(float)
    # Column 0 is excluded from the series (presumably the class label —
    # verify against the raw files). drop() returns a new frame, so the
    # loaded one is not mutated.
    values = frame.drop(columns=0).to_numpy()
    return [normalize(row) for row in values]


for dataset_name in datasets:
    train = _load_normalized(dataset_name, 'TRAIN')
    test = _load_normalized(dataset_name, 'TEST')
    # Both official splits are pooled and cut again with split()'s default
    # ratio, in file order.
    dataset_train, dataset_test = split(train + test)
    save(dataset_train,
         f"../data/processed/{dataset_name}_TRAIN.pickle")
    save(dataset_test,
         f"../data/processed/{dataset_name}_VAL.pickle")

In [132]:
from pathlib import Path

# Dataset names recovered from the processed validation pickles.
# Only "*_VAL.pickle" files are globbed, so every stem ends in "_VAL" and
# stripping that one suffix is sufficient.
# sorted() pins the order, which Path.glob otherwise leaves to the filesystem.
stems = sorted(
    p.stem.removesuffix("_VAL")
    for p in Path("../data/processed/").glob("*_VAL.pickle")
)
stems

['ACSF1',
 'CinCECGTorso',
 'HouseTwenty',
 'Mallat',
 'MixedShapesRegularTrain',
 'Phoneme',
 'PigArtPressure',
 'PigCVP',
 'Rock',
 'SemgHandGenderCh2',
 'SemgHandMovementCh2',
 'SemgHandSubjectCh2']