In [3]:
import tsdb
import pickle
import logging
import numpy as np


In [4]:
force = False

In [5]:
def normalize(s: np.ndarray) -> list[float]:
    n = ((s - s.mean()) / np.std(s)).tolist()
    if (n.count('Nan') > 0 or np.std(s) < 0.00001):
        print('NaN or std: ', np.std(s))
    return n


def save(data: np.ndarray,
         to: str):
    assert isinstance(data, list)
    assert isinstance(data[0], list)
    assert isinstance(data[0][0], float)
    with open(to, "wb") as f:
        pickle.dump(data, f)
        

def split(data: list[np.ndarray],
          train: float = .9) -> tuple[list[np.ndarray],
                                      list[np.ndarray]]:
    """Generate a train/test split."""
    p = int(len(data) * train)
    return data[:p], data[p:]

## TSDB

In [6]:
def download_tsdb(s: str):
    tsdb.utils.logging.logger.setLevel(logging.ERROR)
    raw = tsdb.load_dataset(s)
    if s == "electricity_load_diagrams":
        raw = (raw["X"].select_dtypes(include=[np.number])
                       .values.T)
        raw = [normalize(s) for s in raw]
    return raw

In [10]:
for dataset_name in ["electricity_load_diagrams"]:
    dataset = download_tsdb(dataset_name)
    dataset_train, dataset_test = split(dataset)
    save(dataset_train,
         f"../data/processed_2/{dataset_name}_TRAIN.pickle")
    save(dataset_test,
         f"../data/processed_2/{dataset_name}_VAL.pickle")

## TSER

In [11]:
from aeon.datasets import load_regression

def download_tser(name: str) -> None:
    try:
        load_regression(name,
                        extract_path="../data/raw/tser")
    except ValueError:
        pass

for tser_dataset in ["HouseholdPowerConsumption1"]:
    download_tser(tser_dataset)

In [13]:
from pathlib import Path
import re

def loads_ts(file: str):
    lines = [line for line in open(file)
             if not (line.startswith("#") or line.startswith("@"))]
    series = list()
    for line in lines:
        channels = line.split('):(')
        for channel in channels:
            data = re.findall(r",(\d+\.\d+)\),", channel)
            data = [float(p) for p in data]
            if all(v > 0 for v in data):
                series.append(normalize(np.array(data)))
    return series

for dataset_name in Path("../data/raw/tser").glob("**/*.ts"):
    dataset = loads_ts(dataset_name)
    save(dataset,
         f"../data/processed_2/{dataset_name.stem}.pickle")

## Forecasting

In [15]:
from aeon.datasets import load_forecasting


def download_forecasting(name: str):
    data = load_forecasting(name,
                            extract_path="../data/raw/forecasting",
                            return_metadata=False)
    data = data["series_value"].values
    data = [s.to_numpy() for s in data]
    data = [s for s in data if s.sum() > 0 and len(s) > 1024]
    data = [normalize(s) for s in data]
    return data


for dataset_name in ["solar_10_minutes_dataset",
                     "london_smart_meters_dataset_without_missing_values",
                     "australian_electricity_demand_dataset",
                     "wind_farms_minutely_dataset_without_missing_values",
                     "electricity_hourly_dataset"
                     ]:
    dataset = download_forecasting(dataset_name)
    dataset_train, dataset_test = split(dataset)
    save(dataset_train,
         f"../data/processed_2/{dataset_name}_TRAIN.pickle")
    save(dataset_test,
         f"../data/processed_2/{dataset_name}_VAL.pickle")

## List

In [17]:
stems = [
   (p.stem
     .removesuffix("_TRAIN")
     .removesuffix("_TEST")) for p in Path("../data/processed_2/").glob("*_TEST.pickle")
]
stems

['australian_electricity_demand_dataset',
 'electricity_load_diagrams',
 'HouseholdPowerConsumption2',
 'london_smart_meters_dataset_without_missing_values',
 'solar_10_minutes_dataset']