In [1]:
import tsdb
import pickle
import logging
import numpy as np


In [2]:
# NOTE(review): `force` is not read by any cell visible in this notebook —
# presumably a switch to force re-download/re-processing; confirm or remove.
force = False

In [3]:
def normalize(s: np.ndarray) -> list[float]:
    """Standardize a series to zero mean and unit standard deviation.

    Args:
        s: Numeric array to standardize. The mean and the (population)
            standard deviation are computed over the whole array.

    Returns:
        The standardized values as a plain Python list. An empty or constant
        array has no spread to scale by, so it is returned as all zeros
        instead of the NaN/inf values a division by zero would produce.
    """
    # Compare min and max rather than testing std == 0: rounding in the mean
    # can leave a tiny non-zero std for a constant float series.
    if s.size == 0 or s.min() == s.max():
        return np.zeros(s.shape, dtype=float).tolist()
    return ((s - s.mean()) / np.std(s)).tolist()


def save(data: list[list[float]],
         to: str):
    """Pickle a list of series (each a list of floats) to ``to``.

    Args:
        data: List of series; every series is expected to be a list of
            floats. Only the first series and its first value are
            type-checked. An empty list (e.g. an empty split) is written
            as-is.
        to: Destination file path. Missing parent directories are created.

    Raises:
        AssertionError: If ``data`` is not a list, its first element is not
            a list, or the first value of that element is not a float.
    """
    import os

    assert isinstance(data, list)
    # Guard the spot checks so empty data / an empty first series does not
    # fail with IndexError before anything is written.
    if data:
        assert isinstance(data[0], list)
        if data[0]:
            assert isinstance(data[0][0], float)

    directory = os.path.dirname(to)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(to, "wb") as f:
        pickle.dump(data, f)


def split(data: list[np.ndarray],
          train: float = .8,
          val: float = .1) -> tuple[list[np.ndarray],
                                    list[np.ndarray],
                                    list[np.ndarray]]:
    """Cut ``data`` into consecutive train, validation and test slices.

    With more than 10 items the train and validation sizes are the given
    fractions of ``len(data)``, truncated to integers. With 10 items or
    fewer the fractions are ignored: validation gets one item and train gets
    ``len(data) - 2``. The test slice is whatever follows the validation
    slice.
    """
    total = len(data)
    if total <= 10:
        # Small collections: fixed sizes, the fractions are not used.
        n_val = 1
        n_train = total - 2
    else:
        n_train = int(total * train)
        n_val = int(total * val)
    val_end = n_train + n_val
    return data[:n_train], data[n_train:val_end], data[val_end:]


def split_2way(data: list[np.ndarray],
          train: float = .9) -> tuple[list[np.ndarray],
                                      list[np.ndarray]]:
    """Cut ``data`` into a leading train slice and a trailing test slice.

    The train slice holds the first ``int(len(data) * train)`` items; the
    test slice holds the rest.
    """
    cut = int(len(data) * train)
    head = data[:cut]
    tail = data[cut:]
    return head, tail

## TSDB

In [4]:
def download_tsdb(s: str):
    """Load dataset ``s`` through ``tsdb`` and prepare it when supported.

    For ``"electricity_load_diagrams"`` the numeric columns of the ``"X"``
    entry are selected, the value matrix is transposed, and each row of the
    transposed matrix is normalized; the result is a list of normalized
    series. Any other name returns whatever ``tsdb.load_dataset`` gave back,
    unmodified.
    """
    # Raise tsdb's logger threshold to ERROR to quieten the load.
    tsdb.utils.logging.logger.setLevel(logging.ERROR)
    loaded = tsdb.load_dataset(s)
    if s != "electricity_load_diagrams":
        return loaded
    # presumably loaded["X"] is a pandas DataFrame (select_dtypes) — confirm
    numeric = loaded["X"].select_dtypes(include=[np.number])
    per_series = numeric.values.T
    return [normalize(row) for row in per_series]

In [5]:
# Download each TSDB dataset, split it and pickle the three parts.
# download_tsdb already returns normalized series for this dataset, so no
# further normalization happens here (the former lazy `map(normalize, ...)`
# was never consumed and has been removed).
for dataset_name in ["electricity_load_diagrams"]:
    dataset = download_tsdb(dataset_name)
    dataset_train, dataset_validate, dataset_test = split(dataset)
    save(dataset_train,
         f"../data/processed/{dataset_name}_TRAIN.pickle")
    save(dataset_validate,
         f"../data/processed/{dataset_name}_VAL.pickle")
    save(dataset_test,
         f"../data/processed/{dataset_name}_TEST.pickle")

## TSER

In [6]:
from aeon.datasets import load_regression

def download_tser(name: str) -> None:
    """Fetch regression dataset ``name`` into ``../data/raw/tser``.

    The value returned by ``load_regression`` is discarded; the call is made
    for its effect of placing the dataset under ``extract_path``.

    A ``ValueError`` from ``load_regression`` is tolerated (best-effort) but
    is logged as a warning rather than silently dropped.
    """
    try:
        load_regression(name,
                        extract_path="../data/raw/tser")
    except ValueError as err:
        # NOTE(review): presumably the files are already extracted when the
        # loader fails to parse them — confirm against the aeon version used.
        logging.warning("load_regression(%r) raised ValueError: %s", name, err)

# Fetch both household power consumption regression datasets.
tser_names = ("HouseholdPowerConsumption1", "HouseholdPowerConsumption2")
for tser_dataset in tser_names:
    download_tser(tser_dataset)

In [7]:
from pathlib import Path
import re

def loads_ts(file: str):
    """Parse a ``.ts`` file into a flat list of normalized series.

    Lines starting with ``#`` or ``@`` are skipped. Every remaining line is
    split into channels on ``'):('``, and from each channel the values
    matching ``,<digits>.<digits>),`` are read as floats. A channel is kept
    only when it produced at least one value and all of its values are
    strictly positive; kept channels are normalized and appended in file
    order.

    Args:
        file: Path of the ``.ts`` file to read.

    Returns:
        List of normalized series (each a list of floats), one per kept
        channel.
    """
    # Read inside a context manager so the file handle is closed promptly.
    with open(file) as handle:
        lines = [line for line in handle
                 if not line.startswith(("#", "@"))]
    series = list()
    for line in lines:
        for channel in line.split('):('):
            # The pattern only captures unsigned decimals followed by "),".
            data = [float(p) for p in re.findall(r",(\d+\.\d+)\),", channel)]
            # `all()` is True for an empty list, so also require some data;
            # otherwise blank lines would be stored as empty series.
            if data and all(v > 0 for v in data):
                series.append(normalize(np.array(data)))
    return series

# Convert every raw TSER .ts file to a pickle under ../data/processed_HH/.
# The two HouseholdPowerConsumption *_TEST files are cut in half: the first
# half is written as *_VAL, the second half keeps the *_TEST name.
for dataset_name in Path("../data/raw/tser").glob("**/*.ts"):
    print(dataset_name.stem)
    dataset = loads_ts(dataset_name)
    if dataset_name.stem not in ("HouseholdPowerConsumption1_TEST",
                                 "HouseholdPowerConsumption2_TEST"):
        save(dataset, f"../data/processed_HH/{dataset_name.stem}.pickle")
        continue
    p = int(len(dataset)/2)
    print(p)
    print(dataset_name)
    # stem[:-4] drops the trailing "TEST" so "VAL" can take its place.
    save(dataset[:p], f"../data/processed_HH/{dataset_name.stem[:-4]}VAL.pickle")
    save(dataset[p:], f"../data/processed_HH/{dataset_name.stem}.pickle")

# for dataset_name in Path("../data/raw/tser").glob("**/*.ts"):
#     dataset = loads_ts(dataset_name)
#     save(dataset,
#          f"../data/processed/{dataset_name.stem}.pickle")

HouseholdPowerConsumption2_TEST
688
../data/raw/tser/HouseholdPowerConsumption2/HouseholdPowerConsumption2_TEST.ts
HouseholdPowerConsumption2_TRAIN
HouseholdPowerConsumption1_TEST
688
../data/raw/tser/HouseholdPowerConsumption1/HouseholdPowerConsumption1_TEST.ts
HouseholdPowerConsumption1_TRAIN


## Forecasting

In [4]:
from aeon.datasets import load_forecasting


def download_forecasting(name: str, min_length: int = 1024):
    """Load forecasting dataset ``name`` and return its normalized series.

    The dataset is loaded with ``load_forecasting`` (extracted under
    ``../data/raw/forecasting``, without metadata). Each entry of its
    ``"series_value"`` column is converted with ``to_numpy()``; a series is
    kept only when its sum is positive and it is longer than ``min_length``.
    Kept series are normalized.

    Args:
        name: Dataset name passed to ``load_forecasting``.
        min_length: A series must have strictly more than this many points
            to be kept. Defaults to 1024, the previously hard-coded value.

    Returns:
        List of normalized series (each a list of floats).
    """
    frame = load_forecasting(name,
                             extract_path="../data/raw/forecasting",
                             return_metadata=False)
    normalized = []
    for values in frame["series_value"].values:
        series = values.to_numpy()
        if series.sum() > 0 and len(series) > min_length:
            normalized.append(normalize(series))
    return normalized


# Download, split and pickle each enabled forecasting dataset.
# Commented-out names are currently not processed.
for dataset_name in [
        # "solar_10_minutes_dataset",
        # "london_smart_meters_dataset_without_missing_values",
        # "australian_electricity_demand_dataset",
        "wind_farms_minutely_dataset_without_missing_values",
        # "electricity_hourly_dataset"
]:
    dataset = download_forecasting(dataset_name)
    dataset_train, dataset_val, dataset_test = split(dataset)
    outputs = (
        (dataset_train, f"../data/processed/{dataset_name}_TRAIN.pickle"),
        (dataset_val, f"../data/processed/{dataset_name}_VAL.pickle"),
        (dataset_test, f"../data/processed/{dataset_name}_TEST.pickle"),
    )
    for part, target in outputs:
        save(part, target)

## List

In [9]:
# Dataset names that have a *_TEST.pickle under ../data/processed/, with the
# split suffix stripped from the file stem.
stems = [
    test_file.stem.removesuffix("_TRAIN").removesuffix("_TEST")
    for test_file in Path("../data/processed/").glob("*_TEST.pickle")
]
stems

['electricity_load_diagrams',
 'australian_electricity_demand_dataset',
 'HouseholdPowerConsumption2',
 'electricity_hourly_dataset',
 'wind_farms_minutely_dataset_without_missing_values',
 'london_smart_meters_dataset_without_missing_values',
 'solar_10_minutes_dataset',
 'HouseholdPowerConsumption1']

In [35]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential

# Version label to register the data asset under (for example: '1').
VERSION = "1"

# Location of the data to register. Supported paths include:
# local: './<path>/<folder>' (this will be automatically uploaded to cloud storage)
# blob:  'wasbs://<container_name>@<account_name>.blob.core.windows.net/<path>/<folder>'
# ADLS gen2: 'abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/<folder>'
# Datastore: 'azureml://datastores/<data_store_name>/paths/<path>/<folder>'
path = "../data/processed/"

# Authenticate, then get a handle to the workspace described by config.json.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential, path="config.json")

# Describe the processed folder as a URI_FOLDER data asset.
my_data = Data(
    name="train_dataset_tser_tsdb_normalized",
    version=VERSION,
    description="collection of datasets from aeon, tser, tsdb normalized with 0 mean and std 1",
    type=AssetTypes.URI_FOLDER,
    path=path,
)

# Create (or update) the data asset in the workspace; the returned asset is
# the cell's displayed output.
ml_client.data.create_or_update(my_data)

Found the config file in: config.json
Your file exceeds 100 MB. If you experience low speeds, latency, or broken connections, we recommend using the AzCopyv10 tool for this file transfer.

Example: azcopy copy '/workspaces/AICoE_Ramping_Artefacts/artifactory-master/data/processed' 'https://m3mlopssadev.blob.core.windows.net/azureml-blobstore-206414f2-5a5c-4209-8dbe-6d0e233cd920/LocalUpload/8900158fba37b47eeea8e2334ce3e540/processed' 

See https://docs.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more information.
[32mUploading processed (3676.35 MBs): 100%|██████████| 3676350770/3676350770 [00:12<00:00, 288555586.77it/s]
[39m



Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'train_dataset_tser_tsdb_normalized', 'description': 'collection of datasets from aeon, tser, tsdb normalized with 0 mean and std 1', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/8de3e85d-b97f-48c1-a25b-5bddf9dc484c/resourceGroups/m3-mlops-dev/providers/Microsoft.MachineLearningServices/workspaces/m3-mlops-mlw-dev/data/train_dataset_tser_tsdb_normalized/versions/1', 'Resource__source_path': None, 'base_path': '/workspaces/AICoE_Ramping_Artefacts/artifactory-master/notebooks', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f740cbb2c10>, 'serialize': <msrest.serialization.Serializer object at 0x7f740cbb01d0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/8de3e85d-b97f-48c1-a25b-5bddf9dc484c/resourcegroup