In [1]:
%reload_ext autoreload
%autoreload 2

import sys
sys.path.append('../src_jobs/')

In [2]:
import pickle
from itertools import repeat
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from detector import WindowTransformerDetector

In [5]:
"""
returns: list of list of arrays containing the time series in the datasets in names
    only adds the series that are longer than the provided width (1024)
returns: weights for the time series in the list according to number of time series in the dataset
    all datasets are sampled at the same rate
"""
def load_series(names: list[str], split: str):
    series = list()
    counts = list()
    for name in names:
        with open(f"../data/processed/{name}_{split}.pickle", "rb") as f:
            raw = [a for a in pickle.load(f) if len(a) > 512]
            series.extend(np.array(a).astype(np.float32) for a in raw)
            counts.extend(repeat(1 / len(raw), len(raw)))
    counts = np.array(counts)
    return series, counts / counts.sum()

In [6]:
datasetname = "normalized_deviation_updated"
test_dataset, weights = load_series([datasetname], "TEST")

In [7]:
import pandas as pd

df_real = pd.DataFrame(test_dataset)
stats_df_real = pd.DataFrame()
stats_df_real["max"] = df_real.max(axis=1)
stats_df_real["min"] = df_real.min(axis=1)
stats_df_real["mean"] = df_real.mean(axis=1)
stats_df_real["stddev"] = df_real.std(axis=1)
stats_df_real


Unnamed: 0,max,min,mean,stddev
0,12.352987,-11.329762,-3.948346e-09,0.999453


In [8]:
gradient_df_real = df_real.diff(axis=1).abs()
stats_df_real["increment_max"] = gradient_df_real.max(axis=1)
stats_df_real["increment_min"] = gradient_df_real.min(axis=1)
stats_df_real["increment_mean"] = gradient_df_real.mean(axis=1)
stats_df_real["increment_stddev"] = gradient_df_real.std(axis=1)
stats_df_real

Unnamed: 0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev
0,12.352987,-11.329762,-3.948346e-09,0.999453,11.9681,0.0,0.044143,0.182534


In [9]:
from scipy.stats import skew, kurtosis

stats_df_real["skew"] = skew(df_real.iloc[0], bias=True, nan_policy='omit')
stats_df_real["kurtosis"] = kurtosis(df_real.iloc[0], fisher=False, bias=True, nan_policy='omit')
stats_df_real

Unnamed: 0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis
0,12.352987,-11.329762,-3.948346e-09,0.999453,11.9681,0.0,0.044143,0.182534,-0.776235,47.798541


In [10]:
datasetnames = [
    "australian_electricity_demand_dataset",
    "electricity_hourly_dataset",
    "electricity_load_diagrams",
    "HouseholdPowerConsumption1",
    "HouseholdPowerConsumption2",
    "london_smart_meters_dataset_without_missing_values",
    "solar_10_minutes_dataset",
    "wind_farms_minutely_dataset_without_missing_values",
]

In [12]:
from scipy.stats import skew, kurtosis

overall_stats = pd.DataFrame()

for datasetname in datasetnames:
    print(datasetname)
    train_dataset, weights = load_series([datasetname], "TRAIN")
    val_dataset, weights = load_series([datasetname], "VAL")
    test_dataset, weights = load_series([datasetname], "TEST")

    df = pd.DataFrame(train_dataset + val_dataset + test_dataset)
    stats_df = pd.DataFrame()
    stats_df["max"] = df.max(axis=1)
    stats_df["min"] = df.min(axis=1)
    stats_df["mean"] = df.mean(axis=1)
    stats_df["stddev"] = df.std(axis=1)

    gradient_df = df.diff(axis=1).abs()
    stats_df["increment_max"] = gradient_df.max(axis=1)
    stats_df["increment_min"] = gradient_df.min(axis=1)
    stats_df["increment_mean"] = gradient_df.mean(axis=1)
    stats_df["increment_stddev"] = gradient_df.std(axis=1)

    stats_df["skew"] = df.apply(lambda row: skew(row, bias=True, nan_policy='omit'), axis=1)
    stats_df["kurtosis"] = df.apply(lambda row: kurtosis(row, fisher=False, bias=True, nan_policy='omit'), axis=1)

    stats_df["datasetname"] = datasetname

    overall_stats = pd.concat([overall_stats, stats_df])

    print(len(overall_stats))

australian_electricity_demand_dataset
5
electricity_hourly_dataset
326
electricity_load_diagrams
696
HouseholdPowerConsumption1
3549
HouseholdPowerConsumption2
6402
london_smart_meters_dataset_without_missing_values
11957
solar_10_minutes_dataset
12094
wind_farms_minutely_dataset_without_missing_values
12415


In [17]:
overall_stats

Unnamed: 0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis,datasetname
0,4.497548,-2.380567,3.047708e-11,1.000002,1.716814,1.788139e-06,0.128401,0.114290,0.152237,2.731366,australian_electricity_demand_dataset
1,5.804921,-2.332044,4.282331e-11,1.000002,1.794654,5.960464e-08,0.139539,0.118310,0.505313,3.278638,australian_electricity_demand_dataset
2,3.645503,-2.636488,-2.399149e-11,1.000002,1.033061,1.192093e-07,0.121321,0.110958,0.102077,2.416598,australian_electricity_demand_dataset
3,6.282798,-2.662571,-9.066986e-11,1.000002,1.870366,3.576279e-07,0.129179,0.120519,0.943893,4.970581,australian_electricity_demand_dataset
4,3.979702,-5.030494,-2.715977e-11,1.000002,4.139492,1.490116e-07,0.136037,0.139761,0.625044,3.015661,australian_electricity_demand_dataset
...,...,...,...,...,...,...,...,...,...,...,...
316,3.496888,-0.977085,-7.877086e-09,1.000002,2.763337,0.000000e+00,0.014267,0.034839,1.882728,5.714475,wind_farms_minutely_dataset_without_missing_va...
317,6.018838,-0.476784,6.237921e-09,1.000002,4.173205,0.000000e+00,0.016180,0.082980,3.103853,12.393844,wind_farms_minutely_dataset_without_missing_va...
318,22.267677,-0.071893,3.066879e-09,1.000002,20.575920,0.000000e+00,0.005740,0.178165,16.052691,275.881975,wind_farms_minutely_dataset_without_missing_va...
319,6.648432,-0.413981,3.930968e-09,1.000005,3.537043,0.000000e+00,0.021311,0.093488,3.417189,16.288454,wind_farms_minutely_dataset_without_missing_va...


In [19]:
overall_stats.iloc[overall_stats[datasetname] == "HouseholdPowerConsumption1"]

KeyError: 'wind_farms_minutely_dataset_without_missing_values'

In [11]:
overall_stats.to_csv('stats_all.csv')