In [1]:
%reload_ext autoreload
%autoreload 2

import sys
sys.path.append('../src_jobs/')

In [2]:
import pickle
from itertools import repeat
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from detector import WindowTransformerDetector

In [4]:
"""
returns: list of list of arrays containing the time series in the datasets in names
    only adds the series that are longer than the provided width (1024)
returns: weights for the time series in the list according to number of time series in the dataset
    all datasets are sampled at the same rate
"""
def load_series(names: list[str], split: str):
    series = list()
    counts = list()
    for name in names:
        with open(f"../data/processed/{name}_{split}.pickle", "rb") as f:
            raw = [a for a in pickle.load(f) if len(a) > 512]
            series.extend(np.array(a).astype(np.float32) for a in raw)
            counts.extend(repeat(1 / len(raw), len(raw)))
    counts = np.array(counts)
    return series, counts / counts.sum()

In [5]:
datasetname = "normalized_deviation_updated"
test_dataset, weights = load_series([datasetname], "TEST")
df_real = pd.DataFrame(test_dataset)

stats_df_real = pd.DataFrame()
stats_df_real["max"] = df_real.max(axis=1)
stats_df_real["min"] = df_real.min(axis=1)
stats_df_real["mean"] = df_real.mean(axis=1)
stats_df_real["stddev"] = df_real.std(axis=1)

gradient_df_real = df_real.diff(axis=1).abs()
stats_df_real["increment_max"] = gradient_df_real.max(axis=1)
stats_df_real["increment_min"] = gradient_df_real.min(axis=1)
stats_df_real["increment_mean"] = gradient_df_real.mean(axis=1)
stats_df_real["increment_stddev"] = gradient_df_real.std(axis=1)

In [6]:
from scipy.stats import skew, kurtosis

stats_df_real["skew"] = skew(df_real.iloc[0], bias=True)
stats_df_real["kurtosis"] = kurtosis(df_real.iloc[0], fisher=False, bias=True)
stats_df_real

Unnamed: 0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis
0,12.352987,-11.329762,-3.948346e-09,0.999453,11.9681,0.0,0.044143,0.182534,-0.776235,47.798541


In [7]:
overall_stats = pd.read_csv('/workspaces/AICoE_Ramping_Artefacts/artifactory-master/data/stats_all.csv')

In [8]:
overall_stats.drop(columns=['Unnamed: 0'], inplace=True)
stats_df_real['datasetname'] = 'real'

overall_and_real = pd.concat([stats_df_real, overall_stats], ignore_index=True)

In [26]:
norm_overall_and_real = pd.DataFrame()
for column in overall_stats.columns[:-1]:
    norm_overall_and_real[column] = (overall_and_real[column] - overall_and_real[column].mean()) / overall_and_real[column].std()

norm_overall_and_real['datasetname'] = overall_and_real["datasetname"]

In [103]:
norm_stats_real = pd.DataFrame(columns=norm_overall_and_real.columns)
norm_stats_real.loc[0] = norm_overall_and_real.loc[0]
norm_stats_overall = norm_overall_and_real.loc[1:]
norm_stats_overall.reset_index(inplace=True)

In [104]:
norm_stats_overall.drop(columns=['index'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  norm_stats_overall.drop(columns=['index'], inplace=True)


In [105]:
norm_stats_overall_grouped = norm_stats_overall.groupby(['datasetname']).mean()

In [9]:
overall_and_real_grouped = overall_and_real.groupby(['datasetname']).mean()

In [10]:
real = overall_and_real_grouped.iloc[6]
rest = pd.concat([overall_and_real_grouped.iloc[:6], overall_and_real_grouped.iloc[7:]])
rest

Unnamed: 0_level_0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis
datasetname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HouseholdPowerConsumption1,3.646425,-2.075451,1.93e-11,1.000349,2.208064,0.0,0.140969,0.220009,0.643361,4.544977
HouseholdPowerConsumption2,3.646425,-2.075451,1.93e-11,1.000349,2.208064,0.0,0.140969,0.220009,0.643361,4.544977
australian_electricity_demand_dataset,4.842094,-3.008433,-1.370415e-11,1.000002,2.110877,4.947186e-07,0.130896,0.120768,0.465713,3.282569
electricity_hourly_dataset,4.00783,-2.986226,6.67508e-08,1.000017,3.498252,0.0,0.274538,0.316935,0.362436,3.393236
electricity_load_diagrams,4.057863,-1.963208,-1.204722e-05,0.999989,3.664582,0.0,0.120715,0.165669,0.363441,3.639756
london_smart_meters_dataset_without_missing_values,11.732024,-0.93592,-4.852708e-12,1.000018,10.399583,0.0,0.428477,0.737539,3.319336,25.107594
solar_10_minutes_dataset,3.040003,-0.687743,1.773482e-06,0.999994,1.44744,0.0,0.058718,0.110002,1.161848,2.864924
wind_farms_minutely_dataset_without_missing_values,7.677969,-0.777223,2.277345e-10,1.000001,5.831305,0.0,0.016738,0.065179,4.927484,330.467983


In [38]:
real

max                 1.235299e+01
min                -1.132976e+01
mean               -3.948346e-09
stddev              9.994529e-01
increment_max       1.196810e+01
increment_min       0.000000e+00
increment_mean      4.414336e-02
increment_stddev    1.825335e-01
skew               -7.762350e-01
kurtosis            4.779854e+01
Name: real, dtype: float64

In [30]:
overall_and_real_grouped

Unnamed: 0_level_0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis
datasetname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HouseholdPowerConsumption1,3.646425,-2.075451,1.93e-11,1.000349,2.208064,0.0,0.140969,0.220009,0.643361,4.544977
HouseholdPowerConsumption2,3.646425,-2.075451,1.93e-11,1.000349,2.208064,0.0,0.140969,0.220009,0.643361,4.544977
australian_electricity_demand_dataset,4.842094,-3.008433,-1.370415e-11,1.000002,2.110877,4.947186e-07,0.130896,0.120768,0.465713,3.282569
electricity_hourly_dataset,4.00783,-2.986226,6.67508e-08,1.000017,3.498252,0.0,0.274538,0.316935,0.362436,3.393236
electricity_load_diagrams,4.057863,-1.963208,-1.204722e-05,0.999989,3.664582,0.0,0.120715,0.165669,0.363441,3.639756
london_smart_meters_dataset_without_missing_values,11.732024,-0.93592,-4.852708e-12,1.000018,10.399583,0.0,0.428477,0.737539,3.319336,25.107594
real,12.352987,-11.329762,-3.948346e-09,0.999453,11.9681,0.0,0.044143,0.182534,-0.776235,47.798541
solar_10_minutes_dataset,3.040003,-0.687743,1.773482e-06,0.999994,1.44744,0.0,0.058718,0.110002,1.161848,2.864924
wind_farms_minutely_dataset_without_missing_values,7.677969,-0.777223,2.277345e-10,1.000001,5.831305,0.0,0.016738,0.065179,4.927484,330.467983


In [106]:
similarities = np.zeros(len(norm_stats_overall_grouped))
# max, min, mean, stddev, inc_max, inc_min, inc_mean, inc_stddev, 3rd_m, 4th_m --> weights
weights = [1, 1, 0, 0, 1, 1, 1, 1, 1, 1]
weight = [val/sum(weights) for val in weights]

for row in range(len(norm_stats_overall_grouped)):
    Dist   = np.sqrt(
                     sum( 
                         ((norm_stats_overall_grouped.iloc[row, norm_stats_overall_grouped.columns != 'datasetname'] - norm_stats_real.iloc[0, norm_stats_real.columns != 'datasetname']) * weight)**2
                         ) 
                    )
    similarities[row]=Dist

norm_stats_overall_grouped["euclidean_dist"] = similarities

In [107]:
overall_norm_sorted = norm_stats_overall_grouped.sort_values(by=["euclidean_dist"], ascending=True)

In [108]:
norm_stats_real

Unnamed: 0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis,datasetname
0,0.805495,-9.045215,0.015895,-4.22346,1.021762,-0.012102,-1.284429,-0.915934,-0.795368,0.037637,real


In [109]:
overall_norm_sorted

Unnamed: 0_level_0,max,min,mean,stddev,increment_max,increment_min,increment_mean,increment_stddev,skew,kurtosis,euclidean_dist
datasetname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
electricity_hourly_dataset,-0.547375,-1.337759,0.019261,-0.894116,-0.437058,-0.012102,0.035625,-0.450777,-0.462102,-0.027395,1.011168
HouseholdPowerConsumption1,-0.605963,-0.496418,0.016084,1.067345,-0.659276,-0.012102,-0.729665,-0.786235,-0.379881,-0.025708,1.106806
HouseholdPowerConsumption2,-0.605963,-0.496418,0.016084,1.067345,-0.659276,-0.012102,-0.729665,-0.786235,-0.379881,-0.025708,1.106806
electricity_load_diagrams,-0.539263,-0.392732,-0.557647,-1.057198,-0.40841,-0.012102,-0.845708,-0.974301,-0.461808,-0.027033,1.111239
wind_farms_minutely_dataset_without_missing_values,0.047608,0.702838,0.016093,-0.987742,-0.03522,-0.012102,-1.44145,-1.322091,0.873996,0.451609,1.249149
london_smart_meters_dataset_without_missing_values,0.704828,0.556239,0.016082,-0.890147,0.751606,-0.012102,0.917623,1.00491,0.403323,0.004406,1.26397
solar_10_minutes_dataset,-0.704273,0.785496,0.100542,-1.030176,-0.790284,-0.012102,-1.200925,-1.166961,-0.22813,-0.028168,1.266157
australian_electricity_demand_dataset,-0.412128,-1.358273,0.016082,-0.980954,-0.676015,30.038346,-0.78738,-1.129703,-0.431875,-0.027557,3.886902
