In [None]:
# default stuff (display width, dir change, jupyter extentions)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os
os.chdir('..')
%load_ext autoreload
%autoreload 2

In [None]:
# import stuff
import anodeclstmgru.constants as const
import pandas as pd
import numpy as np
from datetime import datetime
from pandas_profiling import ProfileReport

# Physical

In [None]:
# Read the Excel files and store them in an h5 store, unless that file is
# already on disk -- the conversion takes ages.
# NOTE(review): this guard checks const.HDF_STORE_PATH_INTERIM while the next
# cell opens const.HDF_STORE_PATH -- confirm both point to the intended file.
h5_store_missing = not os.path.isfile(const.HDF_STORE_PATH_INTERIM)
if h5_store_missing:
    # imported lazily: only needed when the store has to be built
    from anodeclstmgru.data import create_h5_file
    create_h5_file.run()

In [None]:
# Read the three physical data frames from the h5 store.
# The context manager closes the store even when a key lookup fails, and
# read-only mode raises on a wrong path instead of silently creating an
# empty store file there.
with pd.HDFStore(const.HDF_STORE_PATH, mode='r') as store:
    df_phys_norm_v0 = store['df_phys_norm_v0']
    df_phys_norm_v1 = store['df_phys_norm_v1']
    df_phys_att_v0 = store['df_phys_att_v0']

In [None]:
# preview the first five rows of each frame, starting with df_phys_norm_v0
df_phys_norm_v0.head(n=5)

In [None]:
# first five rows of df_phys_norm_v1
df_phys_norm_v1.head(n=5)

In [None]:
# first five rows of df_phys_att_v0
df_phys_att_v0.head(n=5)

## What is the difference between those two files?

### Time frames
According to the paper, the dataset is supposed to hold 7 days of normal operation... let's see

In [None]:
# print the first and last timestamp of both versions of the normal data
for version, frame in enumerate((df_phys_norm_v0, df_phys_norm_v1)):
    timestamps = frame['Timestamp']
    print(f'Start timestamp v{version}: {timestamps.min()}')
    print(f'End timestamp v{version}: {timestamps.max()}')

Looks like v0 starts 30 minutes earlier than v1 ...

### Schema

In [None]:
# Are the columns the same (same labels in the same order)?
# Index.equals also copes with a differing number of columns, where the
# element-wise == comparison would raise instead of answering False.
df_phys_norm_v0.columns.equals(df_phys_norm_v1.columns)

In [None]:
# how many observations does each version hold?
for version, frame in enumerate((df_phys_norm_v0, df_phys_norm_v1)):
    row_count = len(frame)
    print(f'Number of rows v{version}: {row_count}')

Alright, this fits the 30-minute observation: v0 has 1800 more entries than v1 (30 min × 60 s = 1800 samples), so the sample rate looks like 1 Hz.

### Values
Let's take a subsample and see if the values are equal

In [None]:
# two arbitrarily picked timestamps that bound the comparison window
min_sample_ts = datetime(year=2015, month=12, day=22, hour=17)
max_sample_ts = datetime(year=2015, month=12, day=28, hour=8)

In [None]:
# Keep only the rows strictly between the two sample timestamps (both bounds
# exclusive) and renumber them from 0 so the two samples line up row by row.
ts_v0 = df_phys_norm_v0['Timestamp']
ts_v1 = df_phys_norm_v1['Timestamp']
condition_v0 = (ts_v0 > min_sample_ts) & (ts_v0 < max_sample_ts)
condition_v1 = (ts_v1 > min_sample_ts) & (ts_v1 < max_sample_ts)
df_phys_norm_v0_sample = df_phys_norm_v0[condition_v0].reset_index(drop=True)
df_phys_norm_v1_sample = df_phys_norm_v1[condition_v1].reset_index(drop=True)

In [None]:
# Element-wise equality of the two samples; both were re-indexed from 0, so
# rows are matched by position.
# NOTE(review): NaN == NaN evaluates to False, so missing values would show
# up as mismatches here -- confirm the data has none.
df_compare = (df_phys_norm_v0_sample == df_phys_norm_v1_sample)
df_compare.head(n=5)

In [None]:
# distinct values in the comparison frame: a lone True means every cell matched
equality_flags = df_compare.values
np.unique(equality_flags)

Alright, the values are the same, so let's just pick v1, because there has to be a reason why they created it; maybe the first 30 minutes aren't really normal.

### What is the Frequency?

In [None]:
# distinct gaps between consecutive timestamps; the first diff is skipped
# because the first row has no predecessor
timestamp_gaps = df_phys_norm_v1['Timestamp'].diff()
timestamp_gaps[1:].unique()

In [None]:
# Sampling period: the first distinct gap between consecutive timestamps.
freq = df_phys_norm_v1.Timestamp.diff()[1:].unique()[0]
# Convert through pd.Timedelta so the value is in seconds whatever the
# timedelta resolution or type; dividing the raw value by 1e9 is only
# correct for nanosecond-resolution numpy timedeltas.
period_seconds = pd.Timedelta(freq).total_seconds()
print(f'Frequenzy in 1/HZ: {int(period_seconds)}')

We further know that the data frame is sorted by timestamp, because there is only a single distinct diff value and it is positive.

### DF Profile? (missing vals, variables et al.)
Takes quite a while to run (14 min or so) --> don't do that if you're in a hurry

In [None]:
# Build the explorative profiling report for the v1 frame and render it as
# notebook widgets (slow -- see the note above this cell).
report_options = {'title': 'Pandas Profiling Report', 'explorative': True}
profile = ProfileReport(df_phys_norm_v1, **report_options)
profile.to_widgets()

# Labels

In [None]:
# load the label sheet from the Excel file configured in the project constants
labels_path = const.LABELS_FILE_PATH
df_label = pd.read_excel(labels_path)
df_label

In [None]:
# column names, dtypes and non-null counts of the label frame
df_label.info()