In [None]:
# Step one directory up before anything else is imported -- presumably so the
# project-local `src` package used in the next cell resolves; TODO confirm
# where this notebook lives relative to the project root.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves the working directory up by one more level each time.
import os
os.chdir('..')
# Enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

In [None]:
import src.constants as const
import pandas as pd
import numpy as np
from datetime import datetime

# Physical

In [None]:
# Read both normal-state Excel files; header=1 takes the column labels from
# the second row of each sheet.
df_phys_norm_v0, df_phys_norm_v1 = (
    pd.read_excel(const.PHYS_NORMAL_PATHS[version], header=1)
    for version in (0, 1)
)

In [None]:
# Preview the first rows of the v0 frame.
df_phys_norm_v0.head()

In [None]:
# Preview the first rows of the v1 frame for a side-by-side look with v0.
df_phys_norm_v1.head()

In [None]:
# Both frames look quite similar. Parse the raw ' Timestamp' column (its name
# carries a leading blank) into a new datetime column named 'Timestamp'; the
# raw column is left in place.
for phys_frame in (df_phys_norm_v0, df_phys_norm_v1):
    phys_frame['Timestamp'] = pd.to_datetime(phys_frame[' Timestamp'])

In [None]:
# Reading the Excel files takes ages, so the tables are kept in an HDF5 store.
# NOTE(review): the store handle opened here is not closed in any of the
# visible cells.
store = pd.HDFStore(const.HDF_STORE_PATH)

In [None]:
# Persist both frames in the HDF5 store under keys equal to their variable names.
for store_key, frame in (
    ('df_phys_norm_v0', df_phys_norm_v0),
    ('df_phys_norm_v1', df_phys_norm_v1),
):
    store[store_key] = frame

In [None]:
# Reload both frames from the HDF5 store (use this instead of re-reading the
# Excel files when the store has already been filled).
df_phys_norm_v0, df_phys_norm_v1 = (
    store[store_key]
    for store_key in ('df_phys_norm_v0', 'df_phys_norm_v1')
)

## What is the difference between those two files?

### Time frames
According to the paper, the dataset is supposed to hold 7 days of normal operation... let's see.

In [None]:
# Report the earliest and latest timestamp of each version so the covered
# time spans can be compared.
for ind, df in enumerate((df_phys_norm_v0, df_phys_norm_v1)):
    print(
        f'Start timestamp v{ind}: {df.Timestamp.min()}',
        f'End timestamp v{ind}: {df.Timestamp.max()}',
        sep='\n',
    )

Looks like v0 starts 30 minutes earlier than v1.

### Schema

In [None]:
# Are the columns the same (same labels in the same order)?
# Index.equals answers with a single bool and also copes with frames that
# have a different number of columns, where the element-wise `==` raises a
# ValueError instead of answering False.
df_phys_norm_v0.columns.equals(df_phys_norm_v1.columns)

In [None]:
# How many observations does each version hold?
print('\n'.join(
    f'Number of rows v{ind}: {len(df)}'
    for ind, df in enumerate((df_phys_norm_v0, df_phys_norm_v1))
))

Alright, this fits the 30-minute observation: v0 has 1800 more entries than v1 (30 min × 60 s = 1800 s), so the sample rate appears to be 1 Hz.

### Values
Let's take a subsample and see if the values are equal.

In [None]:
# Lower and upper bound of the comparison window; both timestamps were picked
# at random.
min_sample_ts = datetime(year=2015, month=12, day=22, hour=17)
max_sample_ts = datetime(year=2015, month=12, day=28, hour=8)

In [None]:
# Boolean masks selecting the rows strictly inside the sample window.
condition_v0 = (
    (df_phys_norm_v0.Timestamp > min_sample_ts)
    & (df_phys_norm_v0.Timestamp < max_sample_ts)
)
condition_v1 = (
    (df_phys_norm_v1.Timestamp > min_sample_ts)
    & (df_phys_norm_v1.Timestamp < max_sample_ts)
)

# Cut out the window and renumber the rows from zero so both samples can be
# compared position by position.
df_phys_norm_v0_sample = df_phys_norm_v0.loc[condition_v0].reset_index(drop=True)
df_phys_norm_v1_sample = df_phys_norm_v1.loc[condition_v1].reset_index(drop=True)

In [None]:
# Element-wise comparison of the two samples. A plain `==` reports NaN != NaN,
# so cells that are missing in both versions are additionally counted as equal;
# otherwise matching gaps would show up as spurious differences.
df_compare = (df_phys_norm_v0_sample == df_phys_norm_v1_sample) | (
    df_phys_norm_v0_sample.isna() & df_phys_norm_v1_sample.isna()
)
df_compare.head()

In [None]:
# Collapse the comparison frame to its distinct values; a result containing
# only True means every compared cell matched.
np.unique(df_compare.values)

Alright, the values are the same, so let's just pick v1: there has to be a reason why they created it — maybe the first 30 minutes aren't really normal.

### What is the Frequency?

In [None]:
# Distinct gaps between consecutive timestamps; the first diff is skipped
# because the first row has no predecessor.
df_phys_norm_v1.Timestamp.diff()[1:].unique()

In [None]:
# Take the first distinct gap between consecutive timestamps as the sampling
# period.
freq = df_phys_norm_v1.Timestamp.diff()[1:].unique()[0]
# Convert through pd.Timedelta instead of dividing by 1e9 and calling int():
# that shortcut only works while the element is a numpy timedelta64 in
# nanoseconds and breaks when unique() yields a pandas Timedelta.
print(f'Frequenzy in 1/HZ: {int(pd.Timedelta(freq).total_seconds())}')

We further know that the data frame is sorted by timestamp, because all consecutive differences are positive and share a single unique value.

### What about missing values?

In [None]:
# Build a profiling report for the v1 frame to inspect missing values.
from pandas_profiling import ProfileReport

profile = ProfileReport(
    df_phys_norm_v1,
    title='Pandas Profiling Report',
    explorative=True,
)

In [None]:
# Display the profiling report through its notebook widget interface.
profile.to_widgets()