In [None]:
# default stuff (display width, dir change, jupyter extentions)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os
os.chdir('..')
%load_ext autoreload
%autoreload 2

In [None]:
# import stuff
import anodeclstmgru.constants as const
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from ipywidgets import interact
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# load data frames from h5 store
# The store is opened read-only inside a context manager, so the file handle is
# released even if one of the keys is missing, and the file is never created
# or modified by this notebook.
with pd.HDFStore(const.HDF_STORE_PATH_INTERIM, mode='r') as store:
    df_phys_norm_v1 = store['df_phys_norm_v1']
    df_phys_att_v0 = store['df_phys_att_v0']
    df_labels = store['df_labels']

In [None]:
# create timestamp column and drop old column (notice the white space in the col name)
# NOTE(review): pd.to_datetime infers the format here; if the raw strings are
# day-first, pass dayfirst=True or an explicit format -- confirm against the data.
for frame in (df_phys_norm_v1, df_phys_att_v0):
    if ' Timestamp' not in frame.columns:
        # already converted, e.g. when this cell is run a second time
        continue
    frame['Timestamp'] = pd.to_datetime(frame[' Timestamp'])
    # Drop in place: `frame = frame.drop(...)` would only rebind the loop
    # variable and leave the old column in the original data frames.
    frame.drop(columns=' Timestamp', inplace=True)

In [None]:
# convert to time series: the parsed 'Timestamp' column becomes the row index
# (set_index removes the column from the frame by default)
df_phys_norm_v1, df_phys_att_v0 = (
    frame.set_index('Timestamp')
    for frame in (df_phys_norm_v1, df_phys_att_v0)
)
df_phys_norm_v1.head()

In [None]:
# display the label table exactly as loaded from the store (no filtering yet)
df_labels

There seems to be a bug in the labels data set: it contains entries for Jan 2015. For that reason we filter the data frames according to a min and max date defined in the config file (constants.py).

In [None]:
# filter labels df to the attacks that have an end time attached and
# transform the end time to a full timestamp by combining it with the
# calendar date of the corresponding 'Start Time'
# NOTE(review): an attack that runs past midnight would get an end timestamp
# that lies before its start -- confirm that no such rows exist.
df_labels_time = df_labels[df_labels['End Time'].notna()].reset_index(drop=True)
end_timestamps = [
    datetime.combine(start.date(), end_time)
    for start, end_time in zip(df_labels_time['Start Time'], df_labels_time['End Time'])
]
# Replace the whole column instead of writing through `.loc[:, ...]`, which
# sets values into the existing column and can keep its old dtype; this way
# 'End Time' ends up as a proper datetime64 column.
df_labels_time['End Time'] = pd.to_datetime(end_timestamps)

In [None]:
# plot the timestamps of the attack df against their row position
# (the index values are passed as the only argument, so they end up on the y axis)
plt.plot(df_phys_att_v0.index.values)

In [None]:
# plot the 'Start Time' timestamps of the labels against their row position
# (the values are passed as the only argument, so they end up on the y axis)
plt.plot(df_labels_time['Start Time'].values)

In [None]:
# keep only the rows that lie strictly between const.MIN_DATE and const.MAX_DATE
# (both bounds are exclusive)
label_starts = df_labels_time['Start Time']
labels_in_range = (label_starts > const.MIN_DATE) & (label_starts < const.MAX_DATE)
df_labels_time = df_labels_time[labels_in_range]

attack_index = df_phys_att_v0.index
attack_in_range = (attack_index > const.MIN_DATE) & (attack_index < const.MAX_DATE)
df_phys_att_v0 = df_phys_att_v0[attack_in_range]

In [None]:
# fix column names: remove every blank from them (some begin with white spaces)
for frame in (df_phys_norm_v1, df_phys_att_v0):
    frame.columns = [name.replace(' ', '') for name in frame.columns]

In [None]:
# define drop down lists (see plots below)
# NOTE(review): range() excludes its end value, so 2015-12-28 is not offered for
# the normal period and 2015-12-31 is not offered for the attack period -- confirm
# this is intended.
day_list_normal = [f'2015-12-{d}' for d in range(22, 28)] + ['all']
# NOTE(review): '2016-02-01' looks out of sequence next to '2016-01-01' --
# verify it against the dates actually present in the attack data frame.
day_list_attack = [f'2015-12-{d}' for d in range(28, 31)] + ['2016-01-01'] + \
                  ['2016-02-01'] + ['all']
sample_frequency_list = ['1s', '5s', '10s', '60s']
# Columns that must not be offered as sensors: the label column and a
# timestamp column, should one still be present among the columns.
non_sensor_columns = {'Normal/Attack', 'Timestamp'}
# Set difference does not raise when a column is absent; sorting makes the
# drop-down order deterministic across kernel restarts.
sensors = sorted(
    set(df_phys_att_v0.columns).union(df_phys_norm_v1.columns) - non_sensor_columns
)

In [None]:
# define plot function (single time series)
def get_single_time_series(sensor, day='all', sample_freq='60s'):
    """Plot one sensor of the module-level frame ``df`` as a resampled line chart.

    The data is read from the global ``df``, which the calling cell assigns
    before handing this function to ``interact``.

    Parameters
    ----------
    sensor : str
        Name of the column of ``df`` to plot.
    day : str, default 'all'
        ``'all'`` plots the whole frame; any other value is passed to
        ``pd.Timestamp`` and restricts the plot to ``[day, day + 1 day)``.
    sample_freq : str, default '60s'
        Rule passed to ``DataFrame.resample``; the values of each bin are averaged.

    Returns
    -------
    None
        The figure is shown via ``fig.show()``.
    """
    # Work on the requested column only: this avoids copying the whole frame
    # and keeps non-numeric columns out of the mean aggregation.
    df_plot = df[[sensor]]
    if day != 'all':
        day_start = pd.Timestamp(day)
        day_end = day_start + pd.Timedelta('1d')
        # half-open interval, so the sample at exactly midnight belongs to `day`
        df_plot = df_plot[(df_plot.index >= day_start) & (df_plot.index < day_end)]
    df_plot = df_plot.resample(sample_freq).mean()
    title = f'{sensor} over time (averaged over {sample_freq})'
    fig = px.line(df_plot, x=df_plot.index, y=sensor, title=title)
    fig.show()

In [None]:
# define plot function (four stacked time series, optional attack row)
def get_4_signal_plot(sensor0='AIT503', sensor1='AIT203',
                      sensor2='FIT501', sensor3='LIT401', day='all', sample_freq='60s',
                      show_attacks=False):
    """Plot four sensors of the module-level frame ``df`` in stacked subplots.

    The sensor data is read from the global ``df`` and the attack intervals
    from the global ``df_labels_time``; the calling cell assigns ``df`` before
    handing this function to ``interact``.

    Parameters
    ----------
    sensor0, sensor1, sensor2, sensor3 : str
        Column names of ``df``; one subplot row is drawn per sensor, in this order.
    day : str, default 'all'
        ``'all'`` plots everything; any other value is passed to
        ``pd.Timestamp`` and restricts both the sensor data and the attack
        labels (by their 'Start Time') to ``[day, day + 1 day)``.
    sample_freq : str, default '60s'
        Rule passed to ``DataFrame.resample``; the values of each bin are averaged.
    show_attacks : bool, default False
        If true, an extra bottom row shows every attack as a horizontal line
        from its 'Start Time' to its 'End Time'.

    Returns
    -------
    None
        The figure is shown via ``fig.show()``.
    """
    sensor_selection_list = [sensor0, sensor1, sensor2, sensor3]
    # The same sensor may be picked in more than one drop-down; select each
    # column once so that `df_plot[s]` below is always a single series.
    unique_sensors = list(dict.fromkeys(sensor_selection_list))
    # Restricting to the selected columns avoids copying the whole frame and
    # keeps non-numeric columns out of the mean aggregation.
    df_plot = df[unique_sensors]
    df_plot_label = df_labels_time
    if day != 'all':
        day_start = pd.Timestamp(day)
        day_end = day_start + pd.Timedelta('1d')
        # half-open interval, so midnight itself belongs to `day`
        df_plot = df_plot[(df_plot.index >= day_start) & (df_plot.index < day_end)]
        label_starts = df_plot_label['Start Time']
        df_plot_label = df_plot_label[(label_starts >= day_start) &
                                      (label_starts < day_end)]
    df_plot = df_plot.resample(sample_freq).mean()

    # one row per sensor, plus one row for the attacks if requested
    num_plots = len(sensor_selection_list) + (1 if show_attacks else 0)
    fig = make_subplots(rows=num_plots, cols=1, shared_xaxes=True)
    for row, s in enumerate(sensor_selection_list, start=1):
        fig.add_trace(
            go.Scatter(x=df_plot.index,
                       y=df_plot[s], name=s),
            row=row, col=1,
        )
    if show_attacks:
        attack_row = num_plots  # the attacks always occupy the last row
        attack_rows = zip(df_plot_label['Start Time'],
                          df_plot_label['End Time'],
                          df_plot_label['Attack #'])
        for start, end, attack in attack_rows:
            fig.add_trace(
                go.Scatter(x=[start, end],
                           y=[1, 1], name=f'Attack #{attack}'),
                row=attack_row, col=1,
            )

    title = f'{sensor0}, {sensor1}, {sensor2} and {sensor3}'\
            f' over time (averaged over {sample_freq})'

    fig.update_layout(height=600, width=800, title_text=title)
    fig.show()

# 4 time series during attack period

In [None]:
# the attack-period frame becomes the global `df` that the plot function reads
df = df_phys_att_v0.copy()
sensors.sort()
attack_widget_options = dict(
    sensor0=sensors, sensor1=sensors, sensor2=sensors, sensor3=sensors,
    day=day_list_attack,
    sample_freq=sample_frequency_list,
    show_attacks=True,
)
interact(get_4_signal_plot, **attack_widget_options)

# 4 time series during normal period

In [None]:
# the normal-period frame becomes the global `df` that the plot function reads
df = df_phys_norm_v1.copy()
normal_widget_options = dict(
    sensor0=sensors, sensor1=sensors, sensor2=sensors, sensor3=sensors,
    day=day_list_normal,
    sample_freq=sample_frequency_list,
    show_attacks=False,
)
interact(get_4_signal_plot, **normal_widget_options)

# What is the distribution of attack / anomaly durations?

In [None]:
# attack duration = end - start
# `assign` returns a new frame instead of writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning and makes the cell re-runnable.
df_labels_time = df_labels_time.assign(
    duration=df_labels_time['End Time'] - df_labels_time['Start Time']
)

In [None]:
# duration of the first listed attack in seconds
# Positional access: `[0]` looks up the index *label* 0, which may no longer
# exist after rows were removed by the date filter.
df_labels_time['duration'].iloc[0].total_seconds()

In [None]:
# attack durations in seconds as a float array
# (pd.to_timedelta normalises the column dtype before the vectorised conversion)
x = pd.to_timedelta(df_labels_time['duration']).dt.total_seconds().to_numpy()
print(f'Minimum: {int(x.min())} s')
print(f'Maximum: {int(x.max())} s')
print(f'Median: {int(np.median(x))} s')
# the value printed is the 0.9 quantile, so the label must read 90%
print(f'90% quantile: {int(np.quantile(x, .9))} s')


fig = px.histogram(x=x, nbins=400)
fig.show()

# Single time series during normal period

In [None]:
# df = df_phys_norm_v1.copy()
# interact(get_single_time_series, sensor=sensors, day=day_list_normal,
#          sample_freq=sample_frequency_list)

# Single time series during attack period

In [None]:
# df = df_phys_att_v0.copy()
# interact(get_single_time_series, sensor=sensors, day=day_list_attack,
#          sample_freq=sample_frequency_list)