# Automatic detection of anomalies in MONROE data

### Description of the notebook

This notebook provides methods for the automatic detection of anomalies in MONROE data. It also visualises the detected anomalies within the complete data. Additionally, the dataframe with anomalies can be stored to a file and analysed further with tools such as Orange.

### Instructions for use

1. Run all cells under headings Import modules and connect to the database, Anomaly detection methods, Significance analysis, and Menu widgets.
2. Run first cell under heading Get and plot selected dataframe.
3. Run cell under heading Display menu widgets.

Use the GUI to select which data is visualised and scanned for anomalies. Optional arguments can be entered in the text box named arguments to control the parameters of the anomaly detection. The parameters are described in the docstrings of the anomaly detection methods.

To store the dataframe with the identified anomalies to a file, run the function sdf.transform_store_df (in the cell above the heading Display menu widgets).

### Import modules and connect to the database

In [2]:
%matplotlib notebook

from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

import ricercando as ric
import ast
import datetime as dt
import pandas as pd
import numpy as np
import qgrid
from scipy import stats
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from skgarden import RandomForestQuantileRegressor
import ipywidgets as ipw
import matplotlib.pyplot as plt
import matplotlib.patches as ptc
import matplotlib.dates as mdates
from ricercando.significance import hyper_test
from sklearn.mixture import GaussianMixture

# connect to ricercando database
# Alternative hosts are kept below as commented-out assignments;
# exactly one database_ip assignment should be active at a time.
# database_ip = '192.168.249.119'
# database_ip = '192.168.249.123'
database_ip = '192.168.27.75'
# database_ip = 'localhost'
# hand the selected host to the ricercando module
# (presumably used by all later database queries - verify against ricercando)
ric.set_connection_params(host=database_ip)

### Anomaly detection methods 

#### Select detector

In [3]:
# names of the available anomaly detection methods, in dispatch order
method_list = ['baseline', 'rolling window', 'distribution']

def select_detector(method_name=None, df=None, observed_parameter=None, arguments='', *args, **kwargs):
    r"""
    Select anomaly detection method by its name and analyse df.
    If method name is not available, return list of available methods.

    Parameters
    ----------
    method_name : str
        Name of the anomaly detection method (one of method_list).
    df : dataframe
        Dataframe to analyse.
    observed_parameter : str
        Name of the observed parameter column in df.
    arguments : str
        String with arguments for anomaly detection method.
        The string is wrapped in brackets and parsed with ast.literal_eval, so it must be a comma separated
        sequence of Python literals.
        Positional arguments are considered in order as they appear in the string, even if there are keyword
        arguments between them.
        Keyword arguments should be input in string format, e.g. 'key_arg=[1,2,3]'.
        Whitespace around the key and the value is ignored.
        Any positional string argument should not contain equal sign ('=').
        Non-string arguments (numbers, lists, ...) are always positional, even if they contain a string with '='.
        Example content of arguments textbox: 'positional string', 3.14, 'key_arg1=[1,2,3]', '3rd positional argument', 'key_arg2=7', 'key_arg3="key=string"'
    *args, **kwargs
        Arbitrary optional arguments (accepted but not used).

    Returns
    -------
    Result of the selected detection method, or method_list if method_name is not one of the available methods.

    Raises
    ------
    ValueError, SyntaxError
        Raised by ast.literal_eval if arguments (or a keyword value) is not a valid Python literal.
    """
    # convert string arguments to list arg_list and dictionary arg_dict
    arg_list = []
    arg_dict = {}
    if arguments.strip():
        # put all arguments into the list that is represented as string
        for argument in ast.literal_eval('[' + arguments + ']'):
            # only a string can carry a 'key=value' pair; everything else is positional
            if isinstance(argument, str) and '=' in argument:
                key, value = argument.split('=', 1)
                arg_dict[key.strip()] = ast.literal_eval(value.strip())
            else:
                arg_list.append(argument)

    # select method baseline
    if method_name == method_list[0]:
        return baseline(df, observed_parameter, *arg_list, **arg_dict)
    # select method rolling window
    if method_name == method_list[1]:
        return rolling_window(df, observed_parameter, *arg_list, **arg_dict)
    # select method distribution
    if method_name == method_list[2]:
        return distribution(df, observed_parameter, *arg_list, **arg_dict)
    # return list of available methods
    return method_list

In [4]:
# get dataframes with outliers and mark regions with anomalies on plot
def get_df_outliers(df, observed_parameter, outliers, *args, **kwargs):
    r"""
    Get list containing the dataframes with outliers and mark regions with anomalies on plot.

    Consecutive outliers that are at most time_delta apart are grouped into one anomaly region.
    A region is kept (and shaded on the current matplotlib axes) only if it contains at least
    min_df_size outliers. The outliers are iterated in the order of outliers.index.

    Parameters
    ----------
    df : dataframe
        Dataframe to analyse.
    observed_parameter : str
        Name of the observed parameter column in df.
    outliers : series
        Outlier values of the observed parameter, indexed like df.
        An empty series or an empty dataframe means "no outliers".

    optional arguments in args or kwargs
    time_delta : pandas.Timedelta or str
        Anomaly region contains consecutive outliers that are at most time_delta apart.
        Default is 10 minutes.
    min_df_size : int
        Minimal number of samples required for a dataframe with outliers.
        Default is the average number of samples of df in a 10 minute window.

    optional arguments only in kwargs
    before_anomaly : pandas.Timedelta or str
        Dataframe df_around_anomaly contains data from time before_anomaly before the start of anomaly region.
    after_anomaly : pandas.Timedelta or str
        Dataframe df_around_anomaly contains data until time after_anomaly after the end of anomaly region.

    Positional arguments after min_df_size are ignored.

    Returns
    -------
    tuple(df_around_anomaly_list, df_outliers_list)

    df_around_anomaly_list : list of dataframes
        Each dataframe in list contains data before anomaly region + the anomaly region + data after anomaly region.
    df_outliers_list : list of dataframes
        Each dataframe in list contains outliers on some time interval.
    """
    # nothing to group or plot
    if len(outliers) == 0:
        return ([], [])

    edgecolor = '#d5b60a'
    facecolor = '#fffe7a'

    # default values for required arguments
    default_resolution = pd.Timedelta('10min')
    default_before_anomaly = '120min'
    default_after_anomaly = '120min'

    args = list(args)
    # get time_delta value
    if args:
        time_delta = args.pop(0)
    else:
        time_delta = kwargs.pop('time_delta', default_resolution)
    # get min_df_size value
    if args:
        min_df_size = args.pop(0)
    elif 'min_df_size' in kwargs:
        min_df_size = kwargs.pop('min_df_size')
    else:
        # average number of samples in a window of length default_resolution
        time_span = df.index.max() - df.index.min()
        if time_span > pd.Timedelta(0):
            min_df_size = int(np.ceil(df.shape[0] * default_resolution / time_span))
        else:
            # all samples share one timestamp, so every window holds all of them
            min_df_size = df.shape[0]
    # accept both Timedelta objects and strings such as '10min'
    time_delta = pd.Timedelta(time_delta)
    before_anomaly = pd.Timedelta(kwargs.pop('before_anomaly', default_before_anomaly))
    after_anomaly = pd.Timedelta(kwargs.pop('after_anomaly', default_after_anomaly))

    # plot outliers
    plt.scatter(outliers.index, outliers, s=1, color='r', label='outliers')
    axes = plt.gca()

    df_outliers_list = []
    df_around_anomaly_list = []

    def close_region(region, region_start, region_end):
        """Store the finished region and shade it on the plot if it holds enough outliers."""
        if len(region) < min_df_size:
            return
        df_region = df.loc[region]
        df_outliers_list.append(df_region)
        df_around_anomaly_list.append(df.loc[df_region.first_valid_index() - before_anomaly:
                                             df_region.last_valid_index() + after_anomaly])
        # mark region with anomalies on plot; x axis units are days (matplotlib dates)
        min_outlier_value = df_region[observed_parameter].min()
        max_outlier_value = df_region[observed_parameter].max()
        axes.add_patch(ptc.Rectangle(
            (mdates.date2num(region_start), min_outlier_value),
            (region_end - region_start).total_seconds() / (24 * 3600),
            max_outlier_value - min_outlier_value,
            alpha=0.3, linewidth=1, edgecolor=edgecolor, facecolor=facecolor))

    # start first anomaly region
    current_region = []
    start_index = previous_index = outliers.index.min()
    # zero-sized patch whose only purpose is to add label 'anomaly region' to the legend
    axes.add_patch(ptc.Rectangle((mdates.date2num(start_index), plt.ylim()[0]), 0, 0,
                                 linewidth=0, facecolor=facecolor, label='anomaly region'))
    for index in outliers.index:
        # gap to the previous outlier is too large: end anomaly region and start a new one
        if index - time_delta > previous_index:
            close_region(current_region, start_index, previous_index)
            current_region = []
            start_index = index
        current_region.append(index)
        previous_index = index

    # end last anomaly region
    close_region(current_region, start_index, previous_index)

    return (df_around_anomaly_list, df_outliers_list)

#### Baseline detector

In [5]:
def baseline(df, measured_variable, *args, **kwargs):
    r"""
    Determine the baseline of the measured variable regarding independent variables.
    Detect and plot anomalies with respect to the baseline.

    Default values of several options are read from the menu widgets (bl_ind_vars_selectmultiple,
    bl_k1_floatslider, bl_k2_floattext, bl_anomalyresolution_inttext), so those widgets must exist.
    The input dataframe df is not modified.

    Parameters
    ----------
    df : dataframe
        Dataframe to analyse.
    measured_variable : str
        Name of the measured variable column in df.

    optional arguments in args or kwargs
    bl_ind_vars : list of strings
        List of the independent variables, e.g. ['DeviceMode', 'RSSI'].
        Determine the baseline of the measured variable regarding independent variables in list bl_ind_vars.
    bl_method : str
        Method to determine the baseline regarding independent variables:
        'Aggregate' (uses only the first independent variable), 'Random Forest' or
        'Quantile Regression Forest' (default).
    bl_plot_values : bool
        If True plot base values of the measured variable for values of independent variables.
    bl_measured_dec : bool
        True iff measured variable is decreasing, i.e. smaller values are better (e.g. RTT).

    optional arguments only in kwargs
    bl_k1 : numeric
        Coefficient in expression to determine the baseline.
        'Aggregate': every base value is the mean of the best 100*bl_k1 percent samples for each value
        of the independent variable.
        'Quantile Regression Forest': the predicted quantile is 100*bl_k1 if bl_measured_dec is True
        and 100*(1 - bl_k1) otherwise.
    bl_k2 : numeric
        Coefficient in expression to determine outliers.
        If bl_measured_dec is True, sample is an outlier when it is larger than bl_k2 * baseline
        or smaller than (2 - bl_k2) * baseline.
        Otherwise sample is an outlier when it is smaller than bl_k2 * baseline.
    bl_time_delta : pandas.Timedelta
        Anomaly region contains consecutive outliers that are at most time_delta apart.
    bl_min_df_size : int
        Minimal number of samples required for a dataframe with outliers.
    bl_additional_nodes : list of ints
        Additional nodes to train the regressor (used by 'Quantile Regression Forest' only).
    before_anomaly, after_anomaly : pandas.Timedelta or str
        Forwarded to get_df_outliers.

    Returns
    -------
    tuple(df_around_anomaly_list, df_outliers_list) as returned by get_df_outliers,
    or an empty list if the measured variable is categorical, has no valid value,
    or the method name is invalid.

    df_around_anomaly_list : list of dataframes
        Each dataframe in list contains data before anomaly region + the anomaly region + data after anomaly region.
    df_outliers_list : list of dataframes
        Each dataframe in list contains outliers on some time interval.
    """
    # regression cannot be calculated with categorical object
    if isinstance(df[measured_variable].dtype, pd.api.types.CategoricalDtype):
        return []

    # default values for required arguments
    default_ind_vars = list(bl_ind_vars_selectmultiple.value)
    default_method = 'Quantile Regression Forest'
    default_plot_values = False
    default_measured_dec = True
    default_k1 = bl_k1_floatslider.value
    default_k2 = bl_k2_floattext.value
    default_time_delta = pd.Timedelta(str(bl_anomalyresolution_inttext.value) + 'min')
    # average number of samples in a window of length default_time_delta
    time_span = df.index.max() - df.index.min()
    if time_span > pd.Timedelta(0):
        default_min_df_size = int(np.ceil(df.shape[0] * default_time_delta / time_span))
    else:
        default_min_df_size = df.shape[0]
    default_additional_nodes = []

    args = list(args)

    def option(name, default, positional=False):
        """Take the next positional argument if allowed and available, else the keyword, else the default."""
        if positional and args:
            return args.pop(0)
        return kwargs.pop(name, default)

    ind_vars = option('bl_ind_vars', default_ind_vars, positional=True)
    method = option('bl_method', default_method, positional=True)
    plot_values = option('bl_plot_values', default_plot_values, positional=True)
    measured_dec = option('bl_measured_dec', default_measured_dec, positional=True)
    k1 = option('bl_k1', default_k1)
    k2 = option('bl_k2', default_k2)
    time_delta = option('bl_time_delta', default_time_delta)
    min_df_size = option('bl_min_df_size', default_min_df_size)
    additional_nodes = option('bl_additional_nodes', default_additional_nodes)


    # determine and plot the baseline

    # base values of the measured variable for each value of the independent variable
    # every base value is the mean of best k1 * all values in the group
    if method == 'Aggregate':
        ind_var = ind_vars[0]

        def best_mean(values):
            n_best = int(np.ceil(k1 * values.count()))
            best = values.nsmallest(n_best) if measured_dec else values.nlargest(n_best)
            return best.mean()

        df_baseline = df.groupby(ind_var)[measured_variable].agg(best_mean)
        # look up the base value for every sample of df
        df_baseline_pad = df_baseline[df[ind_var].ffill().bfill()]


    # Random Forest
    elif method == 'Random Forest':
        # the measured variable is the target only; using it as a feature as well
        # would let the regressor simply reproduce the signal
        features = df[ind_vars].ffill().bfill()
        targets = df[measured_variable].ffill().bfill()
        # train on the first half of the samples, predict for all of them
        half = df.shape[0] // 2
        rfr = RandomForestRegressor()
        rfr.fit(features.iloc[:half], targets.iloc[:half])
        df_baseline = pd.DataFrame(rfr.predict(features), columns=[measured_variable])
        df_baseline_pad = df_baseline


    # Quantile Regression Forest
    elif method == 'Quantile Regression Forest':

        # no valid value of measured variable
        if df[measured_variable].isnull().all():
            return []

        percentile = 100*k1 if measured_dec else 100*(1 - k1)

        # work on copies so the caller's dataframe keeps its original values
        predict_features = df[ind_vars].ffill().bfill()
        train_features = predict_features.copy()
        target_values = df[measured_variable].ffill().bfill().values.tolist()

        for node in additional_nodes:
            additional_sdf = SelectedDataframe()
            additional_sdf.get_selected_df(str(node), sdf.start_time, sdf.end_time, sdf.freq)

            train_features = pd.concat([train_features, additional_sdf.df[ind_vars].ffill().bfill()],
                                       ignore_index=True)
            target_values += additional_sdf.df[measured_variable].ffill().bfill().values.tolist()

        # transform DeviceMode and EventType to numerical categories;
        # one shared encoding is used for training and prediction so the codes agree
        for column in ('DeviceMode', 'EventType'):
            if column in ind_vars:
                labels = pd.concat([train_features[column], predict_features[column]],
                                   ignore_index=True).astype('category').cat.categories
                codes = dict(zip(labels, preprocessing.LabelEncoder().fit_transform(labels)))
                train_features[column] = train_features[column].astype(object).map(codes)
                predict_features[column] = predict_features[column].astype(object).map(codes)

        rfqr = RandomForestQuantileRegressor()
        rfqr.fit(train_features, target_values)
        df_baseline = pd.DataFrame(rfqr.predict(predict_features, quantile=percentile),
                                   columns=[measured_variable])
        df_baseline_pad = df_baseline


    # invalid method
    else:
        return []


    plt.plot(df.index, df_baseline_pad, color='black', label='baseline')

    # find and plot outliers

    # baseline values are in sample order, so they are compared with the samples by position
    observed = df[measured_variable]
    base_values = pd.Series(np.ravel(df_baseline_pad), index=df.index)
    if measured_dec:
        # NOTE(review): samples much better than the baseline are flagged as well - confirm this is intended
        mask = (observed > k2 * base_values) | (observed < (2 - k2) * base_values)
    else:
        mask = observed < k2 * base_values
    outliers = observed[mask]

    # get dataframes with outliers and mark regions with anomalies on plot
    anomaly_kwargs = {key: kwargs[key] for key in ('before_anomaly', 'after_anomaly') if key in kwargs}
    df_outliers_list = get_df_outliers(df, measured_variable, outliers, time_delta, min_df_size, **anomaly_kwargs)
    plt.legend(loc='upper right')


    # plot base values of the measured variable for values of independent variables

    if plot_values:
        plt.figure(num='node id={0}, iccid={1} baseline'.format(df['NodeId'][df['NodeId'].first_valid_index()],
                                                                df['Iccid'][df['Iccid'].first_valid_index()]), tight_layout=True)
        if isinstance(df_baseline.index, pd.CategoricalIndex):
            plt.scatter(df_baseline.index.categories.astype('str'), df_baseline)
            plt.xticks(range(len(df_baseline.index.categories)), df_baseline.index.categories)
        else:
            plt.scatter(df_baseline.index, df_baseline)
        plt.xlabel(ind_vars[0])
        plt.ylabel(measured_variable)

    return df_outliers_list

#### Rolling window detector

In [6]:
def rolling_window(df, observed_parameter, *args, **kwargs):
    r"""
    Compute rolling window calculations and plot anomalies.

    Default values of several options are read from the menu widgets (rw_method_radiobuttons,
    rw_anomalyresolution_inttext, rw_outlierseparation_floattext, rw_anomalysize_floattext,
    rw_outliertolerance_floattext), so those widgets must exist.

    Parameters
    ----------
    df : dataframe
        Dataframe to analyse.
    observed_parameter : str
        Name of the observed parameter column in df.

    optional arguments in args or kwargs
    rw_calc : str
        Rolling calculation to perform: 'std', 'mean' or 'dist'.
    rw_res : str in pandas.Timedelta format, e.g. '10min'
        Temporal resolution of anomalies.
        Minimal length of the time interval that contains anomalies.
    rw_k1 : numeric
        Coefficient in expression to determine maximal separation between consecutive outliers in anomaly region.
        maximal separation = rw_k1 * rw_res
    rw_k2 : numeric
        Coefficient in expression to determine minimal size of a dataframe with outliers.
        min_df_size = rw_k2 * number of samples in a window of size rw_res

    optional arguments only in kwargs
    rw_k1_std : numeric
        Coefficient in expression to determine rolling window size when rw_calc=std.
        rolling window size = rw_k1_std * rw_res
    rw_k2_std : numeric
        Coefficient in expression to determine outliers when rw_calc=std.
        Sample is an outlier when abs(value(sample) - rw_mean) > rw_k2_std * rw_std.
    rw_k1_mean : numeric
        Coefficient in expression to determine outliers when rw_calc=mean.
        Sample is an outlier when rw_mean - (rw_mean shifted by 1) < rw_k1_mean * rw_mean.
    rw_k1_dist : numeric
        Coefficient in expression to determine rolling window size when rw_calc=dist.
        rolling window size = rw_k1_dist * rw_res
    before_anomaly, after_anomaly : pandas.Timedelta or str
        Forwarded to get_df_outliers.

    Any remaining positional and keyword arguments are passed to pandas rolling().

    Returns
    -------
    tuple(df_around_anomaly_list, df_outliers_list) as returned by get_df_outliers,
    or an empty list if the observed parameter is categorical or the calculation name is invalid.

    df_around_anomaly_list : list of dataframes
        Each dataframe in list contains data before anomaly region + the anomaly region + data after anomaly region.
    df_outliers_list : list of dataframes
        Each dataframe in list contains outliers on some time interval.
    """
    linecolor = '#d8dcd6'
    bold_linecolor = '#000000'

    # statistic cannot be calculated on categorical object
    if isinstance(df[observed_parameter].dtype, pd.api.types.CategoricalDtype):
        return []

    # default values for required arguments
    default_calculation = rw_method_radiobuttons.value
    default_resolution = str(rw_anomalyresolution_inttext.value) + 'min'
    default_k1 = rw_outlierseparation_floattext.value
    default_k2 = rw_anomalysize_floattext.value
    default_k1_std = 10
    default_k2_std = rw_outliertolerance_floattext.value
    default_k1_mean = rw_outliertolerance_floattext.value
    default_k1_dist = 6

    args = list(args)

    def option(name, default, positional=False):
        """Take the next positional argument if allowed and available, else the keyword, else the default."""
        if positional and args:
            return args.pop(0)
        return kwargs.pop(name, default)

    calculation = option('rw_calc', default_calculation, positional=True)
    resolution = pd.Timedelta(option('rw_res', default_resolution, positional=True))
    k1 = option('rw_k1', default_k1, positional=True)
    k2 = option('rw_k2', default_k2, positional=True)
    # all detector options are taken out of kwargs here, because whatever is left
    # in args and kwargs is handed to pandas rolling()
    k1_std = option('rw_k1_std', default_k1_std)
    k2_std = option('rw_k2_std', default_k2_std)
    k1_mean = option('rw_k1_mean', default_k1_mean)
    k1_dist = option('rw_k1_dist', default_k1_dist)
    anomaly_kwargs = {key: kwargs.pop(key) for key in ('before_anomaly', 'after_anomaly') if key in kwargs}


    # compute rolling window calculation

    # calculate rolling standard deviation
    # sample is an outlier when abs(value(sample) - rw_mean) > k2_std * rw_std
    if calculation == 'std':
        rolling = df[observed_parameter].rolling(k1_std * resolution, *args, **kwargs)
        rw_mean = rolling.mean()
        rw_std = rolling.std()

        # samples not between rw_high and rw_low are outliers
        rw_high = rw_mean + k2_std * rw_std
        rw_low = rw_mean - k2_std * rw_std
        outliers = df[observed_parameter][(df[observed_parameter] - rw_mean).abs() > k2_std * rw_std]

        # plot rolling window calculation
        plt.fill_between(rw_high.index, rw_high, rw_low, alpha=0.5, facecolor=linecolor, label='rolling ' + calculation)

    # calculate rolling mean
    elif calculation == 'mean':
        rw_mean = df[observed_parameter].rolling(resolution, *args, **kwargs).mean()
        # NOTE(review): an earlier, commented-out version of this condition was
        # abs(rw_mean.shift(1) - rw_mean) > k1_mean * rw_mean (rw_mean changes by more than a factor of k1_mean);
        # the active condition below selects samples where rw_mean rises by less than that - confirm which is intended
        mask = rw_mean - rw_mean.shift(1) < k1_mean * rw_mean
        outliers = df[observed_parameter][mask]

        # plot rolling window calculation
        plt.plot(rw_mean.index, rw_mean, color=linecolor, label='rolling ' + calculation)
        plt.plot(rw_mean.index, rw_mean.where(mask), color=bold_linecolor, label='_nolegend_')

    # calculate rolling distribution
    elif calculation == 'dist':
        # the distribution of every window is evaluated, but the result is not used yet;
        # this calculation reports no outliers
        offset = k1_dist * resolution
        last_index = df.index.max()
        start_index = df.index.min()
        end_index = start_index + offset
        while end_index <= last_index:
            kernel = stats.gaussian_kde(df[observed_parameter][start_index : end_index].fillna(0))
            stats.kstest(kernel.dataset, 'norm')

            start_index += resolution
            end_index = start_index + offset

        outliers = pd.DataFrame()

    # invalid calculation
    else:
        return []

    # get dataframes with outliers and mark regions with anomalies on plot
    time_delta = k1 * resolution
    # average number of samples in a window of length resolution
    time_span = df.index.max() - df.index.min()
    if time_span > pd.Timedelta(0):
        samples_per_window = np.ceil(df.shape[0] * resolution / time_span)
    else:
        samples_per_window = df.shape[0]
    min_df_size = int(k2 * samples_per_window)
    df_outliers_list = get_df_outliers(df, observed_parameter, outliers, time_delta, min_df_size, **anomaly_kwargs)

    plt.legend(loc='upper right')
    return df_outliers_list

#### Distribution detector

In [7]:
def distribution(df, measured_variable, *args, **kwargs):
    r"""
    Plot the distribution of the measured variable in a new figure.

    A histogram with bins of width 0.1 is drawn, together with a kernel density estimate
    on a secondary y axis. The figure is named after the first valid NodeId and Iccid in df.

    Parameters
    ----------
    df : dataframe
        Dataframe to analyse. Must contain columns NodeId and Iccid.
    measured_variable : str
        Name of the measured variable column in df.
    *args, **kwargs
        Arbitrary optional arguments (accepted but not used).

    Returns
    -------
    Empty list if the measured variable has no valid value, None otherwise.
    """
    # no valid value of measured variable
    if df[measured_variable].isnull().all():
        return []

    # left edges of bins + rightmost edge
    # floor/ceil on the scaled values make the edges cover the whole value range,
    # so no sample is left out of the histogram
    ind_sequence_scale = 10
    first_edge = int(np.floor(df[measured_variable].min() * ind_sequence_scale))
    last_edge = int(np.ceil(df[measured_variable].max() * ind_sequence_scale))
    # a histogram needs at least two edges
    if last_edge == first_edge:
        last_edge += 1
    ind_sequence = [value/ind_sequence_scale for value in range(first_edge, last_edge + 1)]

    plt.figure(num='node id={0}, iccid={1} distribution'.format(df['NodeId'][df['NodeId'].first_valid_index()],
                                                                df['Iccid'][df['Iccid'].first_valid_index()]), tight_layout=True)
    ax = df[measured_variable].plot(kind='hist', bins=ind_sequence)
    # density estimation is not possible when all values are equal
    if df[measured_variable].nunique() > 1:
        df[measured_variable].plot(kind='kde', ind=ind_sequence, ax=ax, secondary_y=True)
    ax.set_xlabel(measured_variable)

### Significance analysis

In [8]:
def significance_test(df, rised=False, middle='median', min_count=1000):
    r"""
    Find values of categorical columns that correlate with a change of RTT.

    For every Iccid the samples are divided into a selected set and the rest,
    then every categorical column is tested with hyper_test against that division.

    Parameters
    ----------
    df : dataframe
        Dataframe to analyse. Must contain columns Iccid and RTT (and RTT_OK, see middle).
    rised : bool
        True: explain why RTT rose (selected samples have RTT above the dividing value).
        False: explain why RTT fell (selected samples have RTT below the dividing value).
    middle : numeric or str
        How the samples are divided.
        number: samples above / below this RTT value are selected.
        'Gaus': two Gaussian components are fitted on (time, RTT); samples assigned to component 1 are selected.
        The numbering of the components is arbitrary and rised is not used in this mode.
        anything else (default 'median'): if rised is True, samples with RTT above the group median are selected;
        if rised is False, samples whose RTT_OK value is false are selected (the median is not used).
    min_count : numeric
        Only rows of the result with count larger than min_count are returned.

    Returns
    -------
    dataframe
        Concatenated hyper_test results sorted by column enrichment (descending),
        first column is named 'Iccid,Variable'.
        Empty dataframe if no test produced a result.
    """
    excluded_columns = ['RTT', 'NodeId', 'Iccid', 'MCC_MNC', 'Interface', 'Error']
    result = []

    for iccid, group in df.groupby('Iccid'):
        # grouping by a categorical column may yield empty groups
        if group.empty:
            continue

        if isinstance(middle, str) and middle == 'Gaus':
            X = group.reset_index()[['time','RTT']].apply(lambda row: pd.Series ((row.time.value,row.RTT)), axis=1).fillna(0).values
            rtt_ok = pd.Series([x == 1 for x in GaussianMixture(n_components=2).fit(X).predict(X)], index=group.index)
        elif isinstance(middle, (int, float, np.integer, np.floating)):
            rtt_ok = group['RTT'] > middle if rised else group['RTT'] < middle
        elif rised:
            rtt_ok = group['RTT'] > group['RTT'].median()
        else:
            rtt_ok = ~group['RTT_OK'].astype(bool)
        rtt_ok.name = None

        categorical_columns = [g for g in group.columns
                               if g not in excluded_columns and group[g].dtype.name == 'category']
        for col in categorical_columns:
            temp = hyper_test(group[col], rtt_ok).reset_index()
            if temp.empty:
                continue
            # label every row with the Iccid, the column name and the tested value
            first_column = temp.columns[0]
            prefix = str(iccid) + ',' + col + '='
            temp[first_column] = [prefix + str(value[0]) for value in temp[first_column]]
            temp.rename(columns={first_column: 'Iccid,Variable'}, inplace=True)
            result.append(temp)

    # pd.concat cannot handle an empty list
    if not result:
        return pd.DataFrame()

    temp = pd.concat(result).sort_values(by='enrichment', ascending=False)
    return temp[temp['count'] > min_count]

In [None]:
# Find column correlation explaining why RTT changed. The samples are divided by column RTT_OK; parameters growth and middle are currently not used by the function.
from ricercando.significance import (hyper_test)
def test_columns(df, growth=True, middle=-5000):
    r"""
    Run hyper_test for every non-excluded column against the anomalous samples.

    A sample is treated as anomalous when its value in column 'RTT_OK' is falsy.
    The boolean selection built this way is passed to hyper_test together with
    each analysed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples to analyse. Must contain column 'RTT_OK'.
    growth : bool, optional
        Not used by the current implementation; kept so existing calls keep working.
    middle : int, optional
        Not used by the current implementation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Concatenated hyper_test results with the first column renamed to 'Variable'
        (formatted as 'column=value'), sorted by 'enrichment' in descending order and
        restricted to rows with 'count' > 1000. An empty dataframe is returned when
        no column produced a result.
    """
    # columns that identify the sample or hold the observed value itself are not analysed
    excluded = ('RTT_OK', 'RTT', 'NodeId', 'Iccid', 'MCC_MNC', 'Interface', 'Error')

    # True for anomalous samples (RTT_OK is falsy)
    is_anomalous = df.apply(lambda row: not row['RTT_OK'], axis=1)

    result = []
    for col in df.columns:
        if col in excluded:
            continue
        temp = hyper_test(df[col], is_anomalous).reset_index()
        if temp.empty:
            continue
        temp.rename(columns={temp.columns[0]: 'Variable'}, inplace=True)
        temp['Variable'] = temp.apply(lambda row: col + '=' + str(row['Variable'][0]), axis=1)
        result.append(temp)

    # pd.concat raises ValueError on an empty list, so report "nothing found" explicitly
    if not result:
        return pd.DataFrame()

    temp = pd.concat(result).sort_values(by='enrichment', ascending=False)
    return temp[temp['count'] > 1000]

# Show the identifying columns of the first anomaly dataframe in an interactive grid.
# Requires sdf to exist and at least one anomaly to have been detected ([0] fails on an empty list).
display(qgrid.show_grid(sdf.get_anomalies_df()[0][['Iccid', 'EventType', 'RTT', 'RTT_OK']], grid_options={'forceFitColumns': False}))
# display (pd.concat([sdf.get_anomalies_df()[0].iloc[sdf.get_anomalies_df()[0].shape[0] // 2 : 3*sdf.get_anomalies_df()[0].shape[0] // 4],
#                     sdf.get_anomalies_df()[1].iloc[sdf.get_anomalies_df()[1].shape[0] // 2 : 3*sdf.get_anomalies_df()[1].shape[0] // 4]]))

# Find out why RTT has spiked: run test_columns on all anomaly dataframes concatenated together.
# The live call below is the last expression of the cell, so its grid is rendered as the cell output.
# The commented-out calls are earlier experiments on subsets of the anomaly dataframes.
# test_columns (sdf.get_anomalies_df()[0])
# qgrid.show_grid(test_columns (pd.concat(sdf.get_anomalies_df()), middle=90), grid_options={'forceFitColumns': False})
qgrid.show_grid(test_columns (pd.concat(sdf.get_anomalies_df())), grid_options={'forceFitColumns': False})
# qgrid.show_grid(test_columns (pd.concat([sdf.get_anomalies_df()[0].iloc[sdf.get_anomalies_df()[0].shape[0] // 2 : 3*sdf.get_anomalies_df()[0].shape[0] // 4],
#                                          sdf.get_anomalies_df()[1].iloc[0]])),
# qgrid.show_grid(test_columns (pd.concat([sdf.get_anomalies_df()[0],
#                                          sdf.get_anomalies_df()[1].iloc[0]])),
#                                         sdf.get_anomalies_df()[1].iloc[sdf.get_anomalies_df()[1].shape[0] // 2 : 3*sdf.get_anomalies_df()[1].shape[0] // 4]])),
#                 grid_options={'forceFitColumns': False})

### Menu widgets

In [9]:
# Layout widths shared by the menu widgets: label1 is used in column 1, label23 in columns 2 and 3,
# widget12 for the input widgets of columns 1 and 2 (and for the spacer columns of the menu).
label1_width = '130px'
label23_width = '110px'
widget12_width = '150px'
# Initial selections shown when the menu is first displayed.
# NOTE(review): ipywidgets rejects a Dropdown value that is not among its options, so these defaults
# presumably have to exist in the connected database - confirm for new deployments.
nodeid_default = '601'
# nodeid_default = '582'
observedparameter_default = 'RTT'
startdate_default = dt.datetime(2018, 6, 3)
# could be startdate_default = dt.date(2017, 10, 13) with newer packages

# widgets for selecting dataframe
# column 1

# node whose data is analysed; options come from the database
nodeid_dropdown = ipw.Dropdown(options=ric.all_nodes(), value=nodeid_default, layout=ipw.Layout(width=widget12_width))
nodeid_box = ipw.HBox([ipw.Label('node id', layout=ipw.Layout(width=label1_width)), nodeid_dropdown])

# parameter to plot and scan; options are the column names of a one-row query over the node's tables
observedparameter_dropdown = ipw.Dropdown(options=list(ric.getdf(ric.tables_for_node(nodeid_dropdown.value), nodeid=nodeid_dropdown.value, limit=1)),
                                          value=observedparameter_default, layout=ipw.Layout(width=widget12_width))
observedparameter_box = ipw.HBox([ipw.Label('observed parameter', layout=ipw.Layout(width=label1_width)), observedparameter_dropdown])

# the option labels are translated to freq values through SelectedDataframe.freq_map
temporalresolution_dropdown = ipw.Dropdown(options=['default', '10 milliseconds', '1 second', '1 minute', '30 minutes'],
                                           value='1 minute', layout=ipw.Layout(width=widget12_width))
temporalresolution_box = ipw.HBox([ipw.Label('temporal resolution', layout=ipw.Layout(width=label1_width)), temporalresolution_dropdown])

# column 2

# start date, start hour and time span together define the queried time interval
startdate_datepicker = ipw.DatePicker(value=startdate_default, layout=ipw.Layout(width=widget12_width))
startdate_box = ipw.HBox([ipw.Label('start date', layout=ipw.Layout(width=label23_width)), startdate_datepicker])

starthour_intslider = ipw.IntSlider(min=0, max=23, step=1, value=0, continuous_update=False, layout=ipw.Layout(width=widget12_width))
starthour_box = ipw.HBox([ipw.Label('start hour', layout=ipw.Layout(width=label23_width)), starthour_intslider])

timespan_inttext = ipw.IntText(value=24, continuous_update=False, layout=ipw.Layout(width=widget12_width))
timespan_box = ipw.HBox([ipw.Label('time span [hours]', layout=ipw.Layout(width=label23_width)), timespan_inttext])

# widgets for selecting anomaly detection method
# column 3

# 'none' disables detection; method_list is defined in the anomaly detection cells and must be non-empty
detector_dropdown = ipw.Dropdown(options=['none'] + method_list, value=method_list[0])
detector_box = ipw.HBox([ipw.Label('anomaly detector', layout=ipw.Layout(width=label23_width)), detector_dropdown])

# free-text optional arguments handed to the selected detector
arguments_text = ipw.Text(placeholder='arguments for detector', continuous_update=False)
arguments_box = ipw.HBox([ipw.Label('arguments', layout=ipw.Layout(width=label23_width)), arguments_text])

showplot_button = ipw.Button(description='show plot')

# column 4

# when ticked, plot_df additionally displays the significance analysis of the detected anomalies
significanceanalysis_checkbox = ipw.Checkbox(value=False, description='significance analysis')

# when ticked, the analysis is repeated for every node that shares an operator with the selected node
sameoperators_checkbox = ipw.Checkbox(value=False, description='all nodes with same operators')


def handle_widget_change_new_df(change):
    r"""
    Callback for widgets whose value decides which data is queried.

    Parameters
    ----------
    change : object
        Notification supplied by ipywidgets; it is not inspected.
    """
    selected = sdf
    # query the dataframe for the current widget values, then redraw the plots
    selected.get_selected_df()
    selected.plot_df()


def handle_widget_change_keep_df(change):
    r"""
    Callback for widgets that change how the data is presented or analysed.

    Despite its name the handler currently queries the dataframe again on every
    call and then replots it; the variant that reused the already loaded
    dataframe is disabled.

    Parameters
    ----------
    change : object
        Notification supplied by ipywidgets (or the clicked button); it is not inspected.
    """
    selected = sdf
    selected.get_selected_df()
    selected.plot_df()


def handle_nodeid_change(change):
    r"""
    Observe the change of nodeid Dropdown value.

    A different node is selected, so the list of its observable parameters (columns of a
    one-row query over tables_for_node) is reloaded. The previously selected parameter is
    kept when the new node offers it; otherwise the default parameter or, failing that,
    the first available one is selected. Finally the dataframe of the new node is queried
    and plotted.

    Parameters
    ----------
    change : object
        Notification supplied by ipywidgets; it is not inspected.
    """
    observedparameter_current = observedparameter_dropdown.value
    # temporarily unobserve handle_widget_change_keep_df, otherwise it is executed after this function
    observedparameter_dropdown.unobserve(handle_widget_change_keep_df, names='value')
    try:
        available = list(ric.getdf(ric.tables_for_node(nodeid_dropdown.value), nodeid=nodeid_dropdown.value, limit=1))
        observedparameter_dropdown.options = available
        if not available:
            observedparameter_dropdown.value = None
        elif observedparameter_current in available:
            observedparameter_dropdown.value = observedparameter_current
        elif observedparameter_default in available:
            observedparameter_dropdown.value = observedparameter_default
        else:
            observedparameter_dropdown.value = available[0]
    finally:
        # always re-register the callback; without this a failed query would leave the
        # observed parameter dropdown without its handler for the rest of the session
        observedparameter_dropdown.observe(handle_widget_change_keep_df, names='value')

    sdf.node_id = nodeid_dropdown.value
    sdf.get_selected_df()
    sdf.plot_df()


# register callbacks executed on value change
# The node dropdown has its own handler because the list of observable parameters depends on the node.
nodeid_dropdown.observe(handle_nodeid_change, names='value')
observedparameter_dropdown.observe(handle_widget_change_keep_df, names='value')
temporalresolution_dropdown.observe(handle_widget_change_new_df, names='value')
startdate_datepicker.observe(handle_widget_change_new_df, names='value')
starthour_intslider.observe(handle_widget_change_new_df, names='value')
timespan_inttext.observe(handle_widget_change_new_df, names='value')
detector_dropdown.observe(handle_widget_change_keep_df, names='value')
arguments_text.observe(handle_widget_change_keep_df, names='value')
# a button has no 'value' trait, so it is wired through on_click instead of observe
showplot_button.on_click(handle_widget_change_keep_df)
significanceanalysis_checkbox.observe(handle_widget_change_keep_df, names='value')
sameoperators_checkbox.observe(handle_widget_change_keep_df, names='value')


# Menu layout: four columns of widgets separated by empty VBoxes that act as fixed-width spacers.
menu_widgets = ipw.HBox([ipw.VBox([nodeid_box, observedparameter_box, temporalresolution_box]), ipw.VBox(layout=ipw.Layout(width=widget12_width)),
                         ipw.VBox([startdate_box, starthour_box, timespan_box]), ipw.VBox(layout=ipw.Layout(width=widget12_width)),
                         ipw.VBox([detector_box, arguments_box, showplot_button]), ipw.VBox(layout=ipw.Layout(width=widget12_width)),
                         ipw.VBox([significanceanalysis_checkbox, sameoperators_checkbox])])

In [10]:
# baseline parameters
# Default values of the baseline detector widgets defined below.
bl_default_ind_vars = ['RSRP', 'RSRQ', 'RSSI']
bl_default_k1 = .1
bl_default_k2 = 1.5
bl_default_resolution = 5

# Layout widths used by the baseline widgets (labels, input widgets and spacer columns).
bl_label1_width = '130px'
bl_label2_width = '80px'
bl_label3_width = '200px'
bl_widget_width = '150px'
bl_space_width = '50px'

bl_title_box = ipw.HBox([ipw.Label('baseline parameters:')])

# Options are the column names of a one-row query over the tables of the node selected when this cell runs.
# NOTE(review): every name in bl_default_ind_vars presumably has to be among those columns, otherwise
# ipywidgets rejects the initial value - confirm for nodes without these attributes.
bl_ind_vars_selectmultiple = ipw.SelectMultiple(options=list(ric.getdf(ric.tables_for_node(nodeid_dropdown.value), nodeid=nodeid_dropdown.value, limit=1)),
                                                value=bl_default_ind_vars, layout=ipw.Layout(width=bl_widget_width))
bl_ind_vars_box = ipw.HBox([ipw.Label('independent variables', layout=ipw.Layout(width=bl_label1_width)), bl_ind_vars_selectmultiple])

# slider restricted to the interval [0, 1] in steps of 0.1
bl_k1_floatslider = ipw.FloatSlider(value=bl_default_k1, min=0, max=1, step=.1, continuous_update=False, layout=ipw.Layout(width=bl_widget_width))
bl_k1_box = ipw.HBox([ipw.Label('percentile', layout=ipw.Layout(width=bl_label2_width)), bl_k1_floatslider])

bl_k2_floattext = ipw.FloatText(value=bl_default_k2, step=.1, continuous_update=False, layout=ipw.Layout(width=bl_widget_width))
bl_k2_box = ipw.HBox([ipw.Label('outlier tolerance', layout=ipw.Layout(width=bl_label1_width)), bl_k2_floattext])

# resolution is entered in minutes (see the label)
bl_anomalyresolution_inttext = ipw.IntText(value=bl_default_resolution, continuous_update=False, layout=ipw.Layout(width=bl_widget_width))
bl_anomalyresolution_box = ipw.HBox([ipw.Label('anomaly resolution [minutes]', layout=ipw.Layout(width=bl_label3_width)), bl_anomalyresolution_inttext])


def handle_bl_widget_change(change):
    r"""
    Callback for the baseline parameter widgets.

    Parameters
    ----------
    change : object
        Notification supplied by ipywidgets; it is not inspected.
    """
    selected = sdf
    # query the dataframe again and redraw it with the new baseline parameters
    selected.get_selected_df()
    selected.plot_df()


# register callbacks executed on value change
# Every baseline parameter uses the same handler.
bl_ind_vars_selectmultiple.observe(handle_bl_widget_change, names='value')
bl_k1_floatslider.observe(handle_bl_widget_change, names='value')
bl_k2_floattext.observe(handle_bl_widget_change, names='value')
bl_anomalyresolution_inttext.observe(handle_bl_widget_change, names='value')


# Baseline panel: a title row above one row of parameter boxes separated by fixed-width spacers.
baseline_widgets = ipw.VBox([bl_title_box,
                             ipw.HBox([ipw.VBox([bl_ind_vars_box]), ipw.VBox(layout=ipw.Layout(width=bl_space_width)),
                                      ipw.VBox([bl_k1_box]), ipw.VBox(layout=ipw.Layout(width=bl_space_width)),
                                      ipw.VBox([bl_k2_box]), ipw.VBox(layout=ipw.Layout(width=bl_space_width)),
                                      ipw.VBox([bl_anomalyresolution_box])])])

In [11]:
# rolling window parameters
# Default values of the rolling window detector widgets defined below.
rw_default_method = 'std'
rw_default_outlierseparation = .5
rw_default_anomalysize = .15
rw_default_outliertolerance = 1.6
rw_default_resolution = 60
# alternative set of defaults, currently disabled
# rw_default_outlierseparation = .1
# rw_default_anomalysize = 1.0
# rw_default_outliertolerance = .5
# rw_default_resolution = 10

# Layout widths used by the rolling window widgets (labels, input widgets and spacer columns).
rw_label1_width = '80px'
rw_label2_width = '130px'
rw_label3_width = '200px'
rw_widget1_width = '100px'
rw_space_width = '50px'

rw_title_box = ipw.HBox([ipw.Label('rolling window parameters:')])

# choice between the two offered methods, 'std' and 'mean'
rw_method_radiobuttons = ipw.RadioButtons(options=['std', 'mean'], value=rw_default_method, layout=ipw.Layout(width=rw_widget1_width))
rw_method_box = ipw.HBox([ipw.Label('method', layout=ipw.Layout(width=rw_label1_width)), rw_method_radiobuttons])

rw_outlierseparation_floattext = ipw.FloatText(value=rw_default_outlierseparation, step=.1, continuous_update=False, layout=ipw.Layout(width=rw_widget1_width))
rw_outlierseparation_box = ipw.HBox([ipw.Label('max outlier separation', layout=ipw.Layout(width=rw_label2_width)), rw_outlierseparation_floattext])

rw_anomalysize_floattext = ipw.FloatText(value=rw_default_anomalysize, step=.1, continuous_update=False, layout=ipw.Layout(width=rw_widget1_width))
rw_anomalysize_box = ipw.HBox([ipw.Label('min anomaly size', layout=ipw.Layout(width=rw_label2_width)), rw_anomalysize_floattext])

rw_outliertolerance_floattext = ipw.FloatText(value=rw_default_outliertolerance, step=.1, continuous_update=False, layout=ipw.Layout(width=rw_widget1_width))
rw_outliertolerance_box = ipw.HBox([ipw.Label('outlier tolerance', layout=ipw.Layout(width=rw_label2_width)), rw_outliertolerance_floattext])

# resolution is entered in minutes (see the label)
rw_anomalyresolution_inttext = ipw.IntText(value=rw_default_resolution, continuous_update=False, layout=ipw.Layout(width=rw_widget1_width))
rw_anomalyresolution_box = ipw.HBox([ipw.Label('anomaly resolution [minutes]', layout=ipw.Layout(width=rw_label3_width)), rw_anomalyresolution_inttext])


def handle_rw_widget_change(change):
    r"""
    Callback for the rolling window parameter widgets.

    Parameters
    ----------
    change : object
        Notification supplied by ipywidgets; it is not inspected.
    """
    selected = sdf
    # query the dataframe again and redraw it with the new rolling window parameters
    selected.get_selected_df()
    selected.plot_df()


# register callbacks executed on value change
# Every rolling window parameter uses the same handler.
rw_method_radiobuttons.observe(handle_rw_widget_change, names='value')
rw_outlierseparation_floattext.observe(handle_rw_widget_change, names='value')
rw_anomalysize_floattext.observe(handle_rw_widget_change, names='value')
rw_outliertolerance_floattext.observe(handle_rw_widget_change, names='value')
rw_anomalyresolution_inttext.observe(handle_rw_widget_change, names='value')


# Rolling window panel: a title row above one row of parameter boxes separated by fixed-width spacers.
rollingwindow_widgets = ipw.VBox([rw_title_box,
                                  ipw.HBox([ipw.VBox([rw_method_box]), ipw.VBox(layout=ipw.Layout(width=rw_space_width)),
                                            ipw.VBox([rw_outlierseparation_box]), ipw.VBox(layout=ipw.Layout(width=rw_space_width)),
                                            ipw.VBox([rw_anomalysize_box]), ipw.VBox(layout=ipw.Layout(width=rw_space_width)),
                                            ipw.VBox([rw_outliertolerance_box]), ipw.VBox(layout=ipw.Layout(width=rw_space_width)),
                                            ipw.VBox([rw_anomalyresolution_box])])])

### Get and plot selected dataframe

In [12]:
class SelectedDataframe():
    r"""
    Contains the selected dataframe to analyse.

    The methods read the menu widgets of the notebook (node id, observed parameter, dates,
    detector, checkboxes) and the database module ric when they are called, so those have
    to be defined before an instance is used.
    """

    # label of the temporal resolution dropdown -> value passed as freq to ric.getdf
    freq_map = {'default': None, '10 milliseconds': '10ms', '1 second': '1s', '1 minute': '1m', '30 minutes': '30m'}

    def __init__(self, nodeid=None, operators=None, cleandata=True, clearoutput=True):
        r"""
        Parameters
        ----------
        nodeid : int or str, optional
            Node to analyse. When None, the current value of the node id dropdown is used.
        operators : iterable of str, optional
            When given, plot_df only handles interfaces that have at least one of these operators.
        cleandata : bool, optional
            Whether get_selected_df runs clean_data on the queried data.
        clearoutput : bool, optional
            Whether plot_df clears the cell output and closes open figures before plotting.
        """
        self.df = None                  # dataframe with the data of the whole node
        self.df_group = None            # self.df grouped by 'Iccid', None when there is no such column
        self.df_iccid_dict = {}         # iccid -> dataframe of that interface
        self.df_anomalies_dict = {}     # iccid -> result of select_detector, filled by plot_df
        self.node_id = nodeid_dropdown.value if nodeid is None else str(nodeid)
        self.start_time = None
        self.end_time = None
        self.freq = None
        self.operators = operators
        self.clean_data_bool = cleandata
        self.clear_output = clearoutput
        self.nodes_sameoperators = []   # nodes sharing an operator with this node, filled by get_selected_df


    @staticmethod
    def _widgets_need_redisplay():
        r"""Return True when the installed ipywidgets major version is greater than 6."""
        # parse the whole major version; the first character alone misreads versions >= 10
        return int(ipw.__version__.split('.')[0]) > 6


    def groupby_iccid(self):
        r"""Group df by column 'Iccid' and rebuild df_iccid_dict from the groups."""
        # start from an empty dict so that interfaces of a previously loaded dataframe
        # (e.g. before restore_df) are not plotted or stored together with the new ones
        self.df_iccid_dict = {}
        if 'Iccid' in self.df.columns:
            self.df_group = self.df.groupby('Iccid')
            self.df_iccid_dict = {iccid: df_iccid for iccid, df_iccid in self.df_group}
        else:
            self.df_group = None


    def store_df(self):
        r"""Store df to file 'stored_df' (pickle)."""
        pd.to_pickle(self.df, 'stored_df')


    def store_csv(self):
        r"""Store df to 'stored_csv.csv' and the dataframe of every iccid to 'stored_csv_<iccid>.csv'."""
        self.df.to_csv('stored_csv.csv')
        for iccid, df_iccid in self.df_iccid_dict.items():
            df_iccid.to_csv('stored_csv_{0}.csv'.format(iccid))


    def restore_df(self, filename='stored_df'):
        r"""
        Restore df from a pickle file and regroup it by iccid.

        A missing file is reported with a printed message instead of an exception.
        """
        try:
            # NOTE: unpickling executes code embedded in the file - only load files you created yourself
            self.df = pd.read_pickle(filename)
            self.groupby_iccid()
        except FileNotFoundError:
            print('File {0} not found.'.format(filename))


    def clean_data(self):
        r"""
        Clean the data in dataframes for each iccid.

        For interfaces with column 'EventType' the rows of table 'event' are merged in, the value
        'Scheduling.Task.Started' is propagated until the matching 'Scheduling.Task.Stopped', and
        every value that does not start with 'Scheduling' is removed. Missing values of the
        relevant attributes are filled forward and then backward. Afterwards self.df is rebuilt
        from the cleaned dataframes, with the former index stored as an ordinary column.
        """
        relevant_attributes = ['DeviceMode', 'RSSI', 'RSRQ', 'Frequency', 'CPU_Apps', 'CPU_User', 'CumUptime',
                               'Swap', 'Uptime', 'IP_Address', 'MCC_MNC', 'Host', 'Operator', 'CID']

        temp_df_iccid_dict = {}
        # the event table does not depend on the iccid, so it is queried at most once
        df_event = None
        self.df = pd.DataFrame()
        for iccid, df_iccid in self.df_iccid_dict.items():

            # get correct values of column EventType
            if 'EventType' in df_iccid:

                if df_event is None:
                    df_event = ric.getdf('event', nodeid=self.node_id, limit=None, start_time=self.start_time,
                                         end_time=self.end_time, freq='10ms')
                # pd.concat replaces DataFrame.append, which is no longer available in recent pandas
                df_iccid_appended = pd.concat([df_iccid, df_event], sort=False).sort_index()
                # restore the categorical dtypes of the original columns after the merge
                for column in df_iccid:
                    if df_iccid[column].dtype.name == 'category' and column != 'EventType':
                        df_iccid_appended[column] = df_iccid_appended[column].astype(df_iccid[column].dtype)
                df_iccid = df_iccid_appended

                # propagate value Scheduling.Task.Started until Scheduling.Task.Stopped occurs

                started_indices = list(df_iccid['EventType'].loc[df_iccid['EventType'] == 'Scheduling.Task.Started'].sort_index().index.values)
                stopped_indices = list(df_iccid['EventType'].loc[df_iccid['EventType'] == 'Scheduling.Task.Stopped'].sort_index().index.values)
                if started_indices and stopped_indices:
                    # first Scheduling.Task.Stopped occurs before Scheduling.Task.Started:
                    # the task was already running when the data begins
                    if started_indices[0] > stopped_indices[0]:
                        # label slices are inclusive, so "- 1" excludes the Stopped sample itself
                        df_iccid.loc[df_iccid.index.min() : stopped_indices[0] - 1, 'EventType'] = 'Scheduling.Task.Started'
                        stopped_indices = stopped_indices[1:]

                    started_indices_idx = 0
                    for stopped_index in stopped_indices:
                        if started_indices_idx < len(started_indices):
                            df_iccid.loc[started_indices[started_indices_idx] : stopped_index - 1, 'EventType'] = 'Scheduling.Task.Started'
                            # skip every Started that lies before this Stopped
                            while (started_indices_idx < len(started_indices)) and (started_indices[started_indices_idx] < stopped_index):
                                started_indices_idx += 1

                # remove all values in column EventType that do not start with 'Scheduling'
                df_iccid['EventType'] = df_iccid['EventType'].map(lambda value: None if not str(value).startswith('Scheduling') else value).astype('category')

            # fill nans for relevant attributes
            for attribute in relevant_attributes:
                if attribute in df_iccid:
                    df_iccid[attribute] = df_iccid[attribute].ffill().bfill()

            temp_df_iccid_dict[iccid] = df_iccid
            self.df = pd.concat([self.df, df_iccid.reset_index()], sort=False, ignore_index=True)

        self.df_iccid_dict = temp_df_iccid_dict


    def transform_store_df(self):
        r"""
        Transform the dataframe of every iccid for further analysis and display the result.

        1. Add column 'observed parameter'_OK filled with value=False to every dataframe in df_outliers_list.
        2. Add column anomaly_id with value=(index + 1) to every dataframe in df_outliers_list.
        3. Fill column 'observed parameter'_OK with value=True for samples in df_OK = (df - all df_outliers).
        4. Fill column anomaly_id with value=Nan for samples in df_OK.
        5. Concatenate df_OK + all df_outliers to df_all.
        6. Pad anomaly_id: anomaly_id determines a region with anomalies on some time interval and a region
            without anomalies on subsequent time interval.
        7. Display the observed parameter, its _OK column and anomaly_id in a grid. Storing the result
            to a file is currently disabled (see the commented pd.to_pickle call).

        Interfaces without detected outliers are skipped.
        """
        observed = observedparameter_dropdown.value
        ok_column = '{0}_OK'.format(observed)

        # NOTE(review): the first tuple element is indexed like a dataframe here, while
        # get_anomalies_df concatenates it like a list of dataframes - confirm against select_detector
        for iccid, (df_iccid, df_outliers_list) in self.df_anomalies_dict.items():
            if not df_outliers_list:
                # pd.concat raises on an empty list; there is nothing to label for this interface
                continue

            # category 0 is reserved for the samples before the first anomaly
            anomaly_categories = range(len(df_outliers_list) + 1)

            # add column 'observed parameter'_OK filled with value=False to every dataframe with outliers
            df_outliers_list = [df_outlier.assign(
                new_column=pd.Categorical([False] * df_outlier.shape[0], categories=[True, False])).rename(
                index=str, columns={'new_column': ok_column}) for df_outlier in df_outliers_list]
            # add column anomaly_id with value=(index + 1) to every dataframe with outliers
            df_outliers_list = [df_outlier.assign(anomaly_id=pd.Categorical(
                [idx + 1] * df_outlier.shape[0], categories=anomaly_categories)) for idx, df_outlier in enumerate(df_outliers_list)]

            df_outliers = pd.concat(df_outliers_list)
            df_OK = df_iccid[~df_iccid.index.isin(df_outliers.index)]
            # add and fill column 'observed parameter'_OK with value=True for samples in df_OK
            df_OK = df_OK.assign(new_column=pd.Categorical([True] * df_OK.shape[0], categories=[True, False])).rename(
                index=str, columns={'new_column': ok_column})
            # add and fill column anomaly_id with value=Nan for samples in df_OK
            df_OK = df_OK.assign(anomaly_id=pd.Categorical([np.nan] * df_OK.shape[0], categories=anomaly_categories))
            # concatenate df_OK + all df_outlier
            df_all = pd.concat([df_OK, df_outliers]).sort_index()
            # pad anomaly_id; assign the result instead of filling a column selection in place,
            # which is not guaranteed to write back into df_all
            df_all['anomaly_id'] = df_all['anomaly_id'].ffill().fillna(0)
            # store resulting dataframe to file
#             pd.to_pickle(df_all, 'df_anomalies_{0}'.format(iccid))
            # show the columns of the parameter that was actually analysed (not only 'RTT')
            display(qgrid.show_grid(df_all[[observed, ok_column, 'anomaly_id']], grid_options={'forceFitColumns': False}))


    def get_anomalies_df(self):
        r"""
        Get dataframes with anomalies.

        Returns
        -------
        list of dataframes
            Each dataframe in list contains outliers ('observed parameter'_OK==False) in anomaly region
            and regular data ('observed parameter'_OK==True). The list is empty when no outliers
            were detected.
        """
        ok_column = '{0}_OK'.format(observedparameter_dropdown.value)

        df_anomalies_list = []
        for df_around_anomaly_list, df_outliers_list in self.df_anomalies_dict.values():
            if not df_outliers_list:
                continue

            # add column 'observed parameter'_OK filled with value=False to every dataframe with outliers
            df_outliers_list = [df_outlier.assign(new_column=False).rename(
                index=str, columns={'new_column': ok_column}) for df_outlier in df_outliers_list]

            df_outliers = pd.concat(df_outliers_list)
            df_OK = pd.concat(df_around_anomaly_list)
            df_OK = df_OK[~df_OK.index.isin(df_outliers.index)]
            # add and fill column 'observed parameter'_OK with value=True for samples in df_OK
            df_OK = df_OK.assign(new_column=True).rename(index=str, columns={'new_column': ok_column})

            # one dataframe per anomaly: the regular samples plus the outliers of that anomaly
            df_anomalies_list += [pd.concat([df_OK, df_anomaly]).sort_index() for df_anomaly in df_outliers_list]

        return df_anomalies_list


    def get_selected_df(self, node_id=None, start_time=None, end_time=None, freq=None):
        r"""
        Get selected dataframe based on values in menu widgets.
        Group dataframe by iccid and save result into DataFrameGroupBy df_group.

        Parameters
        ----------
        node_id : int or str, optional
            Node to query. When None, the node the instance is already bound to is used.
        start_time : datetime, optional
            Start of the queried interval. When None, it is built from the start date and start hour widgets.
        end_time : datetime, optional
            End of the queried interval. When None, it is start_time plus the time span widget (hours).
        freq : str, optional
            Temporal resolution passed to ric.getdf. When None, the temporal resolution widget is used.
        """
        # honour an explicitly requested node; previously the argument was silently ignored
        if node_id is not None:
            self.node_id = str(node_id)

        # reset all derived state while keeping the configuration of the instance
        self.__init__(self.node_id, self.operators, self.clean_data_bool, self.clear_output)
        self.start_time = dt.datetime.combine(startdate_datepicker.value,
                                              dt.time(hour=starthour_intslider.value)) if start_time is None else start_time
        self.end_time = self.start_time + dt.timedelta(hours=timespan_inttext.value) if end_time is None else end_time
        self.freq = self.freq_map[temporalresolution_dropdown.value] if freq is None else freq

        self.df = ric.getdf(ric.tables_for_node(self.node_id), nodeid=self.node_id, limit=None, start_time=self.start_time,
                            end_time=self.end_time, freq=self.freq, tolerance=pd.Timedelta(seconds=60))

        self.groupby_iccid()

        if self.clean_data_bool:
            self.clean_data()

        # all nodes with same operators
        if sameoperators_checkbox.value:
            self.nodes_sameoperators = []

            if 'Operator' in self.df.columns:
                self.operators = set(self.df['Operator'].unique())

            # operators may still be None when the node has no 'Operator' column
            wanted_operators = set(self.operators or ())
            for node in ric.all_nodes():
                node_df = ric.getdf('modem', nodeid=node, limit=100, start_time=self.start_time, end_time=self.end_time,
                                    freq='1m', tolerance=pd.Timedelta(seconds=60))
                if ('Operator' in node_df.columns) and (set(node_df['Operator'].unique()) & wanted_operators):
                    self.nodes_sameoperators.append(node)


    def plot_df(self):
        r"""
        Plot selected dataframes for each iccid and run the selected anomaly detector on them.

        The result of select_detector is stored in df_anomalies_dict for every plotted iccid.
        When the significance analysis checkbox is ticked, the result of significance_test is
        displayed. When the same-operators checkbox is ticked, the checkbox is reset, every node
        in nodes_sameoperators is analysed per operator and the number of overlapping anomalies
        over time is plotted for each operator.
        """
        if self.clear_output:
            clear_output()
            # NOTE(review): presumably newer ipywidgets need the menu displayed again after
            # clear_output - confirm
            if self._widgets_need_redisplay():
                if detector_dropdown.value == method_list[0]:
                    display(ipw.VBox([menu_widgets, baseline_widgets]))
                elif detector_dropdown.value == method_list[1]:
                    display(ipw.VBox([menu_widgets, rollingwindow_widgets]))
                else:
                    display(menu_widgets)
            plt.close('all')

        observed = observedparameter_dropdown.value
        if self.df_group is None or observed is None or observed not in self.df:
            return

        # show plot for each iccid
        for iccid, df_iccid in self.df_iccid_dict.items():
            if (self.operators is None) or (set(df_iccid['Operator'].unique()) & set(self.operators)):
                plt.figure(num='node id={0}, iccid={1}'.format(self.node_id, iccid), tight_layout=True)
                # resolve 'could not convert string to float' error
                if df_iccid[observed].dtype.name == 'category':
                    plt.scatter(df_iccid.index, df_iccid[observed].astype('str'), s=1, color='b')
                else:
                    plt.scatter(df_iccid.index, df_iccid[observed], s=1, color='b')

                plt.legend(loc='upper right')
                plt.xlabel('time')
                plt.ylabel(observed)
                plt.xlim(df_iccid.index.min(), df_iccid.index.max())

                # anomaly detection
                self.df_anomalies_dict[iccid] = select_detector(detector_dropdown.value, df_iccid, observed, arguments_text.value)

        # display significance analysis
        if significanceanalysis_checkbox.value:
            df_anomalies_list = self.get_anomalies_df()
            if df_anomalies_list:
                display(significance_test(pd.concat(df_anomalies_list)).reset_index().style.set_properties(subset=['Iccid,Variable'], **{'width':'400px'}))

        if sameoperators_checkbox.value:
            clear_output()
            if self._widgets_need_redisplay():
                display(menu_widgets)
            plt.close('all')
            # untick the checkbox without triggering its callback
            sameoperators_checkbox.unobserve(handle_widget_change_keep_df, names='value')
            sameoperators_checkbox.value = False
            sameoperators_checkbox.observe(handle_widget_change_keep_df, names='value')

            anomaly_count_df = pd.DataFrame({'anomaly count': 0}, index=self.df['time'].unique().copy()).sort_index()
            for operator in (self.operators or ()):

                anomaly_intervals = []
                anomaly_count_df['anomaly count'] = 0
                display('Interfaces with operator ' + operator)
                for node in self.nodes_sameoperators:
                    # analyse the other node in its own instance; re-initialising self here would
                    # leave it bound to the last node with cleaning and output clearing turned off
                    node_sdf = type(self)(node, [operator], False, False)
                    node_sdf.get_selected_df()
                    node_sdf.plot_df()

                    if detector_dropdown.value != 'none':
                        for iccid, (_, df_outliers_list) in node_sdf.df_anomalies_dict.items():
                            if operator in set(node_sdf.df_iccid_dict[iccid]['Operator'].unique()):
                                for anomaly in df_outliers_list:
                                    anomaly_intervals.append(pd.Interval(anomaly.index.min(), anomaly.index.max(), closed='both'))

                # every anomaly adds one to all time stamps it covers
                for anomaly_interval in anomaly_intervals:
                    anomaly_count_df.loc[anomaly_interval.left : anomaly_interval.right, 'anomaly count'] += 1

                plt.figure(num='anomaly count for operator {}'.format(operator), tight_layout=True)
                plt.plot(anomaly_count_df.index, anomaly_count_df['anomaly count'], color='black', label='anomaly count')
                plt.xlabel('time')
                plt.ylabel('anomaly count')
                plt.xlim(anomaly_count_df.index.min(), anomaly_count_df.index.max())

In [None]:
# Persist or reload the dataframe held by sdf; uncomment the call that is needed.
# restore_df reads the pickle file into sdf, store_df writes it, store_csv writes csv files instead.
# sdf.restore_df()
# sdf.store_df()
sdf.store_csv()

In [48]:
# Label the samples of every interface as regular or anomalous and display the result for further analysis.
# (Writing the labelled dataframe to a file is currently disabled inside transform_store_df.)
sdf.transform_store_df()

In [None]:
# get dataframes with anomalies
# The returned list is the last expression of the cell, so it is rendered as the cell output.
sdf.get_anomalies_df()

In [68]:
# create csv with data from multiple nodes listed in node_list
# The time interval and resolution are taken from sdf, so sdf must have loaded a dataframe first.
node_list = [470, 471, 472, 473, 476, 477, 478, 479, 480]

df_combined = pd.DataFrame()

for node in node_list:
    # bind the instance to this node up front so that the query below cannot fall back
    # to the node currently selected in the dropdown
    additional_sdf = SelectedDataframe(nodeid=node)
    additional_sdf.get_selected_df(str(node), sdf.start_time, sdf.end_time, sdf.freq)
    df_combined = pd.concat([df_combined, additional_sdf.df.reset_index()], sort=False, ignore_index=True)

# the file name contains the observed parameter and the first and last node of node_list
df_combined.to_csv('{0}_{1}-{2}.csv'.format(observedparameter_dropdown.value, node_list[0], node_list[-1]))
# display(qgrid.show_grid(df_combined, grid_options={'forceFitColumns': False}))

### Display menu widgets

In [13]:
# Create the dataframe holder used by all callbacks; with no arguments it is bound to the node
# currently selected in the node id dropdown.
sdf = SelectedDataframe()

# default figure size (width, height) and the number of open figures allowed before matplotlib warns
plt.rcParams['figure.figsize'] = 17, 5
plt.rcParams['figure.max_open_warning'] = 50

# Show the menu together with the parameter panel of the selected detector:
# the first entry of method_list gets the baseline panel, the second the rolling window panel.
if detector_dropdown.value == method_list[0]:
    display(ipw.VBox([menu_widgets, baseline_widgets]))
elif detector_dropdown.value == method_list[1]:
    display(ipw.VBox([menu_widgets, rollingwindow_widgets]))
else:
    display(menu_widgets)

VBox(children=(HBox(children=(VBox(children=(HBox(children=(Label(value='node id', layout=Layout(width='130px'…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>