# 1. Import libraries

In [19]:
from CellPAD.evaluator import evaluate
from CellPAD.controller import DropController
from CellPAD.synthsiser import DropSynthesiser
from CellPAD.preprocessor import Preprocessor
from CellPAD.filter import DropAnomalyFilter, ChangeAnomalyFilter
import pandas as pd
import numpy as np
# `sys` must be imported before it is used: on a fresh kernel the original order
# (set_printoptions first, import second) raised NameError.
import sys

# Print NumPy arrays in full instead of eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)


# 2.Read data from file

In [20]:
# Load the KPI time series from CSV; later cells read the "Time" and "KPI" columns.
data_path = "./data/sd.csv"
df = pd.read_csv(data_path)
# Show the loaded frame as the cell output.
df

Unnamed: 0,Time,KPI
0,2016/11/7 0:00,4609
1,2016/11/7 1:00,3882
2,2016/11/7 2:00,3845
3,2016/11/7 3:00,3547
4,2016/11/7 4:00,3305
...,...,...
2851,2017/5/7 19:00,7673
2852,2017/5/7 20:00,7036
2853,2017/5/7 21:00,7346
2854,2017/5/7 22:00,7501


# 3.inject anomalies

In [21]:
# Pull the two columns out of the frame as plain arrays.
timestamps = df["Time"].values
series = df["KPI"].values

# Inject synthetic drops into the raw KPI series (period: 168 samples).
# NOTE(review): syn_drop() presumably returns the modified series together with
# per-sample anomaly labels -- confirm against CellPAD.synthsiser.
syner = DropSynthesiser(raw_series=series, period_len=168)
syn_series, syn_labels = syner.syn_drop()

# 4.detect drop

# 4.1---attributes

In [22]:
# Detection settings used by the cells below.
# `timestamps` is reused unchanged from the anomaly-injection cell, so the former
# no-op self-assignment has been dropped.

# Detect on the synthesised series, which holds both normal and anomalous items.
series = syn_series

# Trend removal.
to_remove_trend = True
trend_remove_method = "past_mean"
period_len = 168  # one period is one week of hourly samples: 168 = 24 hours * 7 days

# Feature configuration.
feature_types = ["Indexical", "Numerical"]
feature_time_grain = ["Weekly"]
feature_operations = ["Mean", "Median", "Wma", "Ewma"]

# Number of leading periods used to bootstrap the predictor.
bootstrap_period_cnt = 2

# Anomaly filter configuration.
anomaly_filter_method = "gauss"
anomaly_filter_coefficient = 3.0

# 4.2---some dicts.

In [23]:
# Attributes of the time series that every later step reads.
dict_series = {}

# Either strip the trend first or detect on the series as-is.
if to_remove_trend:
    preprocessor = Preprocessor()
    dict_series["detected_series"] = preprocessor.remove_trend(
        series, period_len, method=trend_remove_method
    )
else:
    dict_series["detected_series"] = np.array(series)

dict_series.update(
    timestamps=timestamps,
    series_len=len(dict_series["detected_series"]),
    period_len=period_len,  # 168 = 24 hours * 7 days
)
dict_series

{'detected_series': array([1.00000000e+00, 9.14379932e-01, 9.35068093e-01, 8.93282126e-01,
        8.61215343e-01, 9.32077299e-01, 1.08005028e-01, 1.26581540e+00,
        9.42185203e-01, 5.92735988e-01, 7.08075784e-01, 7.90224251e-01,
        7.61244975e-01, 8.25116589e-01, 7.40330242e-01, 7.51557481e-01,
        9.90153884e-01, 1.42411666e+00, 1.93378785e+00, 1.85720597e+00,
        1.42002329e+00, 1.47560620e+00, 1.46309416e+00, 1.29075583e+00,
        1.14729768e+00, 9.31148816e-01, 8.70867607e-01, 7.82858710e-01,
        8.00191658e-01, 8.29675027e-01, 1.06344215e+00, 1.13966720e+00,
        7.24735322e-01, 5.88192808e-01, 5.62887575e-01, 7.07035689e-01,
        7.02344920e-01, 7.37940609e-01, 5.94446801e-01, 6.25217519e-01,
        8.00879621e-01, 1.25742028e+00, 1.60676807e+00, 1.56463542e+00,
        1.55126371e+00, 1.62679608e+00, 1.45176542e+00, 1.23746554e+00,
        8.80774198e-01, 7.34183247e-01, 6.68566710e-01, 4.22071680e-01,
        7.26088493e-01, 7.16225567e-01, 1.030

In [24]:
from CellPAD.feature import FeatureTools

# Feature-related settings plus the tool that expands them into feature names.
dict_feature = {
    "operations": feature_operations,
    "time_grain": feature_time_grain,
    "feature_types": feature_types,
    "feature_tool": FeatureTools(),
}
# Expand (types, grains, operations) into the concrete list of feature names.
dict_feature["feature_list"] = dict_feature["feature_tool"].set_feature_names(
    dict_feature["feature_types"],
    dict_feature["time_grain"],
    dict_feature["operations"],
)
dict_feature

{'operations': ['Mean', 'Median', 'Wma', 'Ewma'],
 'time_grain': ['Weekly'],
 'feature_types': ['Indexical', 'Numerical'],
 'feature_tool': <CellPAD.feature.FeatureTools at 0x7f055249be80>,
 'feature_list': ['Hour',
  'Day',
  '3_Weekly_Mean',
  '5_Weekly_Mean',
  '7_Weekly_Mean',
  '10_Weekly_Mean',
  '3_Weekly_Median',
  '5_Weekly_Median',
  '7_Weekly_Median',
  '10_Weekly_Median',
  '3_Weekly_Wma',
  '5_Weekly_Wma',
  '7_Weekly_Wma',
  '10_Weekly_Wma',
  '3_Weekly_Ewma',
  '5_Weekly_Ewma',
  '7_Weekly_Ewma',
  '10_Weekly_Ewma']}

In [25]:
# Bootstrap parameters: the first `bootstrap_period_cnt` periods (2 weeks here,
# 2 * 168 samples) bootstrap the predictor, so prediction and evaluation start
# from the third week.
dict_bootstrap = {
    "period_cnt": bootstrap_period_cnt,
    "bootstrap_series_len": bootstrap_period_cnt * period_len,
}
dict_bootstrap["bootstrap_series"] = dict_series["detected_series"][
    : dict_bootstrap["bootstrap_series_len"]
]
dict_bootstrap

{'period_cnt': 2,
 'bootstrap_series_len': 336,
 'bootstrap_series': array([1.        , 0.91437993, 0.93506809, 0.89328213, 0.86121534,
        0.9320773 , 0.10800503, 1.2658154 , 0.9421852 , 0.59273599,
        0.70807578, 0.79022425, 0.76124497, 0.82511659, 0.74033024,
        0.75155748, 0.99015388, 1.42411666, 1.93378785, 1.85720597,
        1.42002329, 1.4756062 , 1.46309416, 1.29075583, 1.14729768,
        0.93114882, 0.87086761, 0.78285871, 0.80019166, 0.82967503,
        1.06344215, 1.1396672 , 0.72473532, 0.58819281, 0.56288757,
        0.70703569, 0.70234492, 0.73794061, 0.5944468 , 0.62521752,
        0.80087962, 1.25742028, 1.60676807, 1.56463542, 1.55126371,
        1.62679608, 1.45176542, 1.23746554, 0.8807742 , 0.73418325,
        0.66856671, 0.42207168, 0.72608849, 0.71622557, 1.03012032,
        1.0333767 , 0.75836073, 0.65196675, 0.66633752, 0.58485608,
        0.77264111, 0.6807935 , 0.58785971, 0.68623676, 0.80348504,
        1.39963062, 2.05917543, 1.78998293, 1.64

In [26]:
# Parameters handed to the anomaly filter.
dict_filter = {
    "method": anomaly_filter_method,
    "coefficient": anomaly_filter_coefficient,
}
dict_filter

{'method': 'gauss', 'coefficient': 3.0}

In [27]:
# Storage for the training data: feature rows and responses of samples judged
# normal. Both start empty and are filled by the training cells.
dict_storage = {
    "normal_features_matrix": pd.DataFrame(),
    "normal_response_series": [],
}
dict_storage

{'normal_features_matrix': Empty DataFrame
 Columns: []
 Index: [],
 'normal_response_series': []}

# 4.3---Detect anomalies and evaluate the results

In [28]:
# Name of the regression predictor handed to RegressionPredictor below.
predictor = "RT"

# Result containers for the detection run. The detection loop starts after the
# bootstrap window, so that window is pre-filled with neutral entries (ratio 0.0,
# score 0.0, label False) and its "prediction" is the bootstrap series itself.
dict_result = {
    "drop_ratios": [0.0] * dict_bootstrap["bootstrap_series_len"],
    "drop_scores": [0.0] * dict_bootstrap["bootstrap_series_len"],
    "drop_labels": [False] * dict_bootstrap["bootstrap_series_len"],
    "predicted_series": dict_bootstrap["bootstrap_series"],
}
dict_result

{'drop_ratios': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0

In [29]:
from CellPAD.algorithm import RegressionPredictor, StatisticalPredictor, StatisticalMeasurement
# Build the regression model named by `predictor` (set to "RT" in the previous cell).
# NOTE(review): "RT" presumably selects a regression-tree model -- confirm in CellPAD.algorithm.
model = RegressionPredictor(predictor)

In [30]:
from CellPAD.smoothing_algorithms import EWMA, WMA


# Build the training feature matrix from the bootstrap window (the first
# `bootstrap_series_len` samples of the detected series).
# NOTE(review): the prediction loop later in this notebook repeats this extraction
# logic; factoring both into one shared function would remove the duplication.
train_feature_matrix = pd.DataFrame()
start_pos = 0
end_pos = dict_bootstrap["bootstrap_series_len"]

# Loop-invariant values, computed once instead of once per feature / per sample.
timestamps_to_datetime = pd.to_datetime(timestamps)
# Sampling interval of the series, taken from the first two timestamps.
time_delta = timestamps_to_datetime[1] - timestamps_to_datetime[0]
grain_deltas = {
    "Hourly": pd.Timedelta(hours=1),
    "Daily": pd.Timedelta(days=1),
    "Weekly": pd.Timedelta(weeks=1),
}
# No anomaly labels exist for the bootstrap window: every sample counts as normal.
labels = [False] * (dict_bootstrap["bootstrap_series_len"] + dict_series["period_len"])

for feature_name in dict_feature["feature_list"]:
    feature_values = []

    if feature_name == "Hour":
        # Indexical feature: hour of day.
        feature_values = [timestamps_to_datetime[i].hour for i in range(start_pos, end_pos)]
    elif feature_name == "Day":
        # Indexical feature: day of week.
        feature_values = [timestamps_to_datetime[i].dayofweek for i in range(start_pos, end_pos)]
    elif feature_name == "Raw":
        # Numerical feature: the KPI value itself.
        feature_values = [series[i] for i in range(start_pos, end_pos)]
    elif feature_name[0].isdigit():
        # Numerical feature named "<window>_<grain>_<operation>", e.g. "3_Weekly_Mean".
        win, period_grain, operation = feature_name.split("_")
        win = int(win)
        if period_grain not in grain_deltas:
            raise ValueError("Unknown feature time grain: {!r}".format(period_grain))
        if operation not in ("Mean", "Median", "Wma", "Ewma"):
            raise ValueError("Unknown feature operation: {!r}".format(operation))

        # Step, in samples, between two consecutive look-back points.
        feature_period_len = int(grain_deltas[period_grain] / time_delta)

        for idx in range(start_pos, end_pos):
            # Collect up to `win` earlier normal samples, starting one period
            # before `idx` and stepping back by `feature_period_len` each time.
            ret_series = []
            pos = idx - dict_series["period_len"]
            cnt = 0
            while pos >= 0 and cnt < win:
                if not labels[pos]:
                    ret_series.append(dict_bootstrap["bootstrap_series"][pos])
                pos -= feature_period_len
                cnt += 1

            # Without any history the feature falls back to a single 0.0.
            vs = ret_series if ret_series else [0.0]

            if operation == "Mean":
                value = np.mean(vs)
            elif operation == "Median":
                value = np.median(vs)
            elif operation == "Wma":
                value = WMA().wma(vs)
            else:  # "Ewma" (validated above)
                value = EWMA().ewma(vs)

            feature_values.append(value)

    feature_values = np.array(feature_values)
    # Feature names that match none of the branches yield no column.
    if len(feature_values) == 0:
        continue
    train_feature_matrix[feature_name] = feature_values



In [31]:
# Display the bootstrap feature matrix as the cell output.
train_feature_matrix


Unnamed: 0,Hour,Day,3_Weekly_Mean,5_Weekly_Mean,7_Weekly_Mean,10_Weekly_Mean,3_Weekly_Median,5_Weekly_Median,7_Weekly_Median,10_Weekly_Median,3_Weekly_Wma,5_Weekly_Wma,7_Weekly_Wma,10_Weekly_Wma,3_Weekly_Ewma,5_Weekly_Ewma,7_Weekly_Ewma,10_Weekly_Ewma
0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,4,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,19,6,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372,1.602372
332,20,6,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127,1.608127
333,21,6,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165,1.726165
334,22,6,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795,1.468795


In [32]:
# Fit the initial model on the bootstrap window: features from the matrix built
# above, response taken from the bootstrap slice of the detected series.
first_train_response = dict_bootstrap["bootstrap_series"]
first_train_features = np.array(train_feature_matrix)
model.train(first_train_features, first_train_response)

# Seed the store of normal samples with the bootstrap features and responses;
# the detection loop appends to both after every round.
dict_storage.update(
    normal_features_matrix=first_train_features,
    normal_response_series=list(first_train_response),
)



In [33]:
# Iterative training: after the bootstrap window, walk the series one period at a
# time -- predict the period, flag drops, then retrain on all samples judged normal.
round_cnt = int(np.ceil(dict_series["series_len"] / dict_series["period_len"]))

# Loop-invariant values, computed once instead of once per feature / per sample.
timestamps_to_datetime = pd.to_datetime(timestamps)
# Sampling interval of the series, taken from the first two timestamps.
time_delta = timestamps_to_datetime[1] - timestamps_to_datetime[0]
grain_deltas = {
    "Hourly": pd.Timedelta(hours=1),
    "Daily": pd.Timedelta(days=1),
    "Weekly": pd.Timedelta(weeks=1),
}

# Start from the first period after the bootstrap window (ed - st = 168, except
# possibly for a shorter final round).
for rod in range(dict_bootstrap["period_cnt"], round_cnt):
    st = rod * dict_series["period_len"]
    ed = min(dict_series["series_len"], st + dict_series["period_len"])

    # EXTRACT FEATURES FOR PREDICTING
    test_feature_matrix = pd.DataFrame()
    start_pos = st
    end_pos = ed

    # Labels detected so far, padded with one period of "normal" entries. They do
    # not change while this round's features are extracted, so build them once.
    labels = list(dict_result["drop_labels"]) + [False] * dict_series["period_len"]

    for feature_name in dict_feature["feature_list"]:
        feature_values = []

        if feature_name == "Hour":
            # Indexical feature: hour of day.
            feature_values = [timestamps_to_datetime[i].hour for i in range(start_pos, end_pos)]
        elif feature_name == "Day":
            # Indexical feature: day of week.
            feature_values = [timestamps_to_datetime[i].dayofweek for i in range(start_pos, end_pos)]
        elif feature_name == "Raw":
            # Numerical feature: the KPI value itself.
            feature_values = [series[i] for i in range(start_pos, end_pos)]
        elif feature_name[0].isdigit():
            # Numerical feature named "<window>_<grain>_<operation>", e.g. "3_Weekly_Mean".
            win, period_grain, operation = feature_name.split("_")
            win = int(win)
            if period_grain not in grain_deltas:
                raise ValueError("Unknown feature time grain: {!r}".format(period_grain))
            if operation not in ("Mean", "Median", "Wma", "Ewma"):
                raise ValueError("Unknown feature operation: {!r}".format(operation))

            # Step, in samples, between two consecutive look-back points.
            feature_period_len = int(grain_deltas[period_grain] / time_delta)

            for idx in range(start_pos, end_pos):
                # Collect up to `win` earlier samples not flagged as drops, starting
                # one period before `idx` and stepping back by `feature_period_len`.
                ret_series = []
                pos = idx - dict_series["period_len"]
                cnt = 0
                while pos >= 0 and cnt < win:
                    if not labels[pos]:
                        # pos < ed always holds here, so no [:ed] slice is needed.
                        ret_series.append(dict_series["detected_series"][pos])
                    pos -= feature_period_len
                    cnt += 1

                # Without any usable history the feature falls back to a single 0.0.
                vs = ret_series if ret_series else [0.0]

                if operation == "Mean":
                    value = np.mean(vs)
                elif operation == "Median":
                    value = np.median(vs)
                elif operation == "Wma":
                    value = WMA().wma(vs)
                else:  # "Ewma" (validated above)
                    value = EWMA().ewma(vs)

                feature_values.append(value)

        feature_values = np.array(feature_values)
        # Feature names that match none of the branches yield no column.
        if len(feature_values) == 0:
            continue
        test_feature_matrix[feature_name] = feature_values

    this_predicted_features = np.array(test_feature_matrix)
    # predict the series in the current period
    this_predicted_series = model.predict(this_predicted_features)
    this_practical_series = dict_series["detected_series"][st:ed]

    # compare the practical and predicted values and filter anomalies
    anomaly_filter = DropAnomalyFilter(rule=dict_filter["method"], coef=dict_filter["coefficient"])
    this_drop_ratios, this_drop_labels, this_drop_scores = anomaly_filter.detect_anomaly(
        predicted_series=this_predicted_series,
        practical_series=this_practical_series,
    )

    # store the detected results of this period (iteration)
    dict_result["predicted_series"] = np.append(dict_result["predicted_series"], this_predicted_series)
    dict_result["drop_ratios"] = np.append(dict_result["drop_ratios"], this_drop_ratios)
    dict_result["drop_labels"] = np.append(dict_result["drop_labels"], this_drop_labels)
    dict_result["drop_scores"] = np.append(dict_result["drop_scores"], this_drop_scores)

    # Keep the features and responses of the samples labelled normal (label False)
    # so they can be reused for training in later rounds. All normal rows are
    # appended in one vstack call instead of one np.row_stack call per row.
    normal_mask = ~np.asarray(this_drop_labels, dtype=bool)
    if normal_mask.any():
        dict_storage["normal_features_matrix"] = np.vstack(
            (dict_storage["normal_features_matrix"], this_predicted_features[normal_mask])
        )
        dict_storage["normal_response_series"].extend(
            np.asarray(this_practical_series)[normal_mask]
        )

    # update the model
    model.train(np.array(dict_storage["normal_features_matrix"]),
                np.array(dict_storage["normal_response_series"]))




In [35]:
# Score only the samples after the bootstrap window: the bootstrap part of
# `drop_scores` is filled with placeholder zeros rather than real detections.
# The offset comes from the configuration instead of a hard-coded 2*168.
eval_start = dict_bootstrap["bootstrap_series_len"]
auc, prauc = evaluate(dict_result["drop_scores"][eval_start:], syn_labels[eval_start:])
# NOTE(review): the "front_mean" label is kept verbatim, but this notebook sets
# trend_remove_method = "past_mean" -- confirm which label is intended.
print("front_mean", "auc", auc, "prauc", prauc)


front_mean auc 0.9832871420173008 prauc 0.9333264820423642
