In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
sns.set(rc={'figure.figsize':(24, 15)}, font_scale=2)

import math
import random
import json

import sys
sys.path.append('../')
import utils

from tqdm import tqdm
from glob import glob

from timeit import default_timer as timer
from argparse import Namespace
from KDEpy import FFTKDE
from scipy import stats
from scipy.signal import argrelextrema, argrelmin

# Functions

In [None]:
# KDE Bandwidth using ISJ Functions

def get_outlier_timestamps(df):
    '''
    Return the timestamps of all detected outliers in `df` as a numpy array.

    A row is an outlier when its `raw_voting_score` is greater than 0. The row's `unix_time`
    is repeated `raw_voting_score` times, so a timestamp that was flagged by several views
    appears once per vote (i.e, there can be repetitions of the same timestamp).

    Parameters
    ----------
    df : pandas.DataFrame
        Processed time series with (at least) the columns `unix_time` and `raw_voting_score`.

    Returns
    -------
    numpy.ndarray
        1-D array of timestamps in row order; empty when no row has a positive score.
    '''
    # Keep only the rows that received at least one vote
    outliers_df = df[df['raw_voting_score'] > 0]

    # Repeat counts have to be integers. Reading the column directly (instead of going through
    # iterrows) avoids the per-row dtype upcasting that turns an integer score into a float
    # whenever every column of `df` is numeric.
    votes = outliers_df['raw_voting_score'].to_numpy().astype(int)

    # Each vote adds one point (a timestamp with more than 1 votes is added more than once)
    return np.repeat(outliers_df['unix_time'].to_numpy(), votes)

def get_local_minima(arr):
    '''
    Find the indices of the local minima of the 1-D sequence `arr`.

    Candidates are the points that are less than or equal to both neighbours. If there are
    flat minima present then only the boundaries of the flat regions are kept (the interior
    points of a flat run are discarded). The first and the last index are never returned.

    Parameters
    ----------
    arr : array-like
        1-D sequence of values (a list or a numpy array).

    Returns
    -------
    list
        Indices (numpy integers) of the retained minima, in increasing order.
    '''
    # argrelextrema needs an ndarray; this also lets callers pass a plain list
    arr = np.asarray(arr)
    minima_all = argrelextrema(arr, np.less_equal)[0]

    last_idx = len(arr) - 1

    # Refine the minima list to avoid flat minima
    minima = []
    for min_idx in minima_all:
        # Skip indices at the boundaries of the input (they lack a neighbour on one side)
        if min_idx <= 0 or min_idx >= last_idx:
            continue

        # A point is in the interior of a flat run only when BOTH neighbours equal the point itself.
        # Comparing the two neighbours with each other would also discard a strict minimum whose
        # neighbours happen to be equal (e.g. [2, 1, 2]).
        inside_flat_run = arr[min_idx - 1] == arr[min_idx] == arr[min_idx + 1]
        if not inside_flat_run:
            minima.append(min_idx)

    return minima

def get_kde_isj_density_df(df):
    '''
    Estimate the density of the detected outliers in `df` with a KDE whose bandwidth is
    selected by the Improved Sheather-Jones (ISJ) rule.

    Returns a dataframe with the columns `unix_time` (the evaluation grid), `density` and
    `timestamp` (`unix_time` converted to a datetime, interpreted as seconds).
    '''
    # Outlier timestamps the KDE is fitted on (repeated once per vote)
    fit_points = get_outlier_timestamps(df)

    # Evenly spaced evaluation grid spanning the series (use 3X num points of the df)
    unix_times = df['unix_time']
    grid = np.linspace(unix_times.min(), unix_times.max(), num=3*len(df.index))

    # Compute density estimates using 'ISJ' - Improved Sheather Jones
    estimator = FFTKDE(kernel='epa', bw='ISJ')
    grid_density = estimator.fit(fit_points).evaluate(grid)

    # Assemble the result and add a human readable timestamp column
    result = pd.DataFrame({'unix_time': grid, 'density': grid_density})
    result['timestamp'] = pd.to_datetime(result['unix_time'], unit='s')
    return result

def get_isj_clusters_df(density_df, df):
    '''
    Build the clusters dataframe from a density estimate.

    The local minima of `density_df['density']`, the density grid itself and the outlier
    timestamps of `df` are handed to `utils.clustering.get_clusters_df`.

    Returns the resulting dataframe sorted by `area` in descending order.
    '''
    grid = density_df['unix_time'].values
    grid_density = density_df['density'].values

    clusters = utils.clustering.get_clusters_df(
        samples=grid,
        density_dist=grid_density,
        minima_idx_list=get_local_minima(grid_density),
        points=get_outlier_timestamps(df),
    )

    # Largest clusters first
    return clusters.sort_values(by='area', ascending=False)

def get_timeseries_and_density_figure(df, density, clusters, k):
    '''
    Build a two-panel figure with a shared x-axis: the time series with its outliers (top)
    and the outlier density (bottom). The `k` clusters with the largest `area` are shaded in
    both panels and numbered starting from 1 in the bottom one.

    `df` needs the columns `timestamp`, `unix_time`, `measure`, `is_outlier` and `raw_voting_score`;
    `density` needs `unix_time` and `density`; `clusters` needs `area`, `start` and `end`.

    Returns the figure, which is closed through pyplot before being returned.
    '''
    df = df.sort_values(by='timestamp')

    # Largest-area clusters first; the fresh positional index is what the rank labels are derived from
    top_clusters = clusters.sort_values(by='area', ascending=False).head(k).reset_index()

    # Plot the Figure
    fig, axs = plt.subplots(2, sharex=True)
    fig.subplots_adjust(hspace=0)
    series_ax, density_ax = axs[0], axs[1]

    injected = df[df['is_outlier']==1]
    detected = df[df['raw_voting_score']>0]
    peak_density = density['density'].max()
    shade_style = dict(linewidth=0, color='yellow', zorder=10, alpha=0.40)

    # Top panel: the measure with the injected (red) and the detected (green) outliers
    series_ax.plot(df['unix_time'], df['measure'])
    series_ax.scatter(injected['unix_time'].values, injected['measure'].values, s=90, c='red', zorder=10, label='True Injected Outliers')
    series_ax.scatter(detected['unix_time'].values, detected['measure'].values, s=30, c='green', zorder=10, label='Detected Outliers')
    series_ax.set_ylabel('Measure')
    series_ax.legend()

    # Plot the alert regions
    # NOTE(review): the shaded band is hard-coded to y in [-1.5, 1.5] - confirm `measure` stays in that range
    for _, alert in top_clusters.iterrows():
        span = alert['end'] - alert['start']
        series_ax.add_patch(patches.Rectangle((alert['start'], -1.5), width=span, height=3, **shade_style))

    # Bottom panel: the density, with each alert region shaded up to the density peak and labelled by rank
    density_ax.plot(density['unix_time'], density['density'])
    density_ax.set_ylabel('Density')
    density_ax.set_xlabel('Unix Time')
    for idx, alert in top_clusters.iterrows():
        span = alert['end'] - alert['start']
        density_ax.add_patch(patches.Rectangle((alert['start'], 0), width=span, height=peak_density, **shade_style))
        x_loc = alert['start'] + span/2
        density_ax.text(x=x_loc, y=peak_density, s=str(idx+1), fontsize=12, color='red', horizontalalignment='center')

    plt.tight_layout()
    plt.close()

    return fig

# Measure Runtime of different components in the pipeline

In [None]:
# Settings for the pipeline run below (the aggregation and voting steps read from `args`)
args = Namespace(window_size='6H', aggregation_model='aggregate_prophet_model', confidence_interval_width=0.98, num_points_per_window=6, point_frequency='1H', kernel='gaussian', bandwidth=86400)

# Load the full time series
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source
df_path = '../streaming_data/all_outlier_types/df_full.pickle'
df_full = pd.read_pickle(df_path)
df_full

In [None]:
# Seed numpy's global random number generator
np.random.seed(1)

# Work on a copy so that `df_full` is left untouched
cur_df = df_full.copy()

# Fit model on raw data and identify outliers in the raw data
start = timer()
raw_model, forecast = utils.outlier_detection.raw_data_fit(cur_df, confidence_interval_width=args.confidence_interval_width)
cur_df = utils.outlier_detection.update_df_with_raw_model_fit(df=cur_df, forecast=forecast)
end = timer()
prophet_raw_fit_time = end-start

# Fit model over aggregate views and identify aggregate level outliers
# (this timed section covers the aggregation, the trimming of `cur_df` and the voting)
start = timer()
df_agg = utils.aggregation.get_aggregate_df(df=cur_df, args=args)

# Identify outliers across all views (raw + aggregation) and perform voting
# (assign each timestamp to count of views that marked it as an outlier)
# NOTE(review): `len(args.window_size)` is the length of the string '6H' (i.e. 2), not a number of points;
# presumably `args.num_points_per_window` was intended - confirm what this slice is meant to trim.
cur_df = cur_df.copy()[:math.floor(len(cur_df.index) / len(args.window_size)) * len(df_agg.index)]
cur_df = utils.voting.add_voting_to_df(df=cur_df, df_agg=df_agg, period=args.num_points_per_window, freq=args.point_frequency)
end = timer()
prophet_agg_fit_time = end-start

# # TODO: Only compute KDE in the frozen time range (i.e., time range where new data does not have an impact to kde)
# start = timer()
# cur_df, density_df, clusters_df = utils.clustering.perform_clustering(df=cur_df, kernel=args.kernel, bandwidth=args.bandwidth)
# end = timer()
# kde_time = end-start

# Report the elapsed wall-clock time of both model fitting stages
print("Prophet Raw Fit Time Elapsed:", prophet_raw_fit_time)
print("Prophet Aggregate Fit Time Elapsed:", prophet_agg_fit_time)
# print('Clustering Time Elapsed:', kde_time)

# clusters_df

In [None]:
# Time the ISJ-based clustering end to end: density estimation followed by cluster extraction
start = timer()
density_df_ISJ = get_kde_isj_density_df(cur_df)
clusters_df_ISJ = get_isj_clusters_df(density_df_ISJ, cur_df)
end = timer()
kde_isj_time = end-start
print('Clustering using ISJ Bandwidth Selection Time Elapsed', kde_isj_time)

# Display the clusters (sorted by area, largest first)
clusters_df_ISJ

## Compare Sklearn vs. KDEpy runtimes

In [None]:
from sklearn.neighbors import KernelDensity
from KDEpy import FFTKDE

# Get the list of outliers and the timestamps for which to compute KDE
# (evenly spaced grid spanning the series, with 3X as many points as `cur_df` has rows)
outlier_pts = get_outlier_timestamps(cur_df)
samples = np.linspace(cur_df['unix_time'].min(),cur_df['unix_time'].max(), num=3*len(cur_df.index))

In [None]:
# Sklearn: fit an Epanechnikov KDE with a fixed bandwidth of 86400 and evaluate it on `samples`
start = timer()
kde = KernelDensity(kernel='epanechnikov', bandwidth=86400).fit(np.array(outlier_pts).reshape(-1,1))
# score_samples yields log-densities, hence the exp
density_sklearn = np.exp(kde.score_samples(samples.reshape(-1,1)))
sklearn_time = timer() - start

# KDEpy: same kernel and bandwidth, evaluated on the same grid
start = timer()
density_kdepy = FFTKDE(kernel='epa', bw=86400).fit(outlier_pts).evaluate(samples)
kdepy_time = timer() - start

# Both timings cover fitting plus evaluation
print("Sklearn Time:", sklearn_time)
print("KDEpy Time:", kdepy_time)

In [None]:
from KDEpy import FFTKDE
from scipy.stats import norm

# Generate a distribution and some data (standard normal)
dist = norm(loc=0, scale=1)
data = dist.rvs(2**8) # Generate 2**8 points
# Fit a Gaussian-kernel KDE with ISJ bandwidth selection and evaluate it without an explicit grid
model = FFTKDE(kernel='gaussian', bw='ISJ').fit(data)
x, y = model.evaluate()


# plt.plot(x, y)
# presumably the numeric bandwidth that ISJ selected for `data` - TODO confirm against the KDEpy docs
print(model.bw)

In [None]:
# Density estimated with scikit-learn, plotted against the sample index
plt.plot(density_sklearn)

In [None]:
# Density estimated with KDEpy, plotted against the sample index
plt.plot(density_kdepy)

In [None]:
from scipy import integrate

# Integrate the KDEpy density over the evaluation grid with Simpson's rule
# (presumably a sanity check that the area under the density is close to 1).
# `integrate.simps` was deprecated in favour of `integrate.simpson` and removed in SciPy 1.14,
# so use the new name when it exists and only fall back to the old one on older SciPy versions.
area = (getattr(integrate, 'simpson', None) or integrate.simps)(y=density_kdepy, x=samples)
area

In [None]:
# Load the stored outputs (series, density and clusters) of the future-prediction ISJ run for size 10080
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source
df = pd.read_pickle('../streaming_data/all_outlier_types/future_prediction_isj/outputs/size_10080/df.pickle')
density_df = pd.read_pickle('../streaming_data/all_outlier_types/future_prediction_isj/outputs/size_10080/density_df.pickle')
clusters_df = pd.read_pickle('../streaming_data/all_outlier_types/future_prediction_isj/outputs/size_10080/clusters_df.pickle')
clusters_df

In [None]:
# Render the series/density figure with the 10 largest-area clusters highlighted
get_timeseries_and_density_figure(df, density_df, clusters_df, k=10)

# Runtime per component across iterations

In [None]:
# Output folders of the three benchmarked pipeline variants
fixed_bandwidth_path = '../streaming_data/all_outlier_types/fixed_bandwidth/outputs/'
ISJ_bandwidth_path = '../streaming_data/all_outlier_types/ISJ/outputs/'
future_prediction_path = '../streaming_data/all_outlier_types/future_prediction_isj/outputs/'

# Series sizes that were benchmarked: size_start, size_start + step_size, ...
# NOTE(review): range() stops before `size_end`, so the size_10080 outputs loaded elsewhere in this
# notebook are not part of these tables - confirm that excluding the last size is intended.
size_start = 2016
size_end = 10080
step_size = 6

# Column-oriented accumulators (one list per column) for the two runtime tables
runtime_dict = {"size": [], "iteration": [], "prophet": [], "prophet_agg": [], 'kde_fixed': [], 'kde_isj': [], 'isj_bandwidth': []}
future_pred_runtime_dict = {"size": [], "iteration": [], "prophet": [], "prophet_agg": [], 'kde_isj': [], 'isj_bandwidth': []}

for size in range(size_start, size_end, step_size):
    size_dir = 'size_' + str(size)
    iteration = (size - size_start)/step_size + 1

    # Read the statistics recorded for this size by each pipeline variant
    with open(fixed_bandwidth_path + size_dir + '/runtime.json') as f:
        fixed_runtime = json.load(f)

    with open(ISJ_bandwidth_path + size_dir + '/stats.json') as f:
        isj_runtime = json.load(f)

    with open(future_prediction_path + size_dir + '/stats.json') as f:
        future_prediction_runtime = json.load(f)

    # The Prophet timings are averaged over the fixed-bandwidth and the ISJ run
    for column, value in (
        ('size', size),
        ('iteration', iteration),
        ('prophet', (fixed_runtime['raw_fit_time'] + isj_runtime['raw_fit_time'])/2),
        ('prophet_agg', (fixed_runtime['agg_fit_time'] + isj_runtime['agg_fit_time'])/2),
        ('kde_fixed', fixed_runtime['kde_time']),
        ('kde_isj', isj_runtime['kde_time']),
        ('isj_bandwidth', isj_runtime['bandwidth']),
    ):
        runtime_dict[column].append(value)

    for column, value in (
        ('size', size),
        ('iteration', iteration),
        ('prophet', future_prediction_runtime['raw_fit_time']),
        ('prophet_agg', future_prediction_runtime['agg_fit_time']),
        ('kde_isj', future_prediction_runtime['kde_time']),
        ('isj_bandwidth', future_prediction_runtime['bandwidth']),
    ):
        future_pred_runtime_dict[column].append(value)

# Per-iteration runtimes plus the total for each KDE variant
df_runtime = pd.DataFrame.from_dict(runtime_dict)
model_fit_runtime = df_runtime['prophet'] + df_runtime['prophet_agg']
df_runtime['total_runtime_fixed'] = model_fit_runtime + df_runtime['kde_fixed']
df_runtime['total_runtime_isj'] = model_fit_runtime + df_runtime['kde_isj']

df_runtime

In [None]:
# Totals over all iterations: both Prophet fitting stages vs. the KDE with ISJ bandwidth
model_fiting_time = df_runtime['prophet'].sum() + df_runtime['prophet_agg'].sum()
kde_time = df_runtime['kde_isj'].sum()
print("Total model fitting time:", model_fiting_time)
print("Total kde time:", kde_time)

In [None]:
# Per-iteration runtimes of the future-prediction run as a dataframe
df_runtime_future_pred = pd.DataFrame.from_dict(future_pred_runtime_dict)
df_runtime_future_pred

In [None]:
# Absolute runtime of every pipeline component, one line per component
for column, label in [
    ('prophet', 'Prophet Raw View Fit'),
    ('prophet_agg', 'Prophet Aggregate Views Fit'),
    ('kde_fixed', 'KDE Fixed Bandwidth'),
    ('kde_isj', 'KDE ISJ Bandwidth'),
]:
    plt.plot(df_runtime['iteration'], df_runtime[column], label=label)

plt.ylabel('Time (seconds)')
plt.xlabel('Iteration')
plt.legend()
plt.title('Total Runtime per component')
plt.tight_layout()
plt.savefig('../figures/benchmarking/runtime_per_component.svg')

In [None]:
# Share of the total runtime (fixed-bandwidth pipeline) spent in each component.
# The ratios are scaled by 100 so that the plotted values match the '%' in the title and the
# axis label (the unscaled ratios are fractions between 0 and 1).
for column, label in [
    ('prophet', 'Prophet Raw View Fit'),
    ('prophet_agg', 'Prophet Aggregate Views Fit'),
    ('kde_fixed', 'KDE Fixed Bandwidth'),
]:
    plt.plot(df_runtime['iteration'], 100 * df_runtime[column]/df_runtime['total_runtime_fixed'], label=label)

plt.title('% of Runtime per component (KDE Fixed Bandwidth)')
plt.ylabel('% of total runtime')
plt.xlabel('Iteration')
plt.legend()
plt.tight_layout()
plt.savefig('../figures/benchmarking/percent_runtime_per_component_fixed_bandwidth.svg')

In [None]:
# Share of the total runtime (ISJ-bandwidth pipeline) spent in each component.
# The ratios are scaled by 100 so that the plotted values match the '%' in the title and the
# axis label (the unscaled ratios are fractions between 0 and 1).
for column, label in [
    ('prophet', 'Prophet Raw View Fit'),
    ('prophet_agg', 'Prophet Aggregate Views Fit'),
    ('kde_isj', 'KDE ISJ Bandwidth'),
]:
    plt.plot(df_runtime['iteration'], 100 * df_runtime[column]/df_runtime['total_runtime_isj'], label=label)

plt.title('% of Runtime per component (KDE ISJ Bandwidth)')
plt.ylabel('% of total runtime')
plt.xlabel('Iteration')
plt.legend()
plt.tight_layout()
plt.savefig('../figures/benchmarking/percent_runtime_per_component_ISJ_bandwidth.svg')

In [None]:
# Bandwidth recorded for the ISJ run at each iteration
plt.plot(
    df_runtime['iteration'],
    df_runtime['isj_bandwidth'],
)
plt.title('Selected ISJ Bandwidth across iterations')
plt.ylabel('ISJ Bandwidth (seconds)')
plt.xlabel('Iteration')
plt.tight_layout()
plt.savefig('../figures/benchmarking/isj_bandwidth_vs_iterations.svg')

## Future Prediction

In [None]:
# Absolute runtime of every component of the future-prediction run, one line per component
for column, label in [
    ('prophet', 'Prophet Raw View Fit'),
    ('prophet_agg', 'Prophet Aggregate Views Fit'),
    ('kde_isj', 'KDE ISJ Bandwidth'),
]:
    plt.plot(df_runtime_future_pred['iteration'], df_runtime_future_pred[column], label=label)

plt.ylabel('Time (seconds)')
plt.xlabel('Iteration')
plt.legend()
plt.title('Total Runtime per component (With 1 week ahead prediction)')
plt.tight_layout()
plt.savefig('../figures/benchmarking/runtime_per_component_with_future_prediction.svg')

In [None]:
# Totals over all iterations of the future-prediction run: both Prophet fitting stages vs. the ISJ KDE
# (this reassigns `model_fiting_time` and `kde_time`)
model_fiting_time = df_runtime_future_pred['prophet'].sum() + df_runtime_future_pred['prophet_agg'].sum()
kde_time = df_runtime_future_pred['kde_isj'].sum()
print("Total model fitting time:", model_fiting_time)
print("Total kde time:", kde_time)