In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

from utils_data.synthetic_ts import generate_ts_data, generate_ts_panel
from utils_visualization import plot_residuals_outliers


class OnlineMovingAverage:
    """Moving-average forecaster over a fixed-size sliding window.

    The window contents and their running sum are kept as state; ``update``
    adjusts the sum incrementally instead of re-summing the window.

    Parameters
    ----------
    window_size : int
        Number of most recent observations that make up the average.
    """

    def __init__(self, window_size):
        self._window_size = window_size
        # Placeholder contents until ``fit`` supplies real observations.
        self._array = np.array([None] * window_size)
        self._sum = 0
        # Defined up front so that calling ``predict``/``update`` before
        # ``fit`` trips the assertion rather than raising AttributeError.
        self.fitted = False

    def fit(self, y_init):
        """Initialise the window with exactly ``window_size`` observations.

        Raises
        ------
        AssertionError
            If ``len(y_init)`` differs from the configured window size.
        """
        assert len(y_init) == self._window_size
        # Own a float copy: an integer window would truncate float values
        # written by ``update`` while the running sum keeps the fraction,
        # so the two would drift apart. The copy also detaches the state
        # from the caller's object.
        self._array = np.array(y_init, dtype=float)
        self._sum = np.sum(self._array)
        self.fitted = True

    def predict(self, h):
        """Return the current window mean repeated ``h`` times."""
        assert self.fitted
        y_hat = np.repeat(self._sum / len(self._array), h)
        return y_hat

    def update(self, val):
        """Slide the window forward by one observation ``val``."""
        assert self.fitted
        # Rolling by -1 moves the oldest observation to the last slot, where
        # it is subtracted from the running sum and then overwritten.
        self._array = np.roll(self._array, -1)
        self._sum += val
        self._sum -= self._array[-1]
        self._array[-1] = val

def rolling_window(Model, y, window_size):
    """Produce one-step-ahead forecasts for every position of ``y``.

    A ``Model(window_size)`` instance is fit on ``y[:window_size]``. Then,
    for each observation in order, a one-step forecast is taken *before*
    the observation is passed to ``model.update``. Every element of ``y``
    is fed through ``update``, including the leading ``window_size`` values
    the model was already fit on, so the forecasts for those leading
    positions come from a model that has already seen them.

    Parameters
    ----------
    Model : callable
        Called as ``Model(window_size)``; the result must provide
        ``fit(y_init)``, ``predict(h)`` and ``update(val)``.
    y : sequence of numbers
        Observations in time order; must support slicing and ``len``.
    window_size : int
        Passed to ``Model`` and used to slice the initial fit window.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(y)`` holding the forecasts.
    """
    model = Model(window_size)
    model.fit(y[:window_size])

    y_hat = np.empty(len(y), dtype=float)
    for idx, val in enumerate(y):
        # ``predict(1)`` yields a length-1 result; pull out the scalar
        # explicitly, because assigning a size-1 array into a scalar slot
        # is deprecated in recent NumPy releases.
        y_hat[idx] = np.ravel(model.predict(1))[0]
        model.update(val)

    return y_hat

def normalize(x):
    """Standardise ``x`` using a mean and std that ignore the two extremes.

    The location and scale are estimated from a sorted copy of ``x`` with
    its smallest and largest observation dropped, then applied to every
    element of ``x`` (extremes included). The input is not modified.

    Parameters
    ----------
    x : array-like of numbers
        One-dimensional sample; lists, NumPy arrays and pandas Series are
        accepted.

    Returns
    -------
    numpy.ndarray
        Float array with the same length and ordering as ``x``.

    Notes
    -----
    * With fewer than three observations nothing is trimmed, since dropping
      min and max would leave no data to estimate from.
    * If the trimmed spread is exactly zero, elements equal to the centre
      score 0 and all others score ``+inf``/``-inf`` according to their
      sign, instead of producing NaN.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return values.copy()

    # drop min-max observations for robustness
    ordered = np.sort(values)
    trimmed = ordered[1:-1] if len(ordered) > 2 else ordered

    deviations = values - np.mean(trimmed)
    scale = np.std(trimmed)
    if scale == 0:
        # Avoid 0/0 -> NaN: a point sitting on the centre is not unusual,
        # any other point is infinitely far away in units of a zero spread.
        return np.where(deviations > 0, np.inf,
                        np.where(deviations < 0, -np.inf, 0.0))
    return deviations / scale

def simple_outlier_detection(x, empirical=False):
    """Flag observations whose normalised score falls in either tail.

    ``x`` is first standardised with ``normalize``. A point is an outlier
    unless its score lies strictly between the lower and upper threshold.

    Parameters
    ----------
    x : array-like of numbers
        Values to score (for example forecast residuals).
    empirical : bool, default False
        If True, thresholds are the 0.1% and 99.9% quantiles of the scores
        themselves; otherwise they are the same quantiles of the standard
        normal distribution.

    Returns
    -------
    outlier : numpy.ndarray of bool
        True where the score is outside the open interval ``(q0, q1)``.
    score : numpy.ndarray
        The normalised values.
    threshold : list
        ``[q0, q1]``, the lower and upper cut-offs that were applied.
    """
    score = normalize(x)
    tails = [0.001, 0.999]
    if empirical:
        # Quantiles must come from the normalised scores so they are on the
        # same scale as the values they are compared with. Linear
        # interpolation is np.quantile's default.
        q0, q1 = np.quantile(score, q=tails)
    else:
        q0, q1 = norm.ppf(tails)

    # Written as "not inside" so that NaN scores are flagged as outliers.
    outlier = np.logical_not((score > q0) & (score < q1))
    threshold = [q0, q1]
    return outlier, score, threshold

def panel_outlier_detection(panel_df, empirical=False, window_size=5):
    """Run moving-average outlier detection for every unit in a panel.

    For each distinct ``u_id`` the unit's series is forecast with
    ``rolling_window`` / ``OnlineMovingAverage``, the residuals are scored
    with ``simple_outlier_detection``, and the per-unit results are merged
    back onto ``panel_df`` by ``u_id``.

    Parameters
    ----------
    panel_df : pandas.DataFrame
        Must contain the columns ``'u_id'`` and ``'y'``.
        NOTE(review): presumably one row per unit with an array-valued
        ``'y'`` cell; a layout with one scalar ``'y'`` per row and several
        rows per unit is handled as well. Confirm against
        ``generate_ts_panel``.
    empirical : bool, default False
        Forwarded to ``simple_outlier_detection``.
    window_size : int, default 5
        Moving-average window passed to ``rolling_window``.

    Returns
    -------
    pandas.DataFrame
        ``panel_df`` left-joined with the columns ``'y_hat'``,
        ``'outlier'`` and ``'outlier_score'``; each cell of these columns
        holds the full array for that unit.
    """
    df_dict = {'u_id': [], 'y_hat': [], 'outlier': [], 'outlier_score': []}
    for u_id in panel_df.u_id.unique():
        print("========= u_id: {} =========".format(u_id))
        # Select by the u_id column, not by index label: the index is not
        # guaranteed to coincide with u_id.
        ts = panel_df.loc[panel_df['u_id'] == u_id]

        if len(ts) == 1:
            # Single row for the unit: the cell itself holds the series.
            y = np.asarray(ts['y'].iloc[0], dtype=float)
        else:
            # Several rows for the unit: the column holds the series.
            y = ts['y'].to_numpy(dtype=float)

        y_hat = rolling_window(Model=OnlineMovingAverage, y=y, window_size=window_size)

        residuals = y - y_hat
        outlier, outlier_score, _ = simple_outlier_detection(residuals, empirical=empirical)

        df_dict['u_id'].append(u_id)
        df_dict['y_hat'].append(y_hat)
        df_dict['outlier'].append(outlier)
        df_dict['outlier_score'].append(outlier_score)

    outlier_df = pd.DataFrame.from_dict(df_dict)
    panel_df = panel_df.merge(outlier_df, on='u_id', how='left')
    return panel_df


In [2]:
# Single-series demo. generate_ts_data returns three values; only the first
# (the series y) is kept here.
y, _, _ = generate_ts_data(n_dates=10, seed=2)
# One-step-ahead moving-average forecasts with a window of 5 observations.
y_hat = rolling_window(OnlineMovingAverage, y, window_size=5)
residuals = y-y_hat
# empirical=False -> thresholds are standard-normal quantiles.
outlier, outlier_score, threshold = simple_outlier_detection(residuals, empirical=False)
plot_residuals_outliers(y, y_hat, outlier, outlier_score, threshold)

In [3]:
# Panel demo: build a panel (n_u_id=10, n_dates=20) and replace it with the
# version that carries the y_hat / outlier / outlier_score columns merged in
# by panel_outlier_detection.
# NOTE(review): no seed is passed here; whether the panel is reproducible
# depends on generate_ts_panel's defaults — confirm.
panel_df = generate_ts_panel(n_u_id=10, n_dates = 20)
panel_df = panel_outlier_detection(panel_df)



In [4]:
# Plot the detection result for one unit of the panel.
u_id = 0
# Pick the unit by its u_id value rather than by a hard-coded index label,
# so changing u_id above actually changes what is plotted.
unit_row = panel_df.loc[panel_df['u_id'] == u_id].iloc[0]
y = unit_row['y']
y_hat = unit_row['y_hat']
outlier = unit_row['outlier']
outlier_score = unit_row['outlier_score']
# Same standard-normal cut-offs that simple_outlier_detection applies when
# empirical=False.
plot_residuals_outliers(y, y_hat, outlier, outlier_score, threshold=norm.ppf([0.001, 0.999]))

# Group Anomaly Detection

In [None]:
# Helpers for the group-anomaly section.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# first cell once this section is known to run.
from utils_visualization import plot_mixtures
from utils_data.synthetic_mixtures import generate_mixtures

# Generate the mixtures frame with default arguments and plot it.
mixtures_df = generate_mixtures()
plot_mixtures(mixtures_df)

In [None]:
from models.ocsmm import OneClassSMM

# Training input: the 'distribution' column converted to a plain Python list.
Strain = mixtures_df['distribution'].tolist()

# Fit the one-class model with C=0.1 and gamma=0.1.
clf = OneClassSMM(C=0.1, gamma=0.1)
clf.fit(Strain)

# TODO: disabled evaluation sketch. X_train, X_test and X_outliers are not
# defined in the visible cells, so these lines cannot be enabled as they are.
# y_pred_train = clf.predict(X_train)
# y_pred_test = clf.predict(X_test)
# y_pred_outliers = clf.predict(X_outliers)
# n_error_train = y_pred_train[y_pred_train == -1].size
# n_error_test = y_pred_test[y_pred_test == -1].size

In [None]:
# Debug aid: show the Python type of the object returned for the
# 'distribution' column.
print("mixtures_df['distribution']", type(mixtures_df['distribution']))

In [None]:
# NOTE(review): repeats the Strain assignment made in the OneClassSMM cell
# above; probably a leftover scratch cell.
Strain = mixtures_df['distribution'].tolist()