# Anomaly detection on a time series
We will use three unsupervised machine learning algorithms to detect anomalies in a time series:


KNN (source: Sridhar Ramaswamy, Rajeev Rastogi, and Kyuseok Shim. "Efficient algorithms for mining outliers from large data sets." In Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), pages 427–438, 2000.)

In [71]:

import pandas as pd

from sklearn.metrics import roc_auc_score

import numpy as np

from pyod.models.knn import KNN


## Define functions to use

### Some helper functions

In [72]:
# Compute the contamination ratio of a labeled time series.
def computeContamination(filepath):
    """Return the fraction of rows flagged as anomalous in a labeled CSV.

    The labels are read from the 'anomaly' column when it exists and from
    the 'is_anomaly' column otherwise.

    Parameters
    ----------
    filepath
        Anything accepted by ``pd.read_csv`` (path or file-like object).

    Returns
    -------
    float
        ``labels.sum() / len(labels)``, or the smallest positive float when
        that ratio is zero or negative, so the result is never exactly 0.
    """
    frame = pd.read_csv(filepath)
    label_column = 'anomaly' if 'anomaly' in frame.columns else 'is_anomaly'
    flags = frame[label_column].to_numpy()

    ratio = flags.sum() / len(flags)
    if ratio <= 0.:
        # No anomalies in the dataset: fall back to the smallest positive float.
        return np.nextafter(0, 1)
    return ratio


In [73]:
# Compute the ROC-AUC of the scores stored in a labeled dataframe.
def compute_auc(tm_dataframe):
    """Return the ROC-AUC of the 'scores' column against the labels.

    The labels are read from the 'anomaly' column when it exists and from
    the 'is_anomaly' column otherwise.

    Parameters
    ----------
    tm_dataframe : pandas.DataFrame
        Frame holding a label column and a 'scores' column.

    Returns
    -------
    float
        The value of ``roc_auc_score(labels, scores)``, or ``np.nan`` when
        the labels do not contain more than one distinct value.
    """
    label_column = 'anomaly' if 'anomaly' in tm_dataframe.columns else 'is_anomaly'
    truth = tm_dataframe[label_column].to_numpy()
    predicted = tm_dataframe['scores'].to_numpy()

    # With a single class present the AUC cannot be computed.
    if len(set(truth)) <= 1:
        return np.nan
    return roc_auc_score(truth, predicted)


In [74]:
def Knn(knn_params, filepath):
    """Score every row of a CSV time series with a pyod KNN outlier detector.

    Parameters
    ----------
    knn_params : dict
        Detector settings. 'n_neighbors' is required. The optional keys
        'leaf_size', 'distance_metric_order' (forwarded as ``p``) and
        'n_jobs' override the defaults 20, 2 and 1. Any other key is ignored.
    filepath : str
        Path of the CSV file. It must hold an 'anomaly' or 'is_anomaly'
        column, because the contamination is derived from it by
        ``computeContamination``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra 'scores' column containing the
        output of ``decision_function`` on the fitted data.
    """
    # Load the time series.
    df_timeserie = pd.read_csv(filepath)

    # The contamination handed to the detector is capped at 0.5.
    contamination = computeContamination(filepath)
    if contamination > 0.5:
        print('contamination is more than 0.5, capping it at 0.5')
        contamination = 0.5

    # Build the feature matrix to fit the model on.
    if df_timeserie.iloc[:, 0].dtypes == object:
        # Non-numeric first column: use the row position and the second
        # column as the two features.
        positions = df_timeserie.index.to_numpy().reshape(-1, 1)
        values = df_timeserie.iloc[:, 1].to_numpy().reshape(-1, 1)
        X = np.concatenate((positions, values), axis=1)
    else:
        # NOTE(review): only the first column is used here -- confirm that it
        # is the value column and not a numeric timestamp.
        X = df_timeserie.iloc[:, 0:1].to_numpy()

    # Fit the model; optional settings fall back to the former fixed values.
    knn = KNN(
        contamination=contamination,
        n_neighbors=knn_params['n_neighbors'],
        leaf_size=knn_params.get('leaf_size', 20),
        p=knn_params.get('distance_metric_order', 2),
        n_jobs=knn_params.get('n_jobs', 1),
    )
    knn.fit(X)

    # Store the outlier score of every row next to the data.
    df_timeserie['scores'] = knn.decision_function(X)
    return df_timeserie


## Anomaly detection

In [75]:
# CSV file holding the time series to score.
filename = "1.test.csv"

In [76]:
# General parameters.
# They are not optimized; they only illustrate how KNN can be used to detect
# or score points in a time series.
knnParam = dict(
    n_neighbors=150,
    leaf_size=20,
    distance_metric_order=2,
    n_jobs=1,
    random_state=42,
)

In [77]:
# Fit KNN on the series; the returned dataframe carries a 'scores' column.
df_knn = Knn(knnParam, filename )

In [78]:
# ROC-AUC of the KNN scores against the labels (np.nan when the labels
# contain a single class).
auc = compute_auc(df_knn)
print(auc)

0.5351980460921844
