# Anomaly detection

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the clustered log export; the preview below shows the columns
# (`cluster`, `log_id`, `Timestamp`) that the rest of the notebook relies on.
data = pd.read_csv('32018_02_25__19_10.csv')
# Alternative dataset:
#data = pd.read_csv('template12018_03_02__13_33.csv')


# Parse the whole column in a single vectorized call. The default
# errors='raise' is used on purpose: a malformed timestamp stops the notebook
# here instead of silently staying a string and breaking the min/max and
# window comparisons further down.
data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
data.head()

Unnamed: 0,cluster,log_id,Timestamp
0,1,2,2017-11-13 13:48:53.316592
1,1,4,2017-11-13 13:45:53.316592
2,1,6,2017-11-13 13:48:53.316592
3,1,7,2017-11-13 13:47:53.316592
4,1,8,2017-11-13 13:44:53.316592


In [3]:
# Put the rows in chronological order; log_id breaks ties between equal timestamps.
# (Ascending order is the sort_values default.)
data = data.sort_values(['Timestamp', 'log_id'])

In [4]:
# Windowing parameters.
w_size = 1.5  # window length in minutes (consumed as pd.Timedelta(minutes=w_size))
# Overlap between consecutive windows, expressed as a fraction of the window size.
# NOTE(review): the windowing loop advances by the full w_size and never reads
# this value, so a non-zero overlap currently has no effect.
recouvrement = 0.0 * w_size
# One feature column per distinct cluster id.
number_clusters = len(data.cluster.unique())
# Start with zero rows: a (1, n) block of zeros would remain in the matrix as
# a fake all-zero window and be clustered/scored like a real observation.
matrix = np.zeros((0, number_clusters))

In [5]:
# Time span covered by the data. The Series reductions run vectorized and skip
# missing timestamps, unlike the builtin max()/min() which iterate element by
# element and give order-dependent results when NaT is present.
t_H = data.Timestamp.max()
tmsp = data.Timestamp.min()

In [6]:
# Build the feature matrix: one row per consecutive time window, one column per
# cluster id, each cell holding how many log lines of that cluster fall inside
# the half-open window [start, start + w_size minutes).
window = pd.Timedelta(minutes=w_size)

# Fixed column order shared by every row. Looking counts up by cluster id keeps
# column j tied to the same cluster even when some clusters are absent from a
# window (padding zeros at the end would shift later counts into wrong columns).
cluster_ids = sorted(data['cluster'].unique().tolist())

window_rows = []
# Iterate on a local cursor so `tmsp` is left untouched and this cell can be
# re-run without side effects.
window_start = tmsp
# `<=` (not `<`) so rows stamped exactly t_H still get a window when the time
# span is an exact multiple of the window length, or when all timestamps are equal.
while window_start <= t_H:
    window_end = window_start + window
    in_window = (data['Timestamp'] >= window_start) & (data['Timestamp'] < window_end)
    counts = Counter(data.loc[in_window, 'cluster'].tolist())
    # Counter returns 0 for clusters that do not occur in this window.
    window_rows.append([counts[cluster_id] for cluster_id in cluster_ids])
    window_start = window_end

# Assemble once instead of concatenating inside the loop (which copies the
# whole matrix on every iteration). The matrix is rebuilt from scratch, so it
# contains only real windows.
matrix = np.array(window_rows, dtype=float).reshape(-1, len(cluster_ids))

# K-Means 
Reference: [scikit-learn outlier detection example](http://scikit-learn.org/stable/auto_examples/covariance/plot_outlier_detection.html#sphx-glr-auto-examples-covariance-plot-outlier-detection-py)

In [7]:
from sklearn.cluster import KMeans
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Switch plotly to notebook (offline) rendering before any figure is drawn.
init_notebook_mode(connected=True)


# Group the window-count rows into 10 clusters; the fixed random_state keeps
# the result repeatable between runs.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(matrix)
# Cluster centres, one row per cluster, and the cluster label of every row.
centers = kmeans.cluster_centers_
y_kmeans = kmeans.predict(matrix)

In [8]:
# One hue per K-Means cluster. The palette is sized by the number of clusters,
# not the number of samples: with one hue per sample, the handful of labels
# would index the first few, nearly identical, hues. endpoint=False avoids
# generating both 0 and 360 degrees, which are the same colour.
n_groups = len(centers)
c_palette = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 360, n_groups, endpoint=False)]
c = [c_palette[y] for y in y_kmeans]

# Only the first three feature columns are drawn (x, y, z); any further
# columns of `matrix` are not visible in this projection.
centers_trace = go.Scatter3d(
    x=centers[:,0],
    y=centers[:,1],
    z=centers[:,2],
    mode='markers',
    name='cluster centers',
    marker=dict(
        size=12,
        line=dict(
            color='rgba(217, 217, 217, 0.14)',
            width=0.5
        ),
        opacity=0.8
    )
)

# One marker per window, coloured by its K-Means label.
trace2 = go.Scatter3d(
    x=matrix[:,0],
    y=matrix[:,1],
    z=matrix[:,2],
    mode='markers',
    name='windows',
    marker=dict(
        color=c,
        size=6,
        symbol='circle',
        line=dict(
            color='rgb(204, 204, 204)',
            width=0.1
        ),
        opacity=0.9
    )
)

fig = go.Figure(data=[centers_trace, trace2])
iplot(fig, filename='simple-3d-scatter')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


# OneClassSVM

In [46]:
from sklearn import svm
# Share of rows assumed to be anomalous; it is scaled by 0.99 to give the `nu` parameter.
outliers_fraction = 0.001
# One-class SVM with an RBF kernel, fitted on and then applied to the same rows.
svm_ = svm.OneClassSVM(
    kernel="rbf",
    gamma=0.01,
    nu=0.99 * outliers_fraction,
)
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_svm = svm_.fit(matrix).predict(matrix)

In [47]:
# OneClassSVM.predict labels every row -1 (outlier) or +1 (inlier).
# Always build exactly two colours: index 0 for outliers, index 1 for inliers.
# endpoint=False yields hues 0 and 180; including the endpoint would give 0 and
# 360, which are the same colour and make the two classes indistinguishable.
# A fixed-size palette also avoids an IndexError when only one label occurs.
c_palette = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 360, 2, endpoint=False)]
# (y + 1) / 2 maps -1 -> 0 and +1 -> 1.
c = [c_palette[int((y+1)/2)] for y in y_svm]

In [48]:
# 3-D scatter of the rows of `matrix` (first three columns), coloured by `c`.
# NOTE: despite its name, `centers_trace` holds the plotted samples here,
# not cluster centres.
point_outline = dict(color='rgb(204, 204, 204)', width=0.1)
point_marker = dict(
    color=c,
    size=6,
    symbol='circle',
    line=point_outline,
    opacity=0.9,
)
centers_trace = go.Scatter3d(
    x=matrix[:, 0],
    y=matrix[:, 1],
    z=matrix[:, 2],
    mode='markers',
    marker=point_marker,
)

fig = go.Figure(data=[centers_trace])
iplot(fig, filename='simple-3d-scatter')