In [37]:
# Import the required libraries

import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN, AgglomerativeClustering

%matplotlib inline
import warnings
# NOTE(review): this silences every warning category for the whole session,
# presumably to keep cell outputs uncluttered; it can also hide deprecation
# notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [38]:
# Load the data.
# The raw `timestamp` column is interpreted as a number of seconds (unit="s")
# and converted to datetimes in the same chained expression as the read.

anomaly_df = pd.read_csv("./data/anomaly_sample.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="s")
)
anomaly_df.head()

Unnamed: 0,timestamp,value,is_anomaly,predicted
0,2015-02-27 03:42:53,42,False,44.0725
1,2015-02-27 03:47:53,41,False,50.70939
2,2015-02-27 03:52:53,41,False,81.40512
3,2015-02-27 03:57:53,61,False,39.950367
4,2015-02-27 04:02:53,44,False,35.35016


In [39]:
# Take a look at the data: summary statistics of the loaded frame
anomaly_df.describe()

Unnamed: 0,timestamp,value,predicted
count,15830,15830.0,15830.0
mean,2015-03-26 15:15:23,85.572205,71.870715
min,2015-02-27 03:42:53,0.0,-281.38907
25%,2015-03-12 21:29:08,29.0,32.919171
50%,2015-03-26 15:15:23,47.0,49.771124
75%,2015-04-09 09:01:38,76.0,75.948052
max,2015-04-23 02:47:53,13479.0,2716.1272
std,,321.760918,92.45052


# Поищем аномалии с помощью DBSCAN

In [40]:
# DBSCAN with a neighbourhood radius (eps) of 1; min_samples is not passed,
# so the scikit-learn default applies.
dbscan = DBSCAN(eps=1)

In [41]:
# Cluster on the single `value` feature. Selecting the column with a list
# yields a 2-D (n_rows, 1) array, which is the shape fit_predict is given.
anomaly_df['prediction_dbscan_1'] = dbscan.fit_predict(
    anomaly_df[['value']].to_numpy()
)
anomaly_df.head()

Unnamed: 0,timestamp,value,is_anomaly,predicted,prediction_dbscan_1
0,2015-02-27 03:42:53,42,False,44.0725,0
1,2015-02-27 03:47:53,41,False,50.70939,0
2,2015-02-27 03:52:53,41,False,81.40512,0
3,2015-02-27 03:57:53,61,False,39.950367,0
4,2015-02-27 04:02:53,44,False,35.35016,0


In [42]:
# Detected anomalies: rows whose DBSCAN label is negative
# (the noise label is -1, as the output shows).
anomaly_df.loc[anomaly_df['prediction_dbscan_1'].lt(0)]

Unnamed: 0,timestamp,value,is_anomaly,predicted,prediction_dbscan_1
162,2015-02-27 17:12:53,456,True,89.710290,-1
163,2015-02-27 17:17:53,440,True,134.684600,-1
164,2015-02-27 17:22:53,477,True,126.210050,-1
1006,2015-03-02 15:32:53,346,True,68.731980,-1
1168,2015-03-03 05:02:53,446,True,71.947266,-1
...,...,...,...,...,...
15464,2015-04-21 20:22:53,2510,True,407.389860,-1
15465,2015-04-21 20:27:53,1299,True,388.840450,-1
15466,2015-04-21 20:32:53,714,False,456.416630,-1
15467,2015-04-21 20:37:53,576,False,323.319300,-1


---

#### Задачи модуля

In [43]:
# Task 5.1
# How many anomalies are detected by DBSCAN with the parameters:
# number of points in a cluster = 5, eps-neighbourhood = 2?

dbscan = DBSCAN(min_samples=5, eps=2)

anomaly_df["prediction_dbscan_2"] = dbscan.fit_predict(
    anomaly_df[["value"]].to_numpy()
)

# Rows with a negative label are the ones counted as anomalies.
len(anomaly_df.loc[anomaly_df["prediction_dbscan_2"].lt(0)])

236

In [44]:
# Task 5.2
# How many anomalies are detected by DBSCAN with the parameters:
# number of points in a cluster = 10, eps-neighbourhood = 5?

dbscan = DBSCAN(min_samples=10, eps=5)

anomaly_df["prediction_dbscan_3"] = dbscan.fit_predict(
    anomaly_df[["value"]].to_numpy()
)

# Rows with a negative label are the ones counted as anomalies.
len(anomaly_df.loc[anomaly_df["prediction_dbscan_3"].lt(0)])

226

---

# Поиск аномалий с помощью агломеративной кластеризации

In [45]:
# Agglomerative clustering without a fixed cluster count (n_clusters=None):
# the number of clusters is driven by distance_threshold instead. Per the
# scikit-learn docs, clusters are not merged once the linkage distance is at
# or above this threshold.
agglom = AgglomerativeClustering(n_clusters=None, distance_threshold=1)

In [46]:
# Cluster on the single `value` feature. Selecting the column with a list
# yields a 2-D (n_rows, 1) array, which is the shape fit_predict is given.
anomaly_df['prediction_agglom_1'] = agglom.fit_predict(
    anomaly_df[['value']].to_numpy()
)
anomaly_df.head()

Unnamed: 0,timestamp,value,is_anomaly,predicted,prediction_dbscan_1,prediction_dbscan_2,prediction_dbscan_3,prediction_agglom_1
0,2015-02-27 03:42:53,42,False,44.0725,0,0,0,2
1,2015-02-27 03:47:53,41,False,50.70939,0,0,0,0
2,2015-02-27 03:52:53,41,False,81.40512,0,0,0,0
3,2015-02-27 03:57:53,61,False,39.950367,0,0,0,15
4,2015-02-27 04:02:53,44,False,35.35016,0,0,0,1


In [47]:
# Treat a point as an anomaly when it is the only member of its cluster:
# collect the labels that occur exactly once, then show the matching rows.
valcount = set(
    anomaly_df['prediction_agglom_1']
    .value_counts()
    .loc[lambda sizes: sizes == 1]
    .index
)
display(anomaly_df.loc[anomaly_df['prediction_agglom_1'].isin(valcount)])


Unnamed: 0,timestamp,value,is_anomaly,predicted,prediction_dbscan_1,prediction_dbscan_2,prediction_dbscan_3,prediction_agglom_1
162,2015-02-27 17:12:53,456,True,89.710290,-1,2,1,630
164,2015-02-27 17:22:53,477,True,126.210050,-1,-1,-1,629
1168,2015-03-03 05:02:53,446,True,71.947266,-1,2,1,628
1360,2015-03-03 21:02:53,1698,True,101.339670,-1,-1,-1,607
1361,2015-03-03 21:07:53,3228,True,148.821490,-1,-1,-1,479
...,...,...,...,...,...,...,...,...
15465,2015-04-21 20:27:53,1299,True,388.840450,-1,-1,-1,462
15466,2015-04-21 20:32:53,714,False,456.416630,-1,-1,-1,390
15467,2015-04-21 20:37:53,576,False,323.319300,-1,-1,-1,396
15468,2015-04-21 20:42:53,490,False,348.968020,16,11,5,577


---
### Задачи модуля


In [48]:
# Task 5.3
# How many anomalies are detected by agglomerative clustering
# with a cut-off distance of 2?

agglom = AgglomerativeClustering(n_clusters=None, distance_threshold=2)

anomaly_df["prediction_agglom_2"] = agglom.fit_predict(anomaly_df["value"].values.reshape(-1, 1))

# A point is treated as an anomaly when it is the only member of its cluster.
cluster_sizes = anomaly_df["prediction_agglom_2"].value_counts()
valcount = set(cluster_sizes[cluster_sizes == 1].index)
display(anomaly_df[anomaly_df["prediction_agglom_2"].isin(valcount)])

# The task asks for a number, so end the cell with the count (as Tasks 5.1 and
# 5.2 do). Each singleton label belongs to exactly one row, so the number of
# singleton labels equals the number of anomalous rows.
len(valcount)

Unnamed: 0,timestamp,value,is_anomaly,predicted,prediction_dbscan_1,prediction_dbscan_2,prediction_dbscan_3,prediction_agglom_1,prediction_agglom_2
1360,2015-03-03 21:02:53,1698,True,101.33967,-1,-1,-1,607,303
1361,2015-03-03 21:07:53,3228,True,148.82149,-1,-1,-1,479,479
1362,2015-03-03 21:12:53,2234,True,209.43730,-1,-1,-1,318,318
1363,2015-03-03 21:17:53,1452,True,184.03752,-1,-1,-1,357,357
1364,2015-03-03 21:22:53,865,False,287.14713,-1,-1,-1,508,508
...,...,...,...,...,...,...,...,...,...
15463,2015-04-21 20:17:53,2826,True,382.82983,-1,-1,-1,453,453
15464,2015-04-21 20:22:53,2510,True,407.38986,-1,-1,-1,328,328
15465,2015-04-21 20:27:53,1299,True,388.84045,-1,-1,-1,462,462
15466,2015-04-21 20:32:53,714,False,456.41663,-1,-1,-1,390,390
