In [1]:
import pandas as pd
import numpy as np
import scipy.io
from anomaly_detector import AnomalyDetector
from metrics import AnomalyDetectorEvaluator

In [2]:
# Shuttle benchmark: one MATLAB file; 'X' becomes the feature table and 'y' the
# table that is later passed to the evaluator as the reference labels.
shuttle = scipy.io.loadmat("shuttle.mat")
shuttle_data = pd.DataFrame(data=shuttle['X'])
shuttle_eval = pd.DataFrame(data=shuttle['y'])

# HTTP benchmark: features and reference labels live in two separate CSV files.
http_data = pd.read_csv(filepath_or_buffer="http_train.csv")
http_eval = pd.read_csv(filepath_or_buffer="http_eval.csv")

In [3]:
# Turn each reference-label table into a flat 1-D NumPy array (an independent copy).
# Note that both names are rebound from DataFrame to ndarray by this cell.
shuttle_eval = np.asarray(shuttle_eval).flatten()
http_eval = np.asarray(http_eval).flatten()

In [4]:
# Preview the shuttle feature table (pandas abbreviates the middle rows in the display).
shuttle_data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,50,21,77,0,28,0,27,48,22
1,53,0,82,0,52,-5,29,30,2
2,37,0,76,0,28,18,40,48,8
3,37,0,79,0,34,-26,43,46,2
4,85,0,88,-4,6,1,3,83,80
...,...,...,...,...,...,...,...,...,...
49092,39,-2,80,-4,38,0,41,41,0
49093,43,0,81,1,42,-9,37,39,2
49094,49,0,87,0,46,-12,38,41,2
49095,80,0,84,0,-36,-29,4,120,116


In [5]:
# K-means (2 clusters) on the shuttle data: fit, pass the raw outputs through the
# AnomalyDetector transforms, then score them against shuttle_eval.
kmeans = AnomalyDetector(model="kmeans", n_clusters=2)
labels_shuttle_kmeans, distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)
labels_shuttle_kmeans, distances_shuttle_kmeans = (
    AnomalyDetector.transform_labels(labels_shuttle_kmeans),
    AnomalyDetector.transform_distances(distances_shuttle_kmeans),
)

evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_kmeans,
    distances_shuttle_kmeans,
)
(
    accuracy_shuttle_kmeans,
    recall_shuttle_kmeans,
    precision_shuttle_kmeans,
    auc_pr_shuttle_kmeans,
) = (
    evaluator_shuttle_kmeans.calculate_accuracy(),
    evaluator_shuttle_kmeans.calculate_recall(),
    evaluator_shuttle_kmeans.calculate_precision(),
    evaluator_shuttle_kmeans.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_shuttle_kmeans, recall_shuttle_kmeans, precision_shuttle_kmeans, auc_pr_shuttle_kmeans



(0.9960893740961769,
 0.9461691825690687,
 0.9990977443609023,
 0.42270291358630463)

In [None]:
# NOTE(review): this repeats the k-means fit on the shuttle data and rebinds
# labels_shuttle_kmeans / distances_shuttle_kmeans to the raw fit_predict outputs,
# i.e. without the transform_labels / transform_distances step. The cell has no
# execution count; confirm whether it is still needed.
labels_shuttle_kmeans, distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)

In [6]:
# DBSCAN on the shuttle data: fit, pass the raw outputs through the
# AnomalyDetector transforms, then score them against shuttle_eval.
dbscan = AnomalyDetector(model="dbscan")
labels_shuttle_dbscan, distances_shuttle_dbscan = dbscan.fit_predict(data=shuttle_data)
labels_shuttle_dbscan, distances_shuttle_dbscan = (
    AnomalyDetector.transform_labels(labels_shuttle_dbscan),
    AnomalyDetector.transform_distances(distances_shuttle_dbscan),
)

evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_dbscan,
    distances_shuttle_dbscan,
)
(
    accuracy_shuttle_dbscan,
    recall_shuttle_dbscan,
    precision_shuttle_dbscan,
    auc_pr_shuttle_dbscan,
) = (
    evaluator_shuttle_dbscan.calculate_accuracy(),
    evaluator_shuttle_dbscan.calculate_recall(),
    evaluator_shuttle_dbscan.calculate_precision(),
    evaluator_shuttle_dbscan.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_shuttle_dbscan, recall_shuttle_dbscan, precision_shuttle_dbscan, auc_pr_shuttle_dbscan

(0.9939711183982728,
 0.9584164055824551,
 0.957325746799431,
 0.19496838432501365)

In [16]:
# (rows, columns) of the shuttle feature table.
shuttle_data.shape

(49097, 9)

In [4]:

# Agglomerative clustering on the shuttle data, processed in consecutive row batches.
# Each batch is fitted on its own with a fresh 2-cluster detector; the transformed
# per-batch outputs are appended in row order and scored together against shuttle_eval.
batch_size = 10000

all_labels, all_distances = [], []

for start in range(0, len(shuttle_data), batch_size):
    # Positional row window [start, start + batch_size); the last one may be shorter.
    batch_data = shuttle_data.iloc[start:start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(shuttle_eval, all_labels, all_distances)
(
    accuracy_shuttle_agglomerative,
    recall_shuttle_agglomerative,
    precision_shuttle_agglomerative,
    auc_pr_shuttle_agglomerative,
) = (
    evaluator_shuttle_agglomerative.calculate_accuracy(),
    evaluator_shuttle_agglomerative.calculate_recall(),
    evaluator_shuttle_agglomerative.calculate_precision(),
    evaluator_shuttle_agglomerative.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_shuttle_agglomerative, recall_shuttle_agglomerative, precision_shuttle_agglomerative, auc_pr_shuttle_agglomerative

(0.9958245921339389,
 0.9441754485901452,
 0.9972924187725631,
 0.46267401827965815)

In [5]:
# (rows, columns) of the HTTP feature table.
http_data.shape

(567498, 3)

In [7]:
# Agglomerative clustering (2 clusters) on the whole shuttle table in a single fit,
# followed by the AnomalyDetector transforms and scoring against shuttle_eval.
# NOTE(review): the saved output of this cell is empty apart from a stray ": ";
# presumably the full-size run did not complete — confirm before relying on it.
# This cell also rebinds the *_shuttle_agglomerative names set by the batched run.
agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
labels_shuttle_agglomerative, distances_shuttle_agglomerative = agglomerative.fit_predict(
    data=shuttle_data
)
labels_shuttle_agglomerative, distances_shuttle_agglomerative = (
    AnomalyDetector.transform_labels(labels_shuttle_agglomerative),
    AnomalyDetector.transform_distances(distances_shuttle_agglomerative),
)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_agglomerative,
    distances_shuttle_agglomerative,
)
(
    accuracy_shuttle_agglomerative,
    recall_shuttle_agglomerative,
    precision_shuttle_agglomerative,
    auc_pr_shuttle_agglomerative,
) = (
    evaluator_shuttle_agglomerative.calculate_accuracy(),
    evaluator_shuttle_agglomerative.calculate_recall(),
    evaluator_shuttle_agglomerative.calculate_precision(),
    evaluator_shuttle_agglomerative.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_shuttle_agglomerative, recall_shuttle_agglomerative, precision_shuttle_agglomerative, auc_pr_shuttle_agglomerative

: 

In [None]:
# K-means (2 clusters) on the HTTP data: fit, pass the raw outputs through the
# AnomalyDetector transforms, then score them against http_eval.
kmeans = AnomalyDetector(model="kmeans", n_clusters=2)
labels_http_kmeans, distances_http_kmeans = kmeans.fit_predict(data=http_data)
labels_http_kmeans, distances_http_kmeans = (
    AnomalyDetector.transform_labels(labels_http_kmeans),
    AnomalyDetector.transform_distances(distances_http_kmeans),
)

evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval,
    labels_http_kmeans,
    distances_http_kmeans,
)
(
    accuracy_http_kmeans,
    recall_http_kmeans,
    precision_http_kmeans,
    auc_pr_http_kmeans,
) = (
    evaluator_http_kmeans.calculate_accuracy(),
    evaluator_http_kmeans.calculate_recall(),
    evaluator_http_kmeans.calculate_precision(),
    evaluator_http_kmeans.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_http_kmeans, recall_http_kmeans, precision_http_kmeans, auc_pr_http_kmeans

In [None]:
# Load the memory_profiler IPython extension (it provides the %memit / %%memit magics).
# NOTE(review): no profiling magic is used in the cells visible here; setup like this
# normally belongs in the configuration cell near the top of the notebook.
%load_ext memory_profiler

In [None]:
# DBSCAN on the HTTP data: fit, pass the raw outputs through the AnomalyDetector
# transforms, then score them against http_eval.
dbscan = AnomalyDetector(model="dbscan")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)

# The transforms must be applied to the DBSCAN outputs themselves. Transforming the
# k-means variables here would leave the evaluator with raw DBSCAN outputs and would
# transform the k-means results a second time.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)

evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)
accuracy_http_dbscan = evaluator_http_dbscan.calculate_accuracy()
recall_http_dbscan = evaluator_http_dbscan.calculate_recall()
precision_http_dbscan = evaluator_http_dbscan.calculate_precision()
auc_pr_http_dbscan = evaluator_http_dbscan.calculate_auc_pr()

# Display (accuracy, recall, precision, AUC-PR) as the cell output, like the other
# evaluation cells in this notebook.
accuracy_http_dbscan, recall_http_dbscan, precision_http_dbscan, auc_pr_http_dbscan

In [None]:
# Agglomerative clustering (2 clusters) on the whole HTTP table in a single fit.
# NOTE(review): http_data has 567,498 rows according to the saved `.shape` output;
# a single full-size fit is presumably impractical, which is likely why a batched
# variant of this evaluation exists in the notebook — confirm.
agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
labels_http_agglomerative, distances_http_agglomerative = agglomerative.fit_predict(data=http_data)

# Apply the same label/distance transforms that every other evaluation cell applies
# before scoring, so the evaluator receives outputs in the same form.
labels_http_agglomerative = AnomalyDetector.transform_labels(labels_http_agglomerative)
distances_http_agglomerative = AnomalyDetector.transform_distances(distances_http_agglomerative)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, labels_http_agglomerative, distances_http_agglomerative)
accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

# Display (accuracy, recall, precision, AUC-PR) as the cell output, like the other
# evaluation cells in this notebook.
accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

In [6]:
# Agglomerative clustering on the HTTP data, processed in consecutive row batches.
# Each batch gets its own freshly constructed 2-cluster detector; the transformed
# per-batch outputs are appended in row order and scored together against http_eval.
batch_size = 20000

all_labels, all_distances = [], []

for start in range(0, len(http_data), batch_size):
    # Positional row window [start, start + batch_size); the last one may be shorter.
    batch_data = http_data.iloc[start:start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, all_labels, all_distances)
(
    accuracy_http_agglomerative,
    recall_http_agglomerative,
    precision_http_agglomerative,
    auc_pr_http_agglomerative,
) = (
    evaluator_http_agglomerative.calculate_accuracy(),
    evaluator_http_agglomerative.calculate_recall(),
    evaluator_http_agglomerative.calculate_precision(),
    evaluator_http_agglomerative.calculate_auc_pr(),
)

# Display (accuracy, recall, precision, AUC-PR) as the cell output.
accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

(0.9944687029734025,
 0.9063772048846676,
 0.4059967585089141,
 0.054169680271749406)

In [5]:
# Inspect the concatenated batch labels.
# NOTE(review): `all_labels` is assigned by both batched agglomerative cells (shuttle
# and HTTP), so the array shown here depends on which of them ran last.
all_labels

array([0, 0, 0, ..., 0, 0, 0])