In [1]:
import pandas as pd
import numpy as np
import scipy.io
from anomaly_detector import AnomalyDetector
from metrics import AnomalyDetectorEvaluator

In [2]:
# Load the shuttle dataset from its MATLAB file: the 'X' entry becomes the
# feature frame and the 'y' entry the evaluation labels.
shuttle = scipy.io.loadmat("shuttle.mat")
shuttle_data, shuttle_eval = (pd.DataFrame(shuttle[key]) for key in ("X", "y"))

# Load the HTTP dataset, which ships as two separate CSV files.
http_data, http_eval = (
    pd.read_csv(csv_path) for csv_path in ("http_train.csv", "http_eval.csv")
)

In [3]:
# Collapse both evaluation label containers into flat 1-D numpy arrays.
shuttle_eval, http_eval = (
    np.array(labels).flatten() for labels in (shuttle_eval, http_eval)
)

In [27]:
#shuttle_data

In [4]:
# K-means (2 clusters) on the shuttle features.
kmeans = AnomalyDetector(model="kmeans", n_clusters=2)
labels_shuttle_kmeans, distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)

# Convert the raw detector outputs with the AnomalyDetector helpers before
# evaluation (the exact mapping is defined in anomaly_detector).
labels_shuttle_kmeans = AnomalyDetector.transform_labels(labels_shuttle_kmeans)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(distances_shuttle_kmeans)
evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(shuttle_eval, labels_shuttle_kmeans, distances_shuttle_kmeans)

# Compute and display the same four headline metrics as the other evaluation
# cells, so that a fresh run actually produces the 4-tuple saved as this
# cell's output.
accuracy_shuttle_kmeans = evaluator_shuttle_kmeans.calculate_accuracy()
recall_shuttle_kmeans = evaluator_shuttle_kmeans.calculate_recall()
precision_shuttle_kmeans = evaluator_shuttle_kmeans.calculate_precision()
auc_pr_shuttle_kmeans = evaluator_shuttle_kmeans.calculate_auc_pr()
accuracy_shuttle_kmeans, recall_shuttle_kmeans, precision_shuttle_kmeans, auc_pr_shuttle_kmeans

(0.786830152555146,
 0.01708914839077186,
 0.008480565371024734,
 0.8408880005041621)

In [5]:
# Full metric report for the k-means run on the shuttle data; the call's
# return value (a dict of metrics) is displayed as the cell output.
evaluator_shuttle_kmeans.calculate_all_metrics()

{'accuracy': 0.786830152555146,
 'outliers_accuracy': 0.01708914839077186,
 'precision': 0.008480565371024734,
 'recall': 0.01708914839077186,
 'precision_recall_curve': (array([0.0715115 , 0.97637475, 1.        ]),
  array([1.        , 0.68271148, 0.        ])),
 'auc_pr': 0.8408880005041621,
 'confusion_matrix_percentage':                           Positive Prediction        Negative Prediction
 0  Positive Class    True Positive (TP) 0.12%  False Negative (FN) 7.03%
 1  Negative Class  False Positive (FP) 14.29%  True Negative (TN) 78.56%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall   1.71%
 1     Negative Recall  84.61%
 2  Positive Precision   0.85%
 3  Negative Precision  91.79%}

In [6]:
# Refit k-means on the shuttle data. fit_predict returns raw outputs, so the
# same transforms used in the first k-means cell are re-applied here; otherwise
# labels_shuttle_kmeans / distances_shuttle_kmeans would silently switch from
# transformed to raw values depending on which cell ran last.
labels_shuttle_kmeans, distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)
labels_shuttle_kmeans = AnomalyDetector.transform_labels(labels_shuttle_kmeans)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(distances_shuttle_kmeans)

In [7]:
# DBSCAN on the shuttle features; no hyperparameters beyond the model name
# are passed to the detector.
dbscan = AnomalyDetector(model="dbscan")
labels_shuttle_dbscan, distances_shuttle_dbscan = dbscan.fit_predict(data=shuttle_data)

# Convert raw outputs with the AnomalyDetector helpers before evaluation.
labels_shuttle_dbscan, distances_shuttle_dbscan = (
    AnomalyDetector.transform_labels(labels_shuttle_dbscan),
    AnomalyDetector.transform_distances(distances_shuttle_dbscan),
)
evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_dbscan,
    distances_shuttle_dbscan,
)

# Headline metrics, evaluated in the order accuracy, recall, precision, AUC-PR.
(
    accuracy_shuttle_dbscan,
    recall_shuttle_dbscan,
    precision_shuttle_dbscan,
    auc_pr_shuttle_dbscan,
) = (
    evaluator_shuttle_dbscan.calculate_accuracy(),
    evaluator_shuttle_dbscan.calculate_recall(),
    evaluator_shuttle_dbscan.calculate_precision(),
    evaluator_shuttle_dbscan.calculate_auc_pr(),
)
accuracy_shuttle_dbscan, recall_shuttle_dbscan, precision_shuttle_dbscan, auc_pr_shuttle_dbscan

(0.9939711183982728,
 0.9584164055824551,
 0.957325746799431,
 0.19496838432501365)

In [None]:
# (rows, columns) of the shuttle feature frame.
shuttle_data.shape

In [None]:

# Agglomerative clustering on the shuttle data, run on consecutive row batches
# of at most `batch_size` rows. A fresh detector is fitted per batch.
batch_size = 10000

all_labels, all_distances = [], []

for start in range(0, len(shuttle_data), batch_size):
    end = start + batch_size
    batch_data = shuttle_data.iloc[start:end]  # positional row slice

    agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    # Transform each batch's raw outputs, then append them in row order.
    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels, all_distances = np.array(all_labels), np.array(all_distances)

# Evaluate the concatenated per-batch results against the shuttle labels.
evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(shuttle_eval, all_labels, all_distances)
(
    accuracy_shuttle_agglomerative,
    recall_shuttle_agglomerative,
    precision_shuttle_agglomerative,
    auc_pr_shuttle_agglomerative,
) = (
    evaluator_shuttle_agglomerative.calculate_accuracy(),
    evaluator_shuttle_agglomerative.calculate_recall(),
    evaluator_shuttle_agglomerative.calculate_precision(),
    evaluator_shuttle_agglomerative.calculate_auc_pr(),
)

accuracy_shuttle_agglomerative, recall_shuttle_agglomerative, precision_shuttle_agglomerative, auc_pr_shuttle_agglomerative

In [None]:
# (rows, columns) of the HTTP feature frame.
http_data.shape

In [None]:
# Agglomerative clustering (2 clusters) fitted on the whole shuttle frame in
# one call. This rebinds evaluator_shuttle_agglomerative and the four
# *_shuttle_agglomerative metric names.
agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
labels_shuttle_agglomerative, distances_shuttle_agglomerative = agglomerative.fit_predict(data=shuttle_data)

# Convert raw outputs with the AnomalyDetector helpers before evaluation.
labels_shuttle_agglomerative, distances_shuttle_agglomerative = (
    AnomalyDetector.transform_labels(labels_shuttle_agglomerative),
    AnomalyDetector.transform_distances(distances_shuttle_agglomerative),
)
evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_agglomerative,
    distances_shuttle_agglomerative,
)

(
    accuracy_shuttle_agglomerative,
    recall_shuttle_agglomerative,
    precision_shuttle_agglomerative,
    auc_pr_shuttle_agglomerative,
) = (
    evaluator_shuttle_agglomerative.calculate_accuracy(),
    evaluator_shuttle_agglomerative.calculate_recall(),
    evaluator_shuttle_agglomerative.calculate_precision(),
    evaluator_shuttle_agglomerative.calculate_auc_pr(),
)
accuracy_shuttle_agglomerative, recall_shuttle_agglomerative, precision_shuttle_agglomerative, auc_pr_shuttle_agglomerative

In [None]:
# K-means (2 clusters) on the HTTP features. Note that `kmeans` is rebound to
# a new detector here.
kmeans = AnomalyDetector(model="kmeans", n_clusters=2)
labels_http_kmeans, distances_http_kmeans = kmeans.fit_predict(data=http_data)

# Convert raw outputs with the AnomalyDetector helpers before evaluation.
labels_http_kmeans, distances_http_kmeans = (
    AnomalyDetector.transform_labels(labels_http_kmeans),
    AnomalyDetector.transform_distances(distances_http_kmeans),
)
evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval,
    labels_http_kmeans,
    distances_http_kmeans,
)

(
    accuracy_http_kmeans,
    recall_http_kmeans,
    precision_http_kmeans,
    auc_pr_http_kmeans,
) = (
    evaluator_http_kmeans.calculate_accuracy(),
    evaluator_http_kmeans.calculate_recall(),
    evaluator_http_kmeans.calculate_precision(),
    evaluator_http_kmeans.calculate_auc_pr(),
)
accuracy_http_kmeans, recall_http_kmeans, precision_http_kmeans, auc_pr_http_kmeans

In [None]:
# Load the memory_profiler IPython extension (it provides the %memit / %mprun
# magics). NOTE(review): no cell visible in this notebook uses those magics.
%load_ext memory_profiler

In [None]:
# DBSCAN on the HTTP features; no hyperparameters beyond the model name are
# passed to the detector.
dbscan = AnomalyDetector(model="dbscan")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)

# Transform the DBSCAN outputs themselves. The previous version re-transformed
# the k-means variables here (copy-paste slip), which left the DBSCAN outputs
# raw when passed to the evaluator and transformed the k-means results twice.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)

evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)
accuracy_http_dbscan = evaluator_http_dbscan.calculate_accuracy()
recall_http_dbscan = evaluator_http_dbscan.calculate_recall()
precision_http_dbscan = evaluator_http_dbscan.calculate_precision()
auc_pr_http_dbscan = evaluator_http_dbscan.calculate_auc_pr()

# Display the metrics, as the other evaluation cells do.
accuracy_http_dbscan, recall_http_dbscan, precision_http_dbscan, auc_pr_http_dbscan

In [None]:
# Agglomerative clustering (2 clusters) fitted on the whole HTTP frame in one
# call.
agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
labels_http_agglomerative, distances_http_agglomerative = agglomerative.fit_predict(data=http_data)

# Apply the same label/distance transforms every other evaluation cell uses
# (including the batched HTTP run); previously the raw fit_predict outputs were
# handed straight to the evaluator.
labels_http_agglomerative = AnomalyDetector.transform_labels(labels_http_agglomerative)
distances_http_agglomerative = AnomalyDetector.transform_distances(distances_http_agglomerative)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, labels_http_agglomerative, distances_http_agglomerative)
accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

In [None]:
# Agglomerative clustering on the HTTP data, run on consecutive row batches of
# at most `batch_size` rows. A fresh detector is fitted per batch. This cell
# rebinds batch_size, all_labels and all_distances.
batch_size = 20000

all_labels, all_distances = [], []

for start in range(0, len(http_data), batch_size):
    end = start + batch_size
    batch_data = http_data.iloc[start:end]  # positional row slice

    agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    # Transform each batch's raw outputs, then append them in row order.
    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels, all_distances = np.array(all_labels), np.array(all_distances)

# Evaluate the concatenated per-batch results against the HTTP labels.
evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, all_labels, all_distances)
(
    accuracy_http_agglomerative,
    recall_http_agglomerative,
    precision_http_agglomerative,
    auc_pr_http_agglomerative,
) = (
    evaluator_http_agglomerative.calculate_accuracy(),
    evaluator_http_agglomerative.calculate_recall(),
    evaluator_http_agglomerative.calculate_precision(),
    evaluator_http_agglomerative.calculate_auc_pr(),
)

accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

In [None]:
# Display the label array from whichever batching cell ran last: the name
# all_labels is assigned by both the shuttle and the HTTP batch cells.
all_labels