In [1]:
import pandas as pd
import numpy as np
import scipy.io
from anomaly_detector import AnomalyDetector
from metrics import AnomalyDetectorEvaluator

In [2]:
# Shuttle benchmark: one MATLAB file provides both matrices used below.
shuttle = scipy.io.loadmat("shuttle.mat")
# 'X' is what the detectors are fitted on; 'y' is handed to the evaluator.
shuttle_data, shuttle_eval = (pd.DataFrame(shuttle[key]) for key in ("X", "y"))

# HTTP benchmark: two CSV files, read in the same (data, reference) order.
http_data, http_eval = (
    pd.read_csv(csv_path) for csv_path in ("http_train.csv", "http_eval.csv")
)

In [3]:
# Collapse each reference-label container into a flat 1-D NumPy array, the
# form in which it is later passed to AnomalyDetectorEvaluator.
# NOTE(review): flatten() walks every column row by row, so this assumes each
# source holds a single label column — TODO confirm for http_eval.csv.
shuttle_eval, http_eval = (
    np.array(reference).flatten() for reference in (shuttle_eval, http_eval)
)

In [27]:
# Disabled preview of the shuttle feature frame; left commented out so the
# cell produces no output.
#shuttle_data

In [6]:
# K-means with two clusters, fitted on the shuttle features.
kmeans = AnomalyDetector(n_clusters=2, model="kmeans")
raw_labels_shuttle_kmeans, raw_distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)

# Pass the raw detector output through the class-level conversion helpers
# before it is scored.
labels_shuttle_kmeans = AnomalyDetector.transform_labels(raw_labels_shuttle_kmeans)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(raw_distances_shuttle_kmeans)

# Evaluator argument order: reference labels, converted labels, converted distances.
evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_kmeans,
    distances_shuttle_kmeans,
)

In [7]:
# Display the full metric dictionary for k-means on the shuttle data.
# NOTE(review): in the recorded output 'outliers_accuracy' and 'recall' are the
# same number, and the precision-recall curve has only three points — confirm
# that the evaluator is scoring continuous values rather than binary ones.
evaluator_shuttle_kmeans.calculate_all_metrics()

{'accuracy': 0.9960893740961769,
 'outliers_accuracy': 0.9461691825690687,
 'precision': 0.9990977443609023,
 'recall': 0.9461691825690687,
 'precision_recall_curve': (array([0.0715115 , 0.46924644, 1.        ]),
  array([1.        , 0.32811165, 0.        ])),
 'auc_pr': 0.42270291358630463,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.77%  False Negative (FN) 0.38%
 1  Negative Class  False Positive (FP) 0.01%  True Negative (TN) 92.84%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  94.62%
 1     Negative Recall  99.99%
 2  Positive Precision  99.91%
 3  Negative Precision  99.59%}

In [8]:
# Re-run the k-means fit on the shuttle data. The raw output is kept under its
# own names and then converted with the same helpers used when the evaluator
# was built, so labels_shuttle_kmeans / distances_shuttle_kmeans never end up
# holding untransformed values after this cell has run.
raw_labels_shuttle_kmeans, raw_distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)
labels_shuttle_kmeans = AnomalyDetector.transform_labels(raw_labels_shuttle_kmeans)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(raw_distances_shuttle_kmeans)

In [9]:
# DBSCAN on the shuttle features; no extra keyword arguments are passed to the
# detector.
dbscan = AnomalyDetector(model="dbscan")
raw_labels_shuttle_dbscan, raw_distances_shuttle_dbscan = dbscan.fit_predict(data=shuttle_data)

# Pass the raw detector output through the class-level conversion helpers
# before it is scored.
labels_shuttle_dbscan = AnomalyDetector.transform_labels(raw_labels_shuttle_dbscan)
distances_shuttle_dbscan = AnomalyDetector.transform_distances(raw_distances_shuttle_dbscan)

evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_dbscan,
    distances_shuttle_dbscan,
)


In [10]:
# Display the full metric dictionary for DBSCAN on the shuttle data.
evaluator_shuttle_dbscan.calculate_all_metrics()

{'accuracy': 0.9939711183982728,
 'outliers_accuracy': 0.9584164055824551,
 'precision': 0.957325746799431,
 'recall': 0.9584164055824551,
 'precision_recall_curve': (array([0.0715115 , 0.19307536, 1.        ]),
  array([1.        , 0.13500427, 0.        ])),
 'auc_pr': 0.19496838432501365,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.85%  False Negative (FN) 0.30%
 1  Negative Class  False Positive (FP) 0.31%  True Negative (TN) 92.54%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  95.84%
 1     Negative Recall  99.67%
 2  Positive Precision  95.73%
 3  Negative Precision  99.68%}

In [11]:

# Agglomerative clustering is run on consecutive row slices of the shuttle
# data rather than on the whole frame. Every slice gets its own freshly built
# detector, so clusters are computed independently per slice.
batch_size = 10000

all_labels, all_distances = [], []

for offset in range(0, len(shuttle_data), batch_size):
    batch_data = shuttle_data[offset:offset + batch_size]

    agglomerative = AnomalyDetector(n_clusters=2, model="agglomerative")
    raw_labels, raw_distances = agglomerative.fit_predict(data=batch_data)

    # Convert this slice's output first, then append it in row order.
    converted_labels = AnomalyDetector.transform_labels(raw_labels)
    all_labels.extend(converted_labels)
    converted_distances = AnomalyDetector.transform_distances(raw_distances)
    all_distances.extend(converted_distances)

# Turn the accumulated per-row results into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(
    shuttle_eval,
    all_labels,
    all_distances,
)

In [12]:
# Display the full metric dictionary for the agglomerative evaluator on the
# shuttle data (the recorded output follows the batched run).
evaluator_shuttle_agglomerative.calculate_all_metrics()

{'accuracy': 0.9958245921339389,
 'outliers_accuracy': 0.9441754485901452,
 'precision': 0.9972924187725631,
 'recall': 0.9441754485901452,
 'precision_recall_curve': (array([0.0715115 , 0.51771894, 1.        ]),
  array([1.        , 0.36200513, 0.        ])),
 'auc_pr': 0.46267401827965815,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.75%  False Negative (FN) 0.40%
 1  Negative Class  False Positive (FP) 0.02%  True Negative (TN) 92.83%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  94.42%
 1     Negative Recall  99.98%
 2  Positive Precision  99.73%
 3  Negative Precision  99.57%}

In [14]:
# Runs for an unknown (very long) time, and it is unclear what it is needed for.
# The recorded execution of this cell ended in a KeyboardInterrupt. If it does
# finish, it rebinds evaluator_shuttle_agglomerative, the same name the
# batched cell assigns.
agglomerative = AnomalyDetector(n_clusters=2, model="agglomerative")
raw_labels_shuttle_agglomerative, raw_distances_shuttle_agglomerative = agglomerative.fit_predict(
    data=shuttle_data
)

labels_shuttle_agglomerative = AnomalyDetector.transform_labels(raw_labels_shuttle_agglomerative)
distances_shuttle_agglomerative = AnomalyDetector.transform_distances(raw_distances_shuttle_agglomerative)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(
    shuttle_eval,
    labels_shuttle_agglomerative,
    distances_shuttle_agglomerative,
)

KeyboardInterrupt: 

# Zbiór danych HTTP 

In [15]:
# K-means with two clusters, fitted on the HTTP data. This rebinds the
# `kmeans` name that the shuttle section also uses.
kmeans = AnomalyDetector(n_clusters=2, model="kmeans")
raw_labels_http_kmeans, raw_distances_http_kmeans = kmeans.fit_predict(data=http_data)

# Pass the raw detector output through the class-level conversion helpers
# before it is scored.
labels_http_kmeans = AnomalyDetector.transform_labels(raw_labels_http_kmeans)
distances_http_kmeans = AnomalyDetector.transform_distances(raw_distances_http_kmeans)

evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval,
    labels_http_kmeans,
    distances_http_kmeans,
)

In [16]:
# Display the full metric dictionary for k-means on the HTTP data.
# NOTE(review): in the recorded output recall is about 0.5% and roughly 43% of
# rows are false positives, so the two clusters do not line up with the
# reference labels on this dataset — worth checking the label mapping.
evaluator_http_kmeans.calculate_all_metrics()

{'accuracy': 0.5663050794892669,
 'outliers_accuracy': 0.005427408412483039,
 'precision': 4.919363434371592e-05,
 'recall': 0.005427408412483039,
 'precision_recall_curve': (array([0.00389605, 0.07793169, 1.        ]),
  array([1., 1., 0.])),
 'auc_pr': 0.5389658454055197,
 'confusion_matrix_percentage':                           Positive Prediction        Negative Prediction
 0  Positive Class    True Positive (TP) 0.00%  False Negative (FN) 0.39%
 1  Negative Class  False Positive (FP) 42.98%  True Negative (TN) 56.63%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall   0.54%
 1     Negative Recall  56.85%
 2  Positive Precision   0.00%
 3  Negative Precision  99.32%}

In [None]:
# Load the memory_profiler IPython extension.
# NOTE(review): none of the visible cells uses its magics, and extension
# loading normally belongs in the setup cells at the top — confirm it is needed.
%load_ext memory_profiler

In [None]:
# DBSCAN on the HTTP data; no extra keyword arguments are passed to the
# detector.
dbscan = AnomalyDetector(model="dbscan")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)

# Convert the DBSCAN output itself. The k-means variables must not be touched
# here: transforming them would leave the evaluator below with raw DBSCAN
# output and would transform the k-means results a second time.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)

evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)

In [None]:
# Display the metrics computed by the DBSCAN evaluator for the HTTP data
# (no output was recorded for this cell).
evaluator_http_dbscan.calculate_all_metrics()

In [None]:
# Agglomerative clustering with two clusters on the complete HTTP data.
# NOTE(review): the equivalent full-data run on the shuttle set was interrupted
# by hand; this one may be just as slow — the batched cell is the alternative.
agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
labels_http_agglomerative, distances_http_agglomerative = agglomerative.fit_predict(data=http_data)

# Convert the raw detector output before scoring, as every other pipeline in
# this notebook does; the evaluator is always given transformed values.
labels_http_agglomerative = AnomalyDetector.transform_labels(labels_http_agglomerative)
distances_http_agglomerative = AnomalyDetector.transform_distances(distances_http_agglomerative)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, labels_http_agglomerative, distances_http_agglomerative)

# Individual metrics, kept under their own names.
accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

In [None]:
# Agglomerative clustering is run on consecutive row slices of the HTTP data
# rather than on the whole frame. Every slice gets its own freshly built
# detector, so clusters are computed independently per slice.
batch_size = 20000

all_labels, all_distances = [], []

for offset in range(0, len(http_data), batch_size):
    batch_data = http_data[offset:offset + batch_size]

    agglomerative = AnomalyDetector(n_clusters=2, model="agglomerative")
    raw_labels, raw_distances = agglomerative.fit_predict(data=batch_data)

    # Convert this slice's output first, then append it in row order.
    converted_labels = AnomalyDetector.transform_labels(raw_labels)
    all_labels.extend(converted_labels)
    converted_distances = AnomalyDetector.transform_distances(raw_distances)
    all_distances.extend(converted_distances)

# Turn the accumulated per-row results into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(
    http_eval,
    all_labels,
    all_distances,
)

# Individual metrics, computed in the order accuracy, recall, precision, AUC-PR.
accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

# Cell output: the four metrics as a tuple.
(
    accuracy_http_agglomerative,
    recall_http_agglomerative,
    precision_http_agglomerative,
    auc_pr_http_agglomerative,
)

In [None]:
# Display all_labels as last assigned (the batched HTTP cell is the most
# recent cell above that binds this name).
all_labels