In [2]:
import pandas as pd
import numpy as np
import scipy.io
from anomaly_detector import AnomalyDetector
from metrics import AnomalyDetectorEvaluator

In [3]:
# Shuttle: one .mat file; its 'X' entry becomes the data frame and its 'y' entry
# the evaluation frame.
shuttle = scipy.io.loadmat("shuttle.mat")
shuttle_data, shuttle_eval = (pd.DataFrame(shuttle[key]) for key in ("X", "y"))

# HTTP: data and evaluation targets are read from two separate CSV files.
http_data, http_eval = (pd.read_csv(path) for path in ("http_train.csv", "http_eval.csv"))

In [3]:
# from subsampler import Subsampler

# # Using the Subsampler class
# shuttle_subsampler = Subsampler(shuttle_data, shuttle_eval)
# shuttle_X_ss, shuttle_y_ss = shuttle_subsampler.subsample(majority_fraction=0.1)

# http_subsampler = Subsampler(http_data, http_eval)
# http_X_ss, http_y_ss = http_subsampler.subsample(majority_fraction=0.1)

In [4]:
# Convert both evaluation frames into flat 1-D NumPy arrays (same names are rebound).
shuttle_eval, http_eval = (np.array(frame).flatten() for frame in (shuttle_eval, http_eval))

In [5]:
#shuttle_data

In [6]:
# K-means with two clusters and Euclidean distance on the shuttle data.
kmeans = AnomalyDetector(model="kmeans", metric="euclidean", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_kmeans, distances_shuttle_kmeans
)



In [7]:
# Report all evaluation metrics for k-means (Euclidean) on the shuttle data.
evaluator_shuttle_kmeans.calculate_all_metrics()

{'accuracy': 0.9960893740961769,
 'outliers_accuracy': 0.9461691825690687,
 'precision': 0.9990977443609023,
 'recall': 0.9461691825690687,
 'precision_recall_curve': (array([0.0715115 , 0.46924644, 1.        ]),
  array([1.        , 0.32811165, 0.        ])),
 'auc_pr': 0.42270291358630463,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.77%  False Negative (FN) 0.38%
 1  Negative Class  False Positive (FP) 0.01%  True Negative (TN) 92.84%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  94.62%
 1     Negative Recall  99.99%
 2  Positive Precision  99.91%
 3  Negative Precision  99.59%}

In [8]:
# K-means with two clusters and Mahalanobis distance on the shuttle data.
kmeans = AnomalyDetector(model="kmeans", metric="mahalanobis", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_kmeans, distances_shuttle_kmeans
)



In [9]:
# Report all evaluation metrics for k-means (Mahalanobis) on the shuttle data.
evaluator_shuttle_kmeans.calculate_all_metrics()

{'accuracy': 0.9960893740961769,
 'outliers_accuracy': 0.9461691825690687,
 'precision': 0.9990977443609023,
 'recall': 0.9461691825690687,
 'precision_recall_curve': (array([0.0715115 , 0.47780041, 1.        ]),
  array([1.        , 0.33409285, 0.        ])),
 'auc_pr': 0.4297566379439553,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.77%  False Negative (FN) 0.38%
 1  Negative Class  False Positive (FP) 0.01%  True Negative (TN) 92.84%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  94.62%
 1     Negative Recall  99.99%
 2  Positive Precision  99.91%
 3  Negative Precision  99.59%}

In [10]:
# NOTE(review): re-runs fit_predict on whichever `kmeans` detector is currently bound
# (the Mahalanobis one when cells run top to bottom) and rebinds the label/distance
# names to the raw, untransformed outputs. The next k-means cell rebinds both names
# again, so this result is not used there — looks like a leftover experiment; confirm
# and remove.
labels_shuttle_kmeans, distances_shuttle_kmeans = kmeans.fit_predict(data=shuttle_data)



In [11]:
# K-means with two clusters and Manhattan distance on the shuttle data.
kmeans = AnomalyDetector(model="kmeans", metric="manhattan", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_kmeans = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_kmeans, distances_shuttle_kmeans
)



In [12]:
# Report all evaluation metrics for k-means (Manhattan) on the shuttle data.
evaluator_shuttle_kmeans.calculate_all_metrics()

{'accuracy': 0.9960690062529278,
 'outliers_accuracy': 0.9458843634292224,
 'precision': 0.9990974729241877,
 'recall': 0.9458843634292224,
 'precision_recall_curve': (array([0.0715115 , 0.58044807, 1.        ]),
  array([1.        , 0.40586727, 0.        ])),
 'auc_pr': 0.5144013302357626,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.76%  False Negative (FN) 0.39%
 1  Negative Class  False Positive (FP) 0.01%  True Negative (TN) 92.84%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  94.59%
 1     Negative Recall  99.99%
 2  Positive Precision  99.91%
 3  Negative Precision  99.58%}

In [13]:
# DBSCAN with Euclidean distance on the shuttle data.
dbscan = AnomalyDetector(model="dbscan", metric="euclidean")
raw_labels, raw_distances = dbscan.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_dbscan = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_dbscan = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_dbscan, distances_shuttle_dbscan
)


In [14]:
# Report all evaluation metrics for DBSCAN (Euclidean) on the shuttle data.
evaluator_shuttle_dbscan.calculate_all_metrics()

{'accuracy': 0.9939711183982728,
 'outliers_accuracy': 0.9584164055824551,
 'precision': 0.957325746799431,
 'recall': 0.9584164055824551,
 'precision_recall_curve': (array([0.0715115 , 0.21868625, 1.        ]),
  array([1.        , 0.15266306, 0.        ])),
 'auc_pr': 0.215971821566783,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 6.85%  False Negative (FN) 0.30%
 1  Negative Class  False Positive (FP) 0.31%  True Negative (TN) 92.54%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall  95.84%
 1     Negative Recall  99.67%
 2  Positive Precision  95.73%
 3  Negative Precision  99.68%}

In [15]:
# DBSCAN with Mahalanobis distance on the shuttle data.
# NOTE: the recorded run of this cell was interrupted by hand; its traceback shows the
# Mahalanobis metric being called as a Python function from scikit-learn's ball-tree
# radius query, so expect a long runtime on the full data set.
dbscan = AnomalyDetector(model="dbscan", metric="mahalanobis")
raw_labels, raw_distances = dbscan.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_dbscan = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_dbscan = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_dbscan, distances_shuttle_dbscan
)

KeyboardInterrupt: 

Exception ignored in: 'sklearn.neighbors._ball_tree.BinaryTree._query_radius_single'
Traceback (most recent call last):
  File "sklearn/neighbors/_binary_tree.pxi", line 1021, in sklearn.neighbors._ball_tree.BinaryTree.rdist
  File "sklearn/metrics/_dist_metrics.pyx", line 328, in sklearn.metrics._dist_metrics.DistanceMetric.rdist
  File "sklearn/metrics/_dist_metrics.pyx", line 2762, in sklearn.metrics._dist_metrics.PyFuncDistance.dist
  File "sklearn/metrics/_dist_metrics.pyx", line 2774, in sklearn.metrics._dist_metrics.PyFuncDistance._dist
  File "/home/jbudzins/zum_24l/anomaly_detector.py", line 195, in _mahalanobis_metric
    def _mahalanobis_metric(self, u, v):
KeyboardInterrupt: 


In [None]:
# Report all evaluation metrics for DBSCAN (Mahalanobis) on the shuttle data.
# NOTE(review): if the DBSCAN (Mahalanobis) cell above did not run to completion,
# `evaluator_shuttle_dbscan` still refers to the earlier Euclidean run and this cell
# silently reports those metrics instead.
evaluator_shuttle_dbscan.calculate_all_metrics()

In [None]:
# DBSCAN with Manhattan distance on the shuttle data.
dbscan = AnomalyDetector(model="dbscan", metric="manhattan")
raw_labels, raw_distances = dbscan.fit_predict(data=shuttle_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_shuttle_dbscan = AnomalyDetector.transform_labels(raw_labels)
distances_shuttle_dbscan = AnomalyDetector.transform_distances(raw_distances)
evaluator_shuttle_dbscan = AnomalyDetectorEvaluator(
    shuttle_eval, labels_shuttle_dbscan, distances_shuttle_dbscan
)

In [None]:
# Report all evaluation metrics for DBSCAN (Manhattan) on the shuttle data.
evaluator_shuttle_dbscan.calculate_all_metrics()

In [None]:

batch_size = 10000

all_labels = []
all_distances = []

# Agglomerative clustering (Euclidean) is fitted separately on consecutive row batches
# of `batch_size` rows; the transformed per-batch outputs are appended in row order.
# NOTE(review): every batch gets its own independent 2-cluster model — confirm that
# AnomalyDetector.transform_labels yields labels that are comparable across batches.
for start in range(0, len(shuttle_data), batch_size):
    end = start + batch_size
    # .iloc makes the positional row slicing explicit (the last batch may be shorter).
    batch_data = shuttle_data.iloc[start:end]

    agglomerative = AnomalyDetector(model="agglomerative", metric="euclidean", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(shuttle_eval, all_labels, all_distances)
# Report the metrics in this cell: the next agglomerative cell rebinds
# `evaluator_shuttle_agglomerative`, so otherwise the Euclidean results are never shown.
evaluator_shuttle_agglomerative.calculate_all_metrics()

In [None]:

batch_size = 10000

all_labels = []
all_distances = []

# Agglomerative clustering (Mahalanobis) is fitted separately on consecutive row
# batches; the transformed per-batch outputs are accumulated in row order.
for batch_start in range(0, len(shuttle_data), batch_size):
    batch_data = shuttle_data[batch_start:batch_start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", metric="mahalanobis", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels += list(AnomalyDetector.transform_labels(labels_batch))
    all_distances += list(AnomalyDetector.transform_distances(distances_batch))

# Turn the accumulated Python lists into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(
    shuttle_eval, all_labels, all_distances
)

In [None]:
# Report all evaluation metrics for the batched agglomerative run on the shuttle data.
# When cells run top to bottom, the evaluator bound here is the one built by the
# Mahalanobis cell directly above.
evaluator_shuttle_agglomerative.calculate_all_metrics()

In [None]:
batch_size = 10000

all_labels = []
all_distances = []

# Agglomerative clustering (Manhattan) is fitted separately on consecutive row batches
# of `batch_size` rows; the transformed per-batch outputs are appended in row order.
# NOTE(review): every batch gets its own independent 2-cluster model — confirm that
# AnomalyDetector.transform_labels yields labels that are comparable across batches.
for start in range(0, len(shuttle_data), batch_size):
    end = start + batch_size
    # .iloc makes the positional row slicing explicit (the last batch may be shorter).
    batch_data = shuttle_data.iloc[start:end]

    agglomerative = AnomalyDetector(model="agglomerative", metric="manhattan", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels.extend(AnomalyDetector.transform_labels(labels_batch))
    all_distances.extend(AnomalyDetector.transform_distances(distances_batch))

all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(shuttle_eval, all_labels, all_distances)
# Report the metrics in this cell: no other cell evaluates the Manhattan run.
evaluator_shuttle_agglomerative.calculate_all_metrics()

In [None]:
# # Disabled: on the full data set this runs for an unknown amount of time, and its purpose is unclear.
# agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
# labels_shuttle_agglomerative, distances_shuttle_agglomerative = agglomerative.fit_predict(data=shuttle_data)
# labels_shuttle_agglomerative = AnomalyDetector.transform_labels(labels_shuttle_agglomerative)
# distances_shuttle_agglomerative = AnomalyDetector.transform_distances(distances_shuttle_agglomerative)
# evaluator_shuttle_agglomerative = AnomalyDetectorEvaluator(shuttle_eval, labels_shuttle_agglomerative, distances_shuttle_agglomerative)

# Zbiór danych HTTP 

In [5]:
# K-means with two clusters and Euclidean distance on the HTTP data.
kmeans = AnomalyDetector(model="kmeans", metric="euclidean", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=http_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_http_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_http_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval, labels_http_kmeans, distances_http_kmeans
)

In [6]:
# Report all evaluation metrics for k-means (Euclidean) on the HTTP data.
evaluator_http_kmeans.calculate_all_metrics()

{'accuracy': 0.990933888753793,
 'outliers_accuracy': 0.014925373134328358,
 'precision': 0.011,
 'recall': 0.014925373134328358,
 'precision_recall_curve': (array([0.00389605, 0.0779262 , 1.        ]),
  array([1., 1., 0.])),
 'auc_pr': 0.5389630987206147,
 'confusion_matrix_percentage':                          Positive Prediction        Negative Prediction
 0  Positive Class   True Positive (TP) 0.01%  False Negative (FN) 0.38%
 1  Negative Class  False Positive (FP) 0.52%  True Negative (TN) 99.09%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall   1.49%
 1     Negative Recall  99.48%
 2  Positive Precision   1.10%
 3  Negative Precision  99.61%}

In [7]:
# K-means with two clusters and Mahalanobis distance on the HTTP data.
kmeans = AnomalyDetector(model="kmeans", metric="mahalanobis", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=http_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_http_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_http_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval, labels_http_kmeans, distances_http_kmeans
)

In [8]:
# Report all evaluation metrics for k-means (Mahalanobis) on the HTTP data.
evaluator_http_kmeans.calculate_all_metrics()

{'accuracy': 0.5669641126488552,
 'outliers_accuracy': 0.005427408412483039,
 'precision': 4.926917392018394e-05,
 'recall': 0.005427408412483039,
 'precision_recall_curve': (array([0.00389605, 0.0779207 , 1.        ]),
  array([1., 1., 0.])),
 'auc_pr': 0.5389603524229075,
 'confusion_matrix_percentage':                           Positive Prediction        Negative Prediction
 0  Positive Class    True Positive (TP) 0.00%  False Negative (FN) 0.39%
 1  Negative Class  False Positive (FP) 42.92%  True Negative (TN) 56.69%,
 'imbalanced_metrics':                Metric   Value
 0     Positive Recall   0.54%
 1     Negative Recall  56.92%
 2  Positive Precision   0.00%
 3  Negative Precision  99.32%}

In [9]:
# K-means with two clusters and Manhattan distance on the HTTP data.
kmeans = AnomalyDetector(model="kmeans", metric="manhattan", n_clusters=2)
raw_labels, raw_distances = kmeans.fit_predict(data=http_data)

# Post-process the raw detector outputs before handing them to the evaluator.
labels_http_kmeans = AnomalyDetector.transform_labels(raw_labels)
distances_http_kmeans = AnomalyDetector.transform_distances(raw_distances)
evaluator_http_kmeans = AnomalyDetectorEvaluator(
    http_eval, labels_http_kmeans, distances_http_kmeans
)

In [10]:
# Report all evaluation metrics for k-means (Manhattan) on the HTTP data.
# NOTE(review): in the recorded outputs the recall jumps from under 1% (Mahalanobis run)
# to over 99% (this run) while accuracy stays almost the same. That pattern may mean
# the cluster treated as "anomalous" depends on arbitrary cluster numbering — confirm
# against AnomalyDetector.transform_labels.
evaluator_http_kmeans.calculate_all_metrics()

{'accuracy': 0.5622451532868838,
 'outliers_accuracy': 0.9954771596562642,
 'precision': 0.008782360264308743,
 'recall': 0.9954771596562642,
 'precision_recall_curve': (array([0.00389605, 0.07792345, 1.        ]),
  array([1., 1., 0.])),
 'auc_pr': 0.5389617255233665,
 'confusion_matrix_percentage':                           Positive Prediction        Negative Prediction
 0  Positive Class    True Positive (TP) 0.39%  False Negative (FN) 0.00%
 1  Negative Class  False Positive (FP) 43.77%  True Negative (TN) 55.84%,
 'imbalanced_metrics':                Metric    Value
 0     Positive Recall   99.55%
 1     Negative Recall   56.06%
 2  Positive Precision    0.88%
 3  Negative Precision  100.00%}

In [None]:
%load_ext memory_profiler

In [1]:
# DBSCAN with Euclidean distance on the HTTP data.
# Requires the import and data-loading cells to have been run in this kernel.
dbscan = AnomalyDetector(model="dbscan", metric="euclidean")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)
# Transform the DBSCAN outputs themselves. The earlier version re-transformed the
# k-means arrays here, leaving the evaluator with raw, untransformed DBSCAN results.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)
evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)

NameError: name 'AnomalyDetector' is not defined

In [None]:
# Report all evaluation metrics for DBSCAN (Euclidean) on the HTTP data.
evaluator_http_dbscan.calculate_all_metrics()

In [None]:
# DBSCAN with Mahalanobis distance on the HTTP data.
dbscan = AnomalyDetector(model="dbscan", metric="mahalanobis")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)
# Transform the DBSCAN outputs themselves. The earlier version re-transformed the
# k-means arrays here, leaving the evaluator with raw, untransformed DBSCAN results.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)
evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)

In [None]:
# Report all evaluation metrics for DBSCAN (Mahalanobis) on the HTTP data.
evaluator_http_dbscan.calculate_all_metrics()

In [None]:
# DBSCAN with Manhattan distance on the HTTP data.
dbscan = AnomalyDetector(model="dbscan", metric="manhattan")
labels_http_dbscan, distances_http_dbscan = dbscan.fit_predict(data=http_data)
# Transform the DBSCAN outputs themselves. The earlier version re-transformed the
# k-means arrays here, leaving the evaluator with raw, untransformed DBSCAN results.
labels_http_dbscan = AnomalyDetector.transform_labels(labels_http_dbscan)
distances_http_dbscan = AnomalyDetector.transform_distances(distances_http_dbscan)
evaluator_http_dbscan = AnomalyDetectorEvaluator(http_eval, labels_http_dbscan, distances_http_dbscan)

In [None]:
# Report all evaluation metrics for DBSCAN (Manhattan) on the HTTP data.
evaluator_http_dbscan.calculate_all_metrics()

In [None]:
# agglomerative = AnomalyDetector(model="agglomerative", n_clusters=2)
# labels_http_agglomerative, distances_http_agglomerative = agglomerative.fit_predict(data=http_data)
# evaluator_http_agglomerative = AnomalyDetectorEvaluator(http_eval, labels_http_agglomerative, distances_http_agglomerative)
# accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
# recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
# precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
# auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

In [None]:
batch_size = 20000

all_labels = []
all_distances = []

# Agglomerative clustering (Euclidean) is fitted separately on consecutive row
# batches; the transformed per-batch outputs are accumulated in row order.
for batch_start in range(0, len(http_data), batch_size):
    batch_data = http_data[batch_start:batch_start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", metric="euclidean", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels += list(AnomalyDetector.transform_labels(labels_batch))
    all_distances += list(AnomalyDetector.transform_distances(distances_batch))

# Turn the accumulated Python lists into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(
    http_eval, all_labels, all_distances
)
# accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
# recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
# precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
# auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

# accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

In [None]:
# Report all evaluation metrics for batched agglomerative clustering (Euclidean) on the HTTP data.
evaluator_http_agglomerative.calculate_all_metrics()

In [None]:
batch_size = 20000

all_labels = []
all_distances = []

# Agglomerative clustering (Mahalanobis) is fitted separately on consecutive row
# batches; the transformed per-batch outputs are accumulated in row order.
for batch_start in range(0, len(http_data), batch_size):
    batch_data = http_data[batch_start:batch_start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", metric="mahalanobis", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels += list(AnomalyDetector.transform_labels(labels_batch))
    all_distances += list(AnomalyDetector.transform_distances(distances_batch))

# Turn the accumulated Python lists into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(
    http_eval, all_labels, all_distances
)
# accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
# recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
# precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
# auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

# accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

In [None]:
# Report all evaluation metrics for batched agglomerative clustering (Mahalanobis) on the HTTP data.
evaluator_http_agglomerative.calculate_all_metrics()

In [None]:
batch_size = 20000

all_labels = []
all_distances = []

# Agglomerative clustering (Manhattan) is fitted separately on consecutive row
# batches; the transformed per-batch outputs are accumulated in row order.
for batch_start in range(0, len(http_data), batch_size):
    batch_data = http_data[batch_start:batch_start + batch_size]

    agglomerative = AnomalyDetector(model="agglomerative", metric="manhattan", n_clusters=2)
    labels_batch, distances_batch = agglomerative.fit_predict(data=batch_data)

    all_labels += list(AnomalyDetector.transform_labels(labels_batch))
    all_distances += list(AnomalyDetector.transform_distances(distances_batch))

# Turn the accumulated Python lists into arrays for the evaluator.
all_labels = np.array(all_labels)
all_distances = np.array(all_distances)

evaluator_http_agglomerative = AnomalyDetectorEvaluator(
    http_eval, all_labels, all_distances
)
# accuracy_http_agglomerative = evaluator_http_agglomerative.calculate_accuracy()
# recall_http_agglomerative = evaluator_http_agglomerative.calculate_recall()
# precision_http_agglomerative = evaluator_http_agglomerative.calculate_precision()
# auc_pr_http_agglomerative = evaluator_http_agglomerative.calculate_auc_pr()

# accuracy_http_agglomerative, recall_http_agglomerative, precision_http_agglomerative, auc_pr_http_agglomerative

In [None]:
# Report all evaluation metrics for batched agglomerative clustering (Manhattan) on the HTTP data.
evaluator_http_agglomerative.calculate_all_metrics()

In [None]:
# Display the labels accumulated by the most recently executed batched cell.
# NOTE(review): looks like a debugging leftover — consider removing before sharing.
all_labels

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score

import data_prep
from ad_one_class import OneClassAnnomalyDetector
from anomaly_detector import AnomalyDetector
from metrics import AnomalyDetectorEvaluator

In [None]:
# Read the shuttle CSVs as plain arrays and split them with the project helper.
X, y = (
    pd.read_csv(csv_path, header=0).values
    for csv_path in ("shuttle_train.csv", "shuttle_eval.csv")
)
Xtrain, ytrain, Xtest, ytest = data_prep.split_binary_dataset(X, y)

# Fit the isolation forest on the training part, logging progress and the training shape.
print("Fitting Isolation Forest...")
print(f"Shape of Xtrain: {Xtrain.shape}")
isolation_forest = OneClassAnnomalyDetector(model_name="isolationforest")
isolation_forest.fit(Xtrain)

# Predict on the test part.
print("Fitted. Predicting...")
ypred_forest = isolation_forest.predict(Xtest)



In [None]:
# Report all evaluation metrics for the isolation forest predictions on the test split.
# NOTE(review): the predictions are passed as both the second and third argument; in the
# clustering cells the third argument carries transformed distances — confirm that the
# score-based metrics are meaningful when they receive the predictions themselves.
AnomalyDetectorEvaluator(ytest, ypred_forest, ypred_forest).calculate_all_metrics()

In [None]:


# Fit the one-class SVM on the training split (reuses Xtrain/Xtest from the split above).
print("Fitting SVM...")
# Log the shape of the training matrix.
print(f"Shape of Xtrain: {Xtrain.shape}")
svm = OneClassAnnomalyDetector(model_name = "oneclasssvm")
svm.fit(Xtrain)
print("Fitted. Predicting...")
# Predict on the test split.
ypred_svm = svm.predict(Xtest)

In [None]:
# Compare the mean of the SVM predictions with the mean of the test labels;
# each value is printed on the line below its caption.
print("Outliers predicted fraction:", sum(ypred_svm) / len(ypred_svm), sep="\n")
print("Outliers test fraction:", sum(ytest) / len(ytest), sep="\n")

In [None]:
# Report all evaluation metrics for the one-class SVM predictions on the test split.
# NOTE(review): the predictions are passed as both the second and third argument; in the
# clustering cells the third argument carries transformed distances — confirm that the
# score-based metrics are meaningful when they receive the predictions themselves.
AnomalyDetectorEvaluator(ytest, ypred_svm, ypred_svm).calculate_all_metrics()