In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from kdelearn.kde_tasks import KDEOutliersDetection
from kdelearn.bandwidth_selection import direct_plugin, normal_reference
from kdelearn.metrics import pi_kf
from sklearn.neighbors import NearestNeighbors

In [2]:
# Shell escape: list the dataset directory to confirm which file is available.
!ls ../../data/mulcross/

phpGGVhl9.arff


In [3]:
# The file is ARFF; the first 7 lines (presumably the ARFF header) are skipped
# and the remaining rows are parsed as plain CSV with explicit column names.
data = pd.read_csv(
    "../../data/mulcross/phpGGVhl9.arff",
    names=["v1", "v2", "v3", "v4", "target"],
    skiprows=7,
)

In [4]:
# Encode the quoted class names as integers: 0 = normal, 1 = anomaly.
# Rows already holding integers match neither string, so re-running is a no-op.
for raw_label, code in (("'Normal'", 0), ("'Anomaly'", 1)):
    data.loc[data["target"] == raw_label, "target"] = code
data["target"] = data["target"].astype(int)

In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
data.shape

(262144, 5)

In [6]:
# Split the frame into the feature matrix and the ground-truth label vector.
feature_columns = ["v1", "v2", "v3", "v4"]
x_train = data[feature_columns].to_numpy()
labels_train = data["target"].to_numpy()

# One hard-coded bandwidth per feature, passed to pi_kf below.
# NOTE(review): presumably precomputed with a kdelearn bandwidth selector — confirm.
bandwidth = np.array([0.02687994, 0.02610197, 0.02315374, 0.01958707])

# r=0.1

In [7]:
# k-NN distance outlier detection: score each sample by the largest distance
# among its 11 nearest neighbours, then flag the top `r` fraction of scores.
# The query set equals the fitted set, so each sample is presumably its own
# nearest neighbour (distance 0) — TODO confirm k=11 is intended.
r = 0.1

nbrs = NearestNeighbors(algorithm="brute", n_neighbors=11)
nbrs.fit(x_train)
distances, indices = nbrs.kneighbors(x_train)
dist_k = distances.max(axis=1)

threshold = np.quantile(dist_k, 1 - r)
labels_pred = np.where(dist_k >= threshold, 1, 0)

# pi_kf comes from kdelearn.metrics; its exact definition is not shown here.
metric = pi_kf(x_train, labels_pred, bandwidth=bandwidth)
print(f"Wskaźnik jakości: {metric}")  # label is Polish for "Quality index"

Wskaźnik jakości: 1.0026581828488323


In [8]:
# Number of true anomalies vs. number of samples flagged as outliers.
np.sum(labels_train), np.sum(labels_pred)

(26214, 26215)

In [9]:
# Cross-tabulate the ground-truth labels against the k-NN predictions.
res = confusion_matrix(y_true=labels_train, y_pred=labels_pred)
res

array([[209715,  26215],
       [ 26214,      0]])

In [10]:
# Unpack the 2x2 confusion matrix (rows: true class, columns: predicted class).
tn, fp, fn, tp = res.ravel()
print(f"precision: {tp / (fp + tp)}")
print(f"recall: {tp / (fn + tp)}")
print(f"accuracy: {(tn + tp) / res.sum()}")

precision: 0.0
recall: 0.0
accuracy: 0.7999992370605469


# r=0.01

In [11]:
# Flag the top `r` fraction of samples by k-NN distance as outliers.
# The per-sample score `dist_k` depends only on x_train and k, not on r, and
# x_train has not changed, so the score computed by the earlier r=0.1 cell is
# reused here instead of repeating the brute-force neighbour search over the
# whole data set (which would yield the identical array).
r = 0.01
threshold = np.quantile(dist_k, 1 - r)
labels_pred = np.where(dist_k >= threshold, 1, 0)

# pi_kf comes from kdelearn.metrics; its exact definition is not shown here.
metric = pi_kf(x_train, labels_pred, bandwidth=bandwidth)

print(f"Wskaźnik jakości: {metric}")  # label is Polish for "Quality index"

Wskaźnik jakości: 1.0000567314291076


In [12]:
# Number of true anomalies vs. number of samples flagged as outliers.
np.sum(labels_train), np.sum(labels_pred)

(26214, 2622)

In [13]:
# Cross-tabulate the ground-truth labels against the k-NN predictions.
res = confusion_matrix(y_true=labels_train, y_pred=labels_pred)
res

array([[233308,   2622],
       [ 26214,      0]])

In [14]:
# Unpack the 2x2 confusion matrix (rows: true class, columns: predicted class).
tn, fp, fn, tp = res.ravel()
print(f"precision: {tp / (fp + tp)}")
print(f"recall: {tp / (fn + tp)}")
print(f"accuracy: {(tn + tp) / res.sum()}")

precision: 0.0
recall: 0.0
accuracy: 0.8899993896484375


# r=0.05

In [15]:
# Flag the top `r` fraction of samples by k-NN distance as outliers.
# The per-sample score `dist_k` depends only on x_train and k, not on r, and
# x_train has not changed, so the score computed by the earlier r=0.1 cell is
# reused here instead of repeating the brute-force neighbour search over the
# whole data set (which would yield the identical array).
r = 0.05
threshold = np.quantile(dist_k, 1 - r)
labels_pred = np.where(dist_k >= threshold, 1, 0)

# pi_kf comes from kdelearn.metrics; its exact definition is not shown here.
metric = pi_kf(x_train, labels_pred, bandwidth=bandwidth)

print(f"Wskaźnik jakości: {metric}")  # label is Polish for "Quality index"

Wskaźnik jakości: 1.0008375667477691


In [16]:
# Number of true anomalies vs. number of samples flagged as outliers.
np.sum(labels_train), np.sum(labels_pred)

(26214, 13108)

In [17]:
# Cross-tabulate the ground-truth labels against the k-NN predictions.
res = confusion_matrix(y_true=labels_train, y_pred=labels_pred)
res

array([[222822,  13108],
       [ 26214,      0]])

In [18]:
# Unpack the 2x2 confusion matrix (rows: true class, columns: predicted class).
tn, fp, fn, tp = res.ravel()
print(f"precision: {tp / (fp + tp)}")
print(f"recall: {tp / (fn + tp)}")
print(f"accuracy: {(tn + tp) / res.sum()}")

precision: 0.0
recall: 0.0
accuracy: 0.8499984741210938


# r=0.2

In [19]:
# Flag the top `r` fraction of samples by k-NN distance as outliers.
# The per-sample score `dist_k` depends only on x_train and k, not on r, and
# x_train has not changed, so the score computed by the earlier r=0.1 cell is
# reused here instead of repeating the brute-force neighbour search over the
# whole data set (which would yield the identical array).
r = 0.2
threshold = np.quantile(dist_k, 1 - r)
labels_pred = np.where(dist_k >= threshold, 1, 0)

# pi_kf comes from kdelearn.metrics; its exact definition is not shown here.
metric = pi_kf(x_train, labels_pred, bandwidth=bandwidth)

print(f"Wskaźnik jakości: {metric}")  # label is Polish for "Quality index"

Wskaźnik jakości: 1.0054481253661567


In [20]:
# Number of true anomalies vs. number of samples flagged as outliers.
np.sum(labels_train), np.sum(labels_pred)

(26214, 52429)

In [21]:
# Cross-tabulate the ground-truth labels against the k-NN predictions.
res = confusion_matrix(y_true=labels_train, y_pred=labels_pred)
res

array([[183502,  52428],
       [ 26213,      1]])

In [22]:
# Unpack the 2x2 confusion matrix (rows: true class, columns: predicted class).
tn, fp, fn, tp = res.ravel()
print(f"precision: {tp / (fp + tp)}")
print(f"recall: {tp / (fn + tp)}")
print(f"accuracy: {(tn + tp) / res.sum()}")

precision: 1.9073413568826414e-05
recall: 3.8147554741741054e-05
accuracy: 0.7000083923339844
