Isolation-based anomaly detection using the nearest-neighbor ensembles (iNNE) method.

Bandaragoda, Tharindu R., et al. "Efficient anomaly detection by isolation using nearest neighbour ensemble."
2014 IEEE International Conference on Data Mining Workshop. IEEE, 2014.

Bandaragoda, Tharindu R., et al. "Isolation‐based anomaly detection using nearest‐neighbor ensembles."
Computational Intelligence 34.4 (2018): 968-998.

In [1]:
from helpers.inne_anomaly_detection import (
    prob_sample_inclusion, 
    min_subsample_size, 
    min_ensemble_size
)

In [2]:
# Dataset sizes (number of data points) for which the bounds below are tabulated.
N_range = [
    100,
    500,
    1_000,
    5_000,
    10_000,
    50_000,
    100_000,
    500_000,
    1_000_000,
]

In [3]:
# Smallest subsample size `psi` for each N, using the default ensemble size
# `t = 100` and `eps = 0.05`.
# That is, every sample has a probability of at least `1 - eps = 0.95` of
# being selected at least once.
for N in N_range:
    psi = min_subsample_size(N)
    print(f"N = {N:d}, psi = {psi:d}")


N = 100, psi = 3
N = 500, psi = 15
N = 1000, psi = 30
N = 5000, psi = 148
N = 10000, psi = 296
N = 50000, psi = 1476
N = 100000, psi = 2952
N = 500000, psi = 14757
N = 1000000, psi = 29514


In [4]:
# Smallest subsample size `psi` for each N, with ensemble size `t = 100` and
# the tighter tolerance `eps = 0.01`.
# That is, every sample has a probability of at least `1 - eps = 0.99` of
# being selected at least once.
for N in N_range:
    psi = min_subsample_size(N, eps=0.01)
    print(f"N = {N:d}, psi = {psi:d}")

N = 100, psi = 5
N = 500, psi = 23
N = 1000, psi = 46
N = 5000, psi = 226
N = 10000, psi = 451
N = 50000, psi = 2251
N = 100000, psi = 4501
N = 500000, psi = 22504
N = 1000000, psi = 45008


In [5]:
# Smallest ensemble size `t` needed for each N when the subsample size is
# fixed at `psi = 128` and `eps = 0.05`.
for N in N_range:
    t = min_ensemble_size(N, 128, eps=0.05)
    print(f"N = {N:d}, t = {t:d}")

N = 100, t = 1
N = 500, t = 11
N = 1000, t = 22
N = 5000, t = 116
N = 10000, t = 233
N = 50000, t = 1169
N = 100000, t = 2339
N = 500000, t = 11701
N = 1000000, t = 23403


In [6]:
# Smallest ensemble size `t` needed for each N when the subsample size is
# fixed at `psi = 256` and `eps = 0.05`.
for N in N_range:
    t = min_ensemble_size(N, 256, eps=0.05)
    print(f"N = {N:d}, t = {t:d}")

N = 100, t = 1
N = 500, t = 5
N = 1000, t = 11
N = 5000, t = 57
N = 10000, t = 116
N = 50000, t = 584
N = 100000, t = 1169
N = 500000, t = 5850
N = 1000000, t = 11701
