<a href="https://colab.research.google.com/github/hamagami/anomaly-detection/blob/main/04_03_ABOD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ABOD
角度に基づく密度推定によって外れを調べる方法です。
KNNやLOFは次元が増えると距離の意味がなくなってくるのに対し，角度に基づくABODは高次元でも妥当な外れ値を見つけることが可能です。一方で計算量が多いため，いくつかの高速化の試みがあります。

In [1]:
import numpy as np
import itertools
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

FastABOD
http://ni4muraano.hatenablog.com/entry/2017/11/14/193000

In [8]:
class FastABOD:
    """Angle-based outlier detection restricted to each point's nearest neighbours.

    For every point an angle-based outlier factor (ABOF) is computed from the
    pairs formed by its nearest neighbours rather than from all pairs in the
    data set. Points with the lowest ABOF are reported as anomalies.
    """

    def __init__(self, n_neighbors):
        """Store the neighbourhood size.

        Args:
            n_neighbors: Number of neighbours requested from
                ``NearestNeighbors`` for each point. The first returned
                neighbour is used as the pivot of the angles, so
                ``n_neighbors - 1`` points form the angle pairs and at least
                3 neighbours are needed.
        """
        self.n_neighbors = n_neighbors

    def fit_predict(self, X, contamination=0.1):
        """Label every row of ``X`` as normal (1) or anomalous (-1).

        Args:
            X: Array-like of shape (n_samples, n_features).
            contamination: Fraction of samples, in [0, 1], to flag as
                anomalies. The count is rounded to the nearest integer.

        Returns:
            ``np.int32`` array of length n_samples holding 1 for normal points
            and -1 for the points with the lowest ABOF, following the
            scikit-learn labelling convention.

        Raises:
            ValueError: If ``contamination`` is outside [0, 1] or fewer than
                3 neighbours per point are available.
        """
        # A negative fraction would turn into a negative slice bound below and
        # silently flag almost every point.
        if not 0 <= contamination <= 1:
            raise ValueError(
                f"contamination must be in [0, 1], got {contamination!r}"
            )
        X = np.asarray(X)

        # k nearest neighbours of every point, queried on the training data.
        k_nearest = NearestNeighbors(n_neighbors=self.n_neighbors).fit(X)
        _, indices = k_nearest.kneighbors(X)
        n_columns = indices.shape[1]
        if n_columns < 3:
            raise ValueError(
                "n_neighbors must be at least 3 to form a pair of neighbours "
                f"around each point, got {n_columns}"
            )

        # Column 0 is the pivot, so angle pairs are drawn from columns 1..k-1.
        combs = list(itertools.combinations(range(1, n_columns), 2))
        abofs = np.array(
            [self._compute_abof(X[neighbor_ids], combs) for neighbor_ids in indices]
        )

        # The lowest-scoring fraction of points is treated as anomalous.
        n_anomalies = int(len(abofs) * contamination + 0.5)
        anomaly_indices = np.argsort(abofs)[:n_anomalies]

        prediction = np.ones(len(abofs), dtype=np.int32)
        prediction[anomaly_indices] = -1
        return prediction

    def _compute_abof(self, x, combs):
        """Return the ABOF of the pivot ``x[0]`` over the given neighbour pairs.

        With ``AB`` and ``AC`` the vectors from the pivot to the two points of
        a pair, each pair contributes the value
        ``<AB, AC> / (|AB|**2 * |AC|**2)`` with weight ``1 / (|AB| * |AC|)``.
        The result is the weighted variance of those values.

        Args:
            x: Array of shape (k, n_features); row 0 is the pivot and the
                remaining rows are its neighbours.
            combs: Iterable of ``(i, j)`` row-index pairs into ``x``.

        Returns:
            The weighted variance as a float. Pairs that contain a point
            coinciding with the pivot are skipped because their angle is
            undefined. If no usable pair remains, ``np.inf`` is returned so
            the point sorts after every finite score.
        """
        x = np.asarray(x, dtype=float)
        pairs = np.asarray(list(combs), dtype=np.intp).reshape(-1, 2)
        first, second = pairs[:, 0], pairs[:, 1]

        offsets = x - x[0]
        norms = np.linalg.norm(offsets, axis=1)
        norm_products = norms[first] * norms[second]

        # Duplicates of the pivot have zero length; dividing by them would
        # turn the whole score into NaN.
        usable = norm_products > 0
        if not usable.any():
            return np.inf

        norm_products = norm_products[usable]
        dots = np.sum(offsets[first[usable]] * offsets[second[usable]], axis=1)

        weights = 1.0 / norm_products
        values = dots / norm_products ** 2

        total_weight = weights.sum()
        weighted_mean = (weights * values).sum() / total_weight
        weighted_mean_sq = (weights * values ** 2).sum() / total_weight
        return weighted_mean_sq - weighted_mean ** 2

データ分布例

In [3]:
def makedata1(dnum=1000):
    """Generate a 2-D sample drawn from a single normal distribution.

    Args:
        dnum: Number of points to draw. Defaults to 1000.

    Returns:
        Array of shape (dnum, 2) sampled from a bivariate normal distribution
        with mean (0, 0) and covariance [[1, 0.7], [0.7, 2]], using NumPy's
        global random state.
    """
    mean = np.array([0, 0])
    cov = np.array([[1, 0.7], [0.7, 2]])
    # multivariate_normal already returns one (x, y) row per sample, so no
    # split-and-restack of the coordinates is needed.
    return np.random.multivariate_normal(mean, cov, dnum)

In [4]:
def makedata2(dnum=100):
    """Generate a 2-D sample made of three normally distributed clusters.

    Args:
        dnum: Number of points drawn per cluster. Defaults to 100.

    Returns:
        Array of shape (3 * dnum, 2). Rows are ordered cluster by cluster and
        drawn with NumPy's global random state.
    """
    # (mean, covariance) of each cluster. A covariance matrix must be
    # symmetric, otherwise multivariate_normal warns and samples from a
    # different distribution than the one written down. The off-diagonal
    # entries of the second and third matrices are therefore the average of
    # the two mismatched values that were given before
    # (-0.3 / 0.7 and -0.1 / 1.0).
    clusters = (
        (np.array([0, -2]), np.array([[1, 0.7], [0.7, 2]])),
        (np.array([4, 4]), np.array([[1, 0.2], [0.2, 0.4]])),
        (np.array([-3, 5]), np.array([[0.8, 0.45], [0.45, 2]])),
    )
    return np.vstack(
        [np.random.multivariate_normal(mean, cov, dnum) for mean, cov in clusters]
    )

In [5]:

# Generate train data
#X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
#X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
#X = np.r_[X + 2, X - 2, X_outliers]


In [None]:
Xs = [makedata1(), makedata2()]

for X in Xs:
    # Treat 5% of each data set as anomalous.
    clf = FastABOD(n_neighbors=10)
    y_pred = clf.fit_predict(X, contamination=0.05)

    # Rows labelled -1 are the predicted outliers.
    predicted_outlier_index = np.where(y_pred == -1)
    predicted_outlier = X[predicted_outlier_index]

    # Every point as a dot, with the predicted outliers drawn on top as circles.
    plt.plot(X[:, 0], X[:, 1], ".")
    plt.plot(predicted_outlier[:, 0], predicted_outlier[:, 1], "o")
    plt.show()
