In [24]:
import numpy as np

def aom(scores, n_buckets, n_estimators, standard=True):
    '''
    Average of Maximum - An ensemble method for outlier detection
    Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms
    for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47.
    :param scores:
    :param n_buckets:
    :param n_estimators:
    :param standard:
    :return:
    '''
    scores = np.asarray(scores)
    if scores.shape[1] != n_estimators:
        raise ValueError('score matrix should be n_samples by n_estimaters')

    scores_aom = np.zeros([scores.shape[0], n_buckets])

    n_estimators_per_bucket = int(n_estimators / n_buckets)
    if n_estimators % n_buckets != 0:
        Warning('n_estimators / n_buckets leads to a remainder')

    # shuffle the estimator order
    estimators_list = list(range(0, n_estimators, 1))
    np.random.shuffle(estimators_list)

    head = 0
    for i in range(0, n_estimators, n_estimators_per_bucket):
        tail = i + n_estimators_per_bucket
        batch_ind = int(i / n_estimators_per_bucket)

        scores_aom[:, batch_ind] = np.max(
            scores[:, estimators_list[head:tail]], axis=1)

        head = head + n_estimators_per_bucket
        tail = tail + n_estimators_per_bucket

    return np.mean(scores_aom, axis=1)


def moa(scores, n_buckets, n_estimators):
    '''
    Maximum of Average - An ensemble method for outlier detection
    Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms
    for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47.
    :param scores:
    :param n_buckets:
    :param n_estimators:
    :param standard:
    :return:
    '''
    scores = np.asarray(scores)
    if scores.shape[1] != n_estimators:
        raise ValueError('score matrix should be n_samples by n_estimaters')

    scores_moa = np.zeros([scores.shape[0], n_buckets])

    n_estimators_per_bucket = int(n_estimators / n_buckets)
    if n_estimators % n_buckets != 0:
        Warning('n_estimators / n_buckets leads to a remainder')

    # shuffle the estimator order
    estimators_list = list(range(0, n_estimators, 1))
    np.random.shuffle(estimators_list)

    head = 0
    for i in range(0, n_estimators, n_estimators_per_bucket):
        tail = i + n_estimators_per_bucket
        batch_ind = int(i / n_estimators_per_bucket)

        scores_moa[:, batch_ind] = np.mean(
            scores[:, estimators_list[head:tail]], axis=1)

        head = head + n_estimators_per_bucket
        tail = tail + n_estimators_per_bucket

    return np.max(scores_moa, axis=1)

In [26]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
from sklearn.exceptions import NotFittedError
from scipy.stats import scoreatpercentile
from scipy.stats import rankdata
from scipy.special import erf


class Knn(object):
    """
    Knn class for outlier detection
    support original knn, average knn, and median knn
    """

    def __init__(self, n_neighbors=1, contamination=0.05, method='largest'):
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.method = method

    def fit(self, X_train):
        self.X_train = X_train
        self._isfitted = True
        self.tree = KDTree(X_train)

        neigh = NearestNeighbors(n_neighbors=self.n_neighbors)
        neigh.fit(self.X_train)

        result = neigh.kneighbors(n_neighbors=self.n_neighbors,
                                  return_distance=True)
        dist_arr = result[0]

        if self.method == 'largest':
            dist = dist_arr[:, -1]
        elif self.method == 'mean':
            dist = np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            dist = np.median(dist_arr, axis=1)

        self.threshold = scoreatpercentile(dist, 100 * (1 - self.contamination))
        self.decision_scores = dist.ravel()
        self.y_pred = (self.decision_scores > self.threshold).astype('int')

        self.mu = np.mean(self.decision_scores)
        self.sigma = np.std(self.decision_scores)

    def decision_function(self, X_test):

        if not self._isfitted:
            NotFittedError('Knn is not fitted yet')

        # initialize the output score
        pred_score = np.zeros([X_test.shape[0], 1])

        for i in range(X_test.shape[0]):
            x_i = X_test[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, ind_arr = self.tree.query(x_i, k=self.n_neighbors)

            if self.method == 'largest':
                dist = dist_arr[:, -1]
            elif self.method == 'mean':
                dist = np.mean(dist_arr, axis=1)
            elif self.method == 'median':
                dist = np.median(dist_arr, axis=1)

            pred_score_i = dist[-1]

            # record the current item
            pred_score[i, :] = pred_score_i

        return pred_score

    def predict(self, X_test):
        pred_score = self.decision_function(X_test)
        return (pred_score > self.threshold).astype('int')

    def predict_proba(self, X_test, method='linear'):
        test_scores = self.decision_function(X_test)
        train_scores = self.decision_scores

        if method == 'linear':
            scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
            proba = scaler.transform(test_scores.reshape(-1, 1))
            return proba.clip(0, 1)
        else:
            # turn output into probability
            pre_erf_score = (test_scores - self.mu) / (self.sigma * np.sqrt(2))
            erf_score = erf(pre_erf_score)
            proba = erf_score.clip(0)

            # TODO: move to testing code
            assert (proba.min() >= 0)
            assert (proba.max() <= 1)
            return proba

    def predict_rank(self, X_test):
        test_scores = self.decision_function(X_test)
        train_scores = self.decision_scores

        ranks = np.zeros([X_test.shape[0], 1])

        for i in range(test_scores.shape[0]):
            train_scores_i = np.append(train_scores.reshape(-1, 1),
                                       test_scores[i])

            ranks[i] = rankdata(train_scores_i)[-1]

        # return normalized ranks
        ranks_norm = ranks / ranks.max()
        return ranks_norm

##############################################################################
samples = [[-1, 0], [0., 0.], [1., 1], [2., 5.], [3, 1]]

clf = Knn()
clf.fit(samples)

scores = clf.decision_function(np.asarray([[2, 3], [6, 8]])).ravel()
assert (scores[0] == [2])
assert (scores[1] == [5])
#
labels = clf.predict(np.asarray([[2, 3], [6, 8]])).ravel()
assert (labels[0] == [0])
assert (labels[1] == [1])


AttributeError: 'list' object has no attribute 'size'