### Simple PyOD code example
This notebook shows a simple example of applying a PyOD model (LOF) to a real-world anomaly detection dataset (Annthyroid).

In [1]:
import os, sys
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pyod.models.lof import LOF
# Add the parent directory to sys.path (at position 1) before importing the
# project-local `utils` module -- presumably that is where `utils` lives.
p = os.path.abspath('..')
sys.path.insert(1, p)
from utils import import_dataset

In [3]:
# Load the dataset through the project helper.
df = import_dataset('../../data/Annthyroid_withoutdupl_norm_07.arff')

# One seed for every random step in this cell, so that Restart & Run All
# reproduces the same subsample and the same train/test split.
SEED = 10

# Subsample to at most N rows. The draw is seeded: without `random_state`
# every run would pick different rows and the reported numbers would drift.
N = 5000
if len(df) > N:
    df = df.sample(n=N, random_state=SEED)

# Extract features and target.
# NOTE(review): the positional slice assumes 'outlier' is the LAST column of
# the frame -- TODO confirm against import_dataset.
X = df.iloc[:, :-1]
y = df['outlier']

# Stratified 75/25 split so both parts keep the same outlier ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED
)

# Summary of the split.
print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

Ratio of outliers in training set: 0.07306666666666667
Ratio of outliers in test set: 0.0728
Training size: 3750
Test size: 1250


In [4]:
# Fit an LOF detector on the training features (labels are not passed to fit).
clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)

# Results for the training data, read back from the fitted model.
y_train_pred = clf.labels_                   # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_        # raw outlier scores
y_train_proba = clf.predict_proba(X_train)   # probability of outliers

# Results for the held-out test data.
y_test_pred = clf.predict(X_test)               # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)   # outlier scores
y_test_proba = clf.predict_proba(X_test)        # probabilities in the range of [0,1]

# --- Report: model configuration ---
print('Model instance:')
print('Parameters:', clf.get_params())

# --- Report: first five training samples ---
print("\nTraining set:")
print('Threshold:', clf.threshold_)
print('Labels:', y_train_pred[:5])
print('Scores:', y_train_scores[:5])
print('Probabilities:\n', y_train_proba[:5])

# --- Report: test samples 15..24 ---
test_window = slice(15, 25)
print("\nTest set")
print('Labels:', y_test_pred[test_window])
print('Scores:', y_test_scores[test_window])
print('Probabilities:\n', y_test_proba[test_window])

Model instance:
Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 1, 'n_neighbors': 20, 'novelty': True, 'p': 2}

Training set:
Threshold: 1.8853842582808247
Labels: [0 0 0 0 0]
Scores: [1.14344662 1.01508758 1.01111714 1.56998712 1.15525702]
Probabilities:
 [[0.98652094 0.01347906]
 [0.99514648 0.00485352]
 [0.99559484 0.00440516]
 [0.96185361 0.03814639]
 [0.98615844 0.01384156]]

Test set
Labels: [0 0 0 0 0 0 0 0 0 1]
Scores: [1.24108456 1.10130986 0.9769059  1.5563613  1.52652011 1.10223906
 1.10602042 0.99298255 0.96697086 5.14123119]
Probabilities:
 [[0.97959304 0.02040696]
 [0.98903298 0.01096702]
 [0.99743484 0.00256516]
 [0.95830023 0.04169977]
 [0.96031561 0.03968439]
 [0.98897023 0.01102977]
 [0.98871485 0.01128515]
 [0.99634907 0.00365093]
 [0.99810582 0.00189418]
 [0.7161893  0.2838107 ]]


In [5]:
from pyod.utils.data import evaluate_print

# Print the evaluation line for each split, scoring the raw outlier scores
# against the ground-truth labels (training split first, then test split).
for split_header, split_labels, split_scores in (
    ("\nOn Training Data:", y_train, y_train_scores),
    ("\nOn Test Data:", y_test, y_test_scores),
):
    print(split_header)
    evaluate_print(clf_name, split_labels, split_scores)


On Training Data:
LOF ROC:0.6235, precision @ rank n:0.0693

On Test Data:
LOF ROC:0.6502, precision @ rank n:0.033
