### Simple PyOD code example
This notebook illustrates a simple example of applying PyOD models to a real-world anomaly detection dataset.

In [None]:
# Shuttle dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
mat = loadmat('../../data/shuttle.mat')  # load mat-file
X = mat['X']
y = mat['y']
X = pd.DataFrame(X)
y = pd.DataFrame(y)
y = y[0]
N = 5000
if(len(X) > N):
    X = X.sample(n=N, ignore_index=True, random_state=1)
    y = y.sample(n=N, ignore_index=True, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y) # stratified split
# Reset indexes
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
#X_train_norm = X_train.copy()
#X_test_norm = X_test.copy()
#for col in X_train_norm.columns:
#    mu = X_train_norm[col].mean()
#    sig = X_train_norm[col].std()
#    X_train_norm[col] = (X_train_norm[col] - mu) / sig
#    X_test_norm[col] = (X_test_norm[col] - mu) / sig
#X_train = X_train_norm.copy()
#X_test = X_test_norm.copy()
print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

In [None]:
from pyod.models.cof import COF

# train COF detector
clf_name = 'COF'
clf = COF(contamination=0.2, n_neighbors=20, method='fast')
clf.fit(X_train)

# get the prediction label and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_proba = clf.predict_proba(X_train) # probability of outliers

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
y_test_proba = clf.predict_proba(X_test)  # probabilities in the range of [0,1]

# Model instance
print('Model instance:')
print('Parameters:',clf.get_params())
# Training set
print("\nTraining set:")
print('Threshold:',clf.threshold_)
print('Labels:',clf.labels_[:5])
print('Scores:',clf.decision_scores_[:5]) # y_train_scores[:5]
print('Probabilities:\n', y_train_proba[:5])
# Test set
print("\nTest set")
print('Labels:',y_test_pred[15:25])
print('Scores:',y_test_scores[15:25])
print('Probabilities:\n',y_test_proba[15:25])

In [None]:
from pyod.utils.data import evaluate_print

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)