### Simple PyOD code example
This notebook illustrates a simple example of applying PyOD models to real-world anomaly detection dataset

In [3]:
# Shuttle dataset
import os
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pyod.models.lof import LOF

In [4]:
def import_dataset(filepath):
    """ Function that reads the KDDCup99 dataset and returns a dataframe.

    Args:
        filename (str): The name of the file

    Returns:
        (df): The dataframe with the data contents
    """

    # If file does not exist
    if not os.path.exists(filepath):
        raise FileNotFoundError("filepath %s does not exist" % filepath)

    # Load file to a df
    data = arff.loadarff(filepath)
    df = pd.DataFrame(data[0])
    df.outlier = df.outlier.str.decode("utf-8")
    df['outlier'] = df['outlier'].map({'yes':1,'no':0}) 
    if 'id' in df:
        del df['id']

    return df

# Import dataset
df = import_dataset('../../data/Annthyroid_withoutdupl_norm_07.arff')

# Subsample
Ν = 10000
if(len(df) > N):
    df = df.sample(n=N)

# Extract X, y
X  = df.iloc[:, :-1]
y = df['outlier']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=10)

print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))

NameError: name 'N' is not defined

In [10]:
# train COF detector
clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)

# get the prediction label and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_proba = clf.predict_proba(X_train) # probability of outliers

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
y_test_proba = clf.predict_proba(X_test)  # probabilities in the range of [0,1]

# Model instance
print('Model instance:')
print('Parameters:',clf.get_params())
# Training set
print("\nTraining set:")
print('Threshold:',clf.threshold_)
print('Labels:',clf.labels_[:5])
print('Scores:',clf.decision_scores_[:5]) # y_train_scores[:5]
print('Probabilities:\n', y_train_proba[:5])
# Test set
print("\nTest set")
print('Labels:',y_test_pred[15:25])
print('Scores:',y_test_scores[15:25])
print('Probabilities:\n',y_test_proba[15:25])

Model instance:
Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 1, 'n_neighbors': 20, 'novelty': True, 'p': 2}

Training set:
Threshold: 2.1306418198975448
Labels: [1 0 0 0 0]
Scores: [9.18603848 1.05351953 1.00606752 0.97321997 0.99649222]
Probabilities:
 [[0.49932156 0.50067844]
 [0.99307789 0.00692211]
 [0.99536482 0.00463518]
 [0.99754275 0.00245725]
 [0.9965391  0.0034609 ]]

Test set
Labels: [0 0 0 0 0 0 0 0 1 0]
Scores: [0.98757305 1.24186462 1.06956712 1.41208462 1.32574674 1.02622924
 1.00859827 1.13174932 3.17915913 1.03356339]
Probabilities:
 [[0.99669246 0.00330754]
 [0.97976652 0.02023348]
 [0.99123484 0.00876516]
 [0.96843647 0.03156353]
 [0.97418322 0.02581678]
 [0.99411946 0.00588054]
 [0.995293   0.004707  ]
 [0.98709592 0.01290408]
 [0.85081789 0.14918211]
 [0.99363129 0.00636871]]


In [11]:
from pyod.utils.data import evaluate_print

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


On Training Data:
LOF ROC:0.615, precision @ rank n:0.0562

On Test Data:
LOF ROC:0.6486, precision @ rank n:0.1316
