In [66]:
# This is a simple example of PyOD usage
# Its goal is to understand the basic functionality of a PyOD algorithm
# and the way the scores, probabilities and final predicted labels are calculated.

In [67]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split

# Import dataset
data = arff.loadarff("ALOI_withoutdupl.arff") # unnormalized
df = pd.DataFrame(data[0])
df.outlier = df.outlier.str.decode("utf-8")
df['outlier'] = df['outlier'].map({'yes':1,'no':0}) # convert yes/no to 1/0
df.dropna()
if 'id' in df:
    del df['id'] # drop id column
#
# Define problem params
N_max = 10000 # number of max input instances
# Subsample if N > N_max
if(len(df) > N_max):
        df = df.sample(n=N_max)
print("Size of sampled dataset:",len(df))
# Split (stratified) train and test sets
X  = df.iloc[:, :-1] # drop labels for X
y = df['outlier'] # keep labels for y
# Random state set for reproducible results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y) # stratified split
print("Ratio of outliers in training set:", len(y_train[y_train==1])/len(y_train))
print("Ratio of outliers in test set:", len(y_test[y_test==1])/len(y_test))
print("Training size:", len(X_train))
print("Test size:", len(X_test))
# Normalize training set
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
for col in X_train_norm.columns:
    mu = X_train_norm[col].mean()
    sig = X_train_norm[col].std()
    X_train_norm[col] = (X_train_norm[col] - mu) / sig
    X_test_norm[col] = (X_test_norm[col] - mu) / sig

Size of sampled dataset: 10000
Ratio of outliers in training set: 0.0285
Ratio of outliers in test set: 0.0285
Training size: 8000
Test size: 2000


In [70]:
from pyod.models.knn import KNN   # kNN detector
from pyod.models.lof import LOF # LOF detector

# train kNN detector
clf_name = 'LOF' #'KNN'
clf = LOF() # contamination = 0.1 by default
clf.fit(X_train)

# get the prediction label and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_proba = clf.predict_proba(X_train) # probability of outliers

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores
y_test_proba = clf.predict_proba(X_test)  # probabilities in the range of [0,1]

# Model instance
print('Model instance:')
print('Parameters:',clf.get_params())
# Training set
print("\nTraining set:")
print('Threshold:',clf.threshold_)
print('Labels:',clf.labels_[:5])
print('Scores:',clf.decision_scores_[:5]) # y_train_scores[:5]
print('Probabilities:\n', y_train_proba[:5])
# Test set
print("\nTest set")
print('Labels:',y_test_pred[15:25])
print('Scores:',y_test_scores[15:25])
print('Probabilities:\n',y_test_proba[15:25])

Model instance:
Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': 1, 'n_neighbors': 20, 'p': 2}

Training set:
Threshold: 1.258477607088302
Labels: [1 0 0 0 0]
Scores: [2.27862817 1.02058725 1.04285717 1.05412967 1.01592581]
Probabilities:
 [[0.6032966  0.3967034 ]
 [0.97186057 0.02813943]
 [0.96834258 0.03165742]
 [0.96121165 0.03878835]
 [0.97140909 0.02859091]]

Test set
Labels: [0 0 0 0 0 0 1 0 0 0]
Scores: [1.03357017 1.08291163 1.13134372 1.1243341  0.9906937  1.12643377
 1.29778837 1.03298952 1.02717508 1.02767294]
Probabilities:
 [[0.96552287 0.03447713]
 [0.95026717 0.04973283]
 [0.93529263 0.06470737]
 [0.93745991 0.06254009]
 [0.97877968 0.02122032]
 [0.93681072 0.06318928]
 [0.88383025 0.11616975]
 [0.9657024  0.0342976 ]
 [0.96750014 0.03249986]
 [0.96734621 0.03265379]]


In [71]:
from pyod.utils.data import evaluate_print

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


On Training Data:
LOF ROC:0.6501, precision @ rank n:0.0789

On Test Data:
LOF ROC:0.6799, precision @ rank n:0.1754
