In [8]:
# Import base libraries
import os
import sys
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import ML libraries
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
from sklearn.metrics import roc_auc_score

# Import PyOD library
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores

In [9]:
# Benchmark datasets: file names of the .mat files that the evaluation loop
# loads one at a time (each is expected to provide an 'X' and a 'y' entry).
mat_file_list = [
    'arrhythmia.mat',
    'cardio.mat',
    'glass.mat',
    'ionosphere.mat',
    'letter.mat',
    'lympho.mat',
    'mnist.mat',
    'musk.mat',
    'optdigits.mat',
    'pendigits.mat',
    'pima.mat',
    'satellite.mat',
    'satimage-2.mat',
    'shuttle.mat',
    'vertebral.mat',
    'vowels.mat',
    'wbc.mat',
]

In [10]:
# Column layout shared by the three result tables: four dataset-description
# columns followed by one column per detector.
df_columns = (
    ['Data', '#Samples', '#Dimensions', 'Outlier Perc']
    + ['ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA']
)

# Three independent, initially empty tables: ROC-AUC, precision @ rank n,
# and execution time.
roc_df, prn_df, time_df = (pd.DataFrame(columns=df_columns) for _ in range(3))

In [15]:
# Benchmark every detector on every dataset listed in mat_file_list.
#
# For each .mat file the loop:
#   1. loads 'X' and 'y' and derives the outlier fraction from the share of
#      non-zero labels,
#   2. makes a 60/40 train/test split and passes both parts through PyOD's
#      standardizer,
#   3. fits each detector on the training part, scores the test part, and
#      records ROC-AUC, precision @ rank n and the elapsed wall-clock time.
#
# Result rows are collected in plain lists and converted into roc_df / prn_df /
# time_df once, after the loop.  Compared with concatenating a one-row frame
# per dataset this
#   * keeps the cell idempotent: re-running it rebuilds the tables instead of
#     appending duplicate rows to frames created in another cell,
#   * yields a unique 0..n-1 index instead of every row being labelled 0,
#   * lets pandas infer numeric dtypes instead of leaving every column object,
#   * avoids re-copying all earlier rows on every iteration.
random_state = np.random.RandomState(42)

roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print('\n... Processing',mat_file,'...')
    mat = loadmat(os.path.join('Anamoly_detec_data',mat_file))
    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100,ndigits=4)

    # Leading cells shared by this dataset's row in all three tables:
    # name without the '.mat' suffix, #samples, #dimensions, outlier percentage.
    dataset_info = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    roc_list = list(dataset_info)
    prn_list = list(dataset_info)
    time_list = list(dataset_info)

    # The shared RandomState instance is consumed in a fixed order (split
    # first, then the detectors in dict order); keep that order unchanged so
    # the scores stay comparable with earlier runs.
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=random_state)

    X_train_norm, X_test_norm = standardizer(X_train,X_test)

    # NOTE: insertion order must match the detector columns of df_columns
    # (ABOD, CBLOF, FB, HBOS, IForest, KNN, LOF, MCD, OCSVM, PCA) because the
    # results are appended positionally.
    classifiers = {'Angle-based Outlier Detector (ABOD)':ABOD(contamination=outliers_fraction),
                   'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                              random_state=random_state),
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                   'K Nearest Neighbours (KNN)': KNN(contamination=outliers_fraction),
                   'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction,random_state=random_state),
                   'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction,random_state=random_state)}

    for clf_name, clf in classifiers.items():
        # The timed span covers fitting on the training part plus scoring the
        # test part; metric computation is deliberately outside of it.
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0 , ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn = round(precision_n_scores(y_test,test_scores),ndigits=4)

        print('{clf_name} ROC: {roc}, precision @ rank n: {prn}, execution time: {duration}s'.format(clf_name=clf_name,roc=roc,prn=prn,duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    time_rows.append(time_list)
    roc_rows.append(roc_list)
    prn_rows.append(prn_list)

# Build each table exactly once.  The constructor raises if a row's length
# does not match df_columns, so a detector/column mismatch still fails loudly.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.7687, precision @ rank n: 0.3571, execution time: 0.3812s
Cluster-based Local Outlier Factor ROC: 0.7684, precision @ rank n: 0.4643, execution time: 4.8557s
Feature Bagging ROC: 0.7799, precision @ rank n: 0.5, execution time: 1.0318s
Histogram-base Outlier Detection (HBOS) ROC: 0.8511, precision @ rank n: 0.5714, execution time: 5.0512s
Isolation Forest ROC: 0.8527, precision @ rank n: 0.5714, execution time: 1.0438s
K Nearest Neighbours (KNN) ROC: 0.782, precision @ rank n: 0.5, execution time: 0.1778s
Local Outlier Factor (LOF) ROC: 0.7787, precision @ rank n: 0.4643, execution time: 0.1318s




Minimum Covariance Determinant (MCD) ROC: 0.8228, precision @ rank n: 0.4286, execution time: 7.5286s
One-class SVM (OCSVM) ROC: 0.7986, precision @ rank n: 0.5, execution time: 0.1216s
Principal Component Analysis (PCA) ROC: 0.7997, precision @ rank n: 0.5, execution time: 0.183s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.5763, precision @ rank n: 0.1875, execution time: 1.0896s
Cluster-based Local Outlier Factor ROC: 0.8221, precision @ rank n: 0.4844, execution time: 0.3343s
Feature Bagging ROC: 0.4879, precision @ rank n: 0.1406, execution time: 1.4958s
Histogram-base Outlier Detection (HBOS) ROC: 0.8453, precision @ rank n: 0.4688, execution time: 0.0152s
Isolation Forest ROC: 0.9414, precision @ rank n: 0.5, execution time: 0.9332s
K Nearest Neighbours (KNN) ROC: 0.6959, precision @ rank n: 0.2812, execution time: 0.353s
Local Outlier Factor (LOF) ROC: 0.4715, precision @ rank n: 0.125, execution time: 0.184s




Minimum Covariance Determinant (MCD) ROC: 0.8778, precision @ rank n: 0.3906, execution time: 2.1206s
One-class SVM (OCSVM) ROC: 0.9507, precision @ rank n: 0.5938, execution time: 0.1281s
Principal Component Analysis (PCA) ROC: 0.9638, precision @ rank n: 0.6875, execution time: 0.011s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.7104, precision @ rank n: 0.25, execution time: 0.1067s
Cluster-based Local Outlier Factor ROC: 0.8506, precision @ rank n: 0.25, execution time: 0.1009s
Feature Bagging ROC: 0.7043, precision @ rank n: 0.25, execution time: 0.0784s
Histogram-base Outlier Detection (HBOS) ROC: 0.6524, precision @ rank n: 0.0, execution time: 0.0084s
Isolation Forest ROC: 0.7195, precision @ rank n: 0.25, execution time: 0.6801s
K Nearest Neighbours (KNN) ROC: 0.7805, precision @ rank n: 0.25, execution time: 0.0238s
Local Outlier Factor (LOF) ROC: 0.7774, precision @ rank n: 0.25, execution time: 0.006s
Minimum Covariance Determinant (MCD) ROC: 0.7




One-class SVM (OCSVM) ROC: 0.9636, precision @ rank n: 0.6, execution time: 0.0037s
Principal Component Analysis (PCA) ROC: 0.9818, precision @ rank n: 0.8, execution time: 0.004s

... Processing mnist.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.7813, precision @ rank n: 0.3562, execution time: 14.3187s
Cluster-based Local Outlier Factor ROC: 0.8447, precision @ rank n: 0.4007, execution time: 1.7753s
Feature Bagging ROC: 0.7259, precision @ rank n: 0.3664, execution time: 91.3452s
Histogram-base Outlier Detection (HBOS) ROC: 0.5675, precision @ rank n: 0.1199, execution time: 0.124s
Isolation Forest ROC: 0.7801, precision @ rank n: 0.2979, execution time: 3.2228s
K Nearest Neighbours (KNN) ROC: 0.8409, precision @ rank n: 0.4144, execution time: 11.8631s
Local Outlier Factor (LOF) ROC: 0.7085, precision @ rank n: 0.339, execution time: 11.106s




Minimum Covariance Determinant (MCD) ROC: 0.863, precision @ rank n: 0.3973, execution time: 30.5514s
One-class SVM (OCSVM) ROC: 0.8417, precision @ rank n: 0.3801, execution time: 7.656s
Principal Component Analysis (PCA) ROC: 0.8396, precision @ rank n: 0.3767, execution time: 0.2506s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.0809, precision @ rank n: 0.0333, execution time: 4.5638s
Cluster-based Local Outlier Factor ROC: 1.0, precision @ rank n: 1.0, execution time: 0.6512s
Feature Bagging ROC: 0.5228, precision @ rank n: 0.1667, execution time: 23.173s
Histogram-base Outlier Detection (HBOS) ROC: 0.9999, precision @ rank n: 0.9667, execution time: 0.1509s
Isolation Forest ROC: 0.9996, precision @ rank n: 0.9333, execution time: 2.1883s
K Nearest Neighbours (KNN) ROC: 0.7348, precision @ rank n: 0.2333, execution time: 3.2398s
Local Outlier Factor (LOF) ROC: 0.5323, precision @ rank n: 0.1333, execution time: 2.9426s
Minimum Covariance Determinant (MCD)



Minimum Covariance Determinant (MCD) ROC: 0.3486, precision @ rank n: 0.0, execution time: 14.8066s
One-class SVM (OCSVM) ROC: 0.4972, precision @ rank n: 0.0, execution time: 2.3387s
Principal Component Analysis (PCA) ROC: 0.504, precision @ rank n: 0.0, execution time: 0.095s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.7008, precision @ rank n: 0.0308, execution time: 4.0082s
Cluster-based Local Outlier Factor ROC: 0.9609, precision @ rank n: 0.3077, execution time: 0.544s
Feature Bagging ROC: 0.4687, precision @ rank n: 0.0462, execution time: 8.6638s
Histogram-base Outlier Detection (HBOS) ROC: 0.9294, precision @ rank n: 0.2615, execution time: 0.0208s
Isolation Forest ROC: 0.9422, precision @ rank n: 0.2769, execution time: 1.4564s
K Nearest Neighbours (KNN) ROC: 0.7602, precision @ rank n: 0.0462, execution time: 1.4049s
Local Outlier Factor (LOF) ROC: 0.481, precision @ rank n: 0.0462, execution time: 1.0415s
Minimum Covariance Determinant (MCD)





Minimum Covariance Determinant (MCD) ROC: 0.9903, precision @ rank n: 0.7534, execution time: 23.464s
One-class SVM (OCSVM) ROC: 0.9922, precision @ rank n: 0.9553, execution time: 88.6867s
Principal Component Analysis (PCA) ROC: 0.9902, precision @ rank n: 0.9503, execution time: 0.0534s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC: 0.2797, precision @ rank n: 0.0, execution time: 0.1381s
Cluster-based Local Outlier Factor ROC: 0.3908, precision @ rank n: 0.0, execution time: 0.1165s
Feature Bagging ROC: 0.3027, precision @ rank n: 0.0, execution time: 0.0821s
Histogram-base Outlier Detection (HBOS) ROC: 0.2695, precision @ rank n: 0.0, execution time: 0.006s
Isolation Forest ROC: 0.3576, precision @ rank n: 0.0, execution time: 0.7251s
K Nearest Neighbours (KNN) ROC: 0.318, precision @ rank n: 0.0, execution time: 0.024s
Local Outlier Factor (LOF) ROC: 0.318, precision @ rank n: 0.0, execution time: 0.006s
Minimum Covariance Determinant (MCD) ROC: 0.3308,

In [16]:
# Show the ROC-AUC table: one row per processed dataset, one column per
# detector (values were rounded to 4 decimals when they were recorded).
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9127,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [17]:
# Show the precision @ rank n table: one row per processed dataset, one
# column per detector (values were rounded to 4 decimals when recorded).
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385


In [18]:
# Show the execution-time table: seconds spent on fit + decision_function for
# each detector on each processed dataset (rounded to 4 decimals).
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3812,4.8557,1.0318,5.0512,1.0438,0.1778,0.1318,7.5286,0.1216,0.183
0,cardio,1831,21,9.6122,1.0896,0.3343,1.4958,0.0152,0.9332,0.353,0.184,2.1206,0.1281,0.011
0,glass,214,9,4.2056,0.1067,0.1009,0.0784,0.0084,0.6801,0.0238,0.006,0.1194,0.003,0.003
0,ionosphere,351,33,35.8974,0.1831,0.1178,0.153,0.0228,0.7327,0.0295,0.0181,0.3998,0.0082,0.0041
0,letter,1600,32,6.25,0.9682,0.2618,1.4574,0.0192,0.9423,0.3353,0.1781,6.8193,0.1281,0.0171
0,lympho,148,18,4.0541,0.0599,0.0952,0.0742,0.0145,0.6849,0.0174,0.005,0.1422,0.0037,0.004
0,mnist,7603,100,9.2069,14.3187,1.7753,91.3452,0.124,3.2228,11.8631,11.106,30.5514,7.656,0.2506
0,musk,3062,166,3.1679,4.5638,0.6512,23.173,0.1509,2.1883,3.2398,2.9426,155.846,1.9188,0.3319
0,optdigits,5216,64,2.8758,5.6262,0.8628,26.2401,0.0722,1.7744,3.5219,2.9736,14.8066,2.3387,0.095
0,pendigits,6870,16,2.2707,4.0082,0.544,8.6638,0.0208,1.4564,1.4049,1.0415,6.0129,1.6465,0.0172
