In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Import PyOD and the Outlier Detection Methods

In [11]:
from pyod.models.pca import PCA
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

# Import Metrics Package

In [12]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [13]:
# File names of the datasets to process, one MATLAB .mat file per dataset.
mat_file_list = [
    'arrhythmia.mat',
    'cardio.mat',
    'glass.mat',
    'ionosphere.mat',
    'letter.mat',
    'lympho.mat',
    'mnist.mat',
    'musk.mat',
    'optdigits.mat',
    'pendigits.mat',
    'pima.mat',
    'satellite.mat',
    'satimage-2.mat',
    'shuttle.mat',
    'vertebral.mat',
    'vowels.mat',
    'wbc.mat',
]

In [14]:
# Display the dataset file names as a quick sanity check
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [15]:
# Load a single dataset file so that its structure can be inspected
data=loadmat("cardio.mat")

In [16]:
# Show the loaded contents: MAT-file metadata entries plus the 'X' and 'y' arrays
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [17]:
# Number of top-level entries in the loaded dictionary
len(data)

5

In [18]:
# Column layout shared by the ROC, precision and timing tables:
# four dataset-description columns followed by one column per detector.
df_columns = [
    # dataset description
    'Data', '#Sample', '#Dimensions', 'Outlier Perc',
    # one column per detector
    'ABOD', 'CBLOF', 'FB', 'HBOS', 'IFOREST',
    'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA',
]

# Creating the ROC, Precision and Execution-Time Evaluation Tables

In [19]:
# Empty table for the ROC scores, using the shared column layout;
# displayed to confirm the header.
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA


In [20]:
# Empty table for the precision @ rank n scores, using the shared column layout;
# displayed to confirm the header.
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA


In [21]:
# Empty table for the execution times, using the shared column layout;
# displayed to confirm the header.
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA


# Exploring the Mat Files: Running Every Detector on Every Dataset

In [22]:
from time import time

# A single generator is shared by the train/test split and by every detector
# that accepts ``random_state``; it is re-created each time this cell runs.
random_state = np.random.RandomState(42)

# Completed rows are collected in plain lists and the three result tables are
# built from them in one step. Compared with concatenating a one-row frame per
# dataset, this keeps numeric columns numeric, gives every row its own index
# label, and makes the cell safe to re-run (tables are replaced, not extended).
roc_rows, prn_rows, time_rows = [], [], []

try:
    for mat_file in mat_file_list:
        print("\n... Processing", mat_file, '...')
        mat = loadmat(mat_file)

        X = mat['X']
        y = mat['y'].ravel()
        # Share of non-zero labels, used as the contamination of every detector.
        outliers_fraction = np.count_nonzero(y) / len(y)
        outliers_percentage = round(outliers_fraction * 100, ndigits=4)

        # Leading (dataset description) columns, identical in all three tables.
        dataset_info = [os.path.splitext(mat_file)[0], X.shape[0], X.shape[1],
                        outliers_percentage]
        roc_list = list(dataset_info)
        prn_list = list(dataset_info)
        time_list = list(dataset_info)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=random_state)

        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # The insertion order of this dict must match the detector columns of
        # ``df_columns`` because scores are appended positionally.
        classifiers = {
            'Angle-based Outlier Detector (ABOD)': ABOD(
                contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor': CBLOF(
                contamination=outliers_fraction, check_estimator=False,
                random_state=random_state),
            'Feature Bagging': FeatureBagging(
                contamination=outliers_fraction, random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)': HBOS(
                contamination=outliers_fraction),
            'Isolation Forest': IForest(
                contamination=outliers_fraction, random_state=random_state),
            'K Nearest Neighbors (KNN)': KNN(
                contamination=outliers_fraction),
            'Local Outlier Factor (LOF)': LOF(
                contamination=outliers_fraction),
            'Minimum Covariance Determinant (MCD)': MCD(
                contamination=outliers_fraction, random_state=random_state),
            'One-class SVM (OCSVM)': OCSVM(
                contamination=outliers_fraction),
            'Principal Component Analysis (PCA)': PCA(
                contamination=outliers_fraction, random_state=random_state),
        }

        for clf_name, clf in classifiers.items():
            # The measured time covers fitting on the training split plus
            # scoring the test split.
            t0 = time()
            clf.fit(X_train_norm)
            test_scores = clf.decision_function(X_test_norm)
            t1 = time()
            duration = round(t1 - t0, ndigits=4)
            time_list.append(duration)

            roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
            prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

            print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
                  'execution time: {duration}s'.format(
                      clf_name=clf_name, roc=roc, prn=prn, duration=duration))

            roc_list.append(roc)
            prn_list.append(prn)

        # Only datasets for which every detector finished are recorded.
        time_rows.append(time_list)
        roc_rows.append(roc_list)
        prn_rows.append(prn_list)
finally:
    # Build the tables even when the run stops early (error or manual
    # interrupt) so the datasets that did finish are not lost.
    time_df = pd.DataFrame(time_rows, columns=df_columns)
    roc_df = pd.DataFrame(roc_rows, columns=df_columns)
    prn_df = pd.DataFrame(prn_rows, columns=df_columns)



... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 2.8476s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 3.5694s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 1.3059s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 3.3392s
Isolation Forest ROC:0.8527, precision @ rank n:0.5714, execution time: 0.8308s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.1972s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.1678s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.1889s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.1169s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1319s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 0.8523s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.3991s
Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 1.6731s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.0171s
Isolation Forest ROC:0.9414, precision @ rank n:0.5, execution time: 0.8533s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.3351s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.2125s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 1.1716s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.1704s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.0088s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.0955s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.1004s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.0747s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.0051s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 0.6238s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.0198s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.0068s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ ran



Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 15.3689s
Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 2.281s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 83.1086s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.0688s
Isolation Forest ROC:0.7801, precision @ rank n:0.2979, execution time: 2.5049s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 8.5522s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 8.3959s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 3.4579s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 6.099s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.1761s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 3.0054s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.4565s
Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 18.0199s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.0809s
Isolation Forest ROC:0.9996, precision @ rank n:0.9333, execution time: 1.5223s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 2.285s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 2.2453s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ 



Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 1.748s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 1.9111s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.0558s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 2.0121s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.3807s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 6.2932s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.012s
Isolation Forest ROC:0.9422, precision @ rank n:0.2769, execution time: 0.9137s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 0.8158s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 0.7636s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision





Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 14.183s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 57.465s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.0394s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.0628s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.0679s
Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.0516s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.003s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 0.3816s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.013s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.004s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank n:0.0

In [23]:
# ROC score of each detector (columns) on each dataset (rows)
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [24]:
# Precision @ rank n of each detector (columns) on each dataset (rows)
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385


In [25]:
# Fit-plus-scoring time in seconds of each detector (columns) on each dataset (rows)
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,2.8476,3.5694,1.3059,3.3392,0.8308,0.1972,0.1678,1.1889,0.1169,0.1319
0,cardio,1831,21,9.6122,0.8523,0.3991,1.6731,0.0171,0.8533,0.3351,0.2125,1.1716,0.1704,0.0088
0,glass,214,9,4.2056,0.0955,0.1004,0.0747,0.0051,0.6238,0.0198,0.0068,0.0685,0.005,0.001
0,ionosphere,351,33,35.8974,0.1718,0.1066,0.1838,0.0195,0.6725,0.0394,0.0158,0.1364,0.0115,0.0057
0,letter,1600,32,6.25,0.8385,0.2637,1.6069,0.0232,0.86,0.3001,0.1969,2.1949,0.1871,0.0148
0,lympho,148,18,4.0541,0.0697,0.1021,0.0822,0.0163,0.6461,0.0129,0.0076,0.0748,0.001,0.0
0,mnist,7603,100,9.2069,15.3689,2.281,83.1086,0.0688,2.5049,8.5522,8.3959,3.4579,6.099,0.1761
0,musk,3062,166,3.1679,3.0054,0.4565,18.0199,0.0809,1.5223,2.285,2.2453,13.9755,1.619,0.1756
0,optdigits,5216,64,2.8758,3.3323,0.6518,18.9978,0.04,1.1806,2.4386,2.2321,1.748,1.9111,0.0558
0,pendigits,6870,16,2.2707,2.0121,0.3807,6.2932,0.012,0.9137,0.8158,0.7636,2.7043,1.2453,0.009
