In [76]:
import os  #importing modules
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [77]:
from warnings import filterwarnings
filterwarnings('ignore')

In [78]:
# import pyod packages and methods
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [79]:
# Import performance metrics

In [80]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [81]:
#Assigning all dataset to the list
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat'
                ]

In [82]:
# Load a single dataset (cardio) to inspect the structure of a .mat file.
data = loadmat(file_name='Anamoly_detec_data/cardio.mat')

In [83]:
# Show the whole dict returned by loadmat: MAT-file metadata plus the 'X' and 'y' arrays.
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [84]:
# Number of top-level entries in the loaded dict.
len(data)

5

In [85]:
# Names of the entries in the loaded dict.
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [86]:
# Values of the entries in the loaded dict, in the same order as the keys.
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# Input (Independent) Feature Shape in the .mat File

In [87]:
# Container type and dimensions of the feature matrix.
(type(data['X']), data['X'].shape)

(numpy.ndarray, (1831, 21))

# Dependent / Target / Output Feature Shape

In [88]:
# Container type and dimensions of the label array.
(type(data['y']), data['y'].shape)

(numpy.ndarray, (1831, 1))

In [89]:
# Header shared by the result tables.
df_columns = [
    # dataset descriptors
    'Data', '#Samples', '#Dimensions', 'Outlier Perc',
    # one column per detector
    *'ABOD CBLOF FB HBOS IForest KNN LOF MCD OCSVM PCA'.split(),
]

In [97]:
# Empty evaluation tables, one per metric, all sharing the same header:
#   roc_df  - ROC performance
#   prn_df  - precision performance
#   time_df - time performance
roc_df, prn_df, time_df = (pd.DataFrame(columns=df_columns) for _ in range(3))

# Exploring All .mat Files

In [91]:
from time import time

In [92]:
# Seeded random number generator used for the data split and the detectors.
random_state = np.random.RandomState(seed=42)

In [98]:
# Benchmark every detector on every dataset in mat_file_list.
# For each dataset: load it, split it 60/40, standardize it, fit each detector
# on the training part and score the test part, then record ROC, precision @
# rank n and elapsed time. The results end up in roc_df, prn_df and time_df,
# one row per dataset.

# Rows are collected in plain lists and turned into DataFrames once, after the
# loop. Concatenating onto the tables once per dataset gave every row the index
# label 0 and appended duplicate rows whenever the cell was re-run.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print("\n...... Processing",mat_file,"...")
    mat = loadmat(os.path.join('Anamoly_detec_data',mat_file))

    x = mat['X']
    y = mat['y'].ravel()  # flatten the labels to 1-D
    # Share of non-zero labels; passed to each detector as its contamination.
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage = round(outliers_fraction * 100,ndigits=4)

    # Every result row starts with the same four dataset descriptors
    # (mat_file[:-4] drops the 4-character '.mat' suffix).
    dataset_info = [mat_file[:-4], x.shape[0], x.shape[1], outliers_percentage]
    roc_list = list(dataset_info)
    prn_list = list(dataset_info)
    time_list = list(dataset_info)

    # 60% data for training and 40% for testing
    # NOTE(review): random_state is handed to the split and to several
    # detectors. If it is a shared RandomState instance rather than an int,
    # results presumably depend on how many draws happened before - confirm
    # this is intended.
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.4,random_state=random_state)

    # standardizing data for processing
    x_train_norm,x_test_norm = standardizer(x_train,x_test)

    # Insertion order must match the detector columns of df_columns, because
    # the metrics are appended to the rows positionally.
    classifiers = {'Angle-based Outlier Dedector (ABOD)': ABOD(contamination = outliers_fraction),
                   'cluster based Local Outlier Factor': CBLOF(contamination= outliers_fraction,check_estimator=False,random_state=random_state),
                   # Fixed: the fraction was passed as `combination`, which
                   # selects how the base detectors' scores are merged, so the
                   # contamination was never set for this detector.
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                   'Histogram-base outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   'Isolation Forest':IForest(contamination=outliers_fraction,random_state=random_state),
                   'K- Nearest Neibour(KNN)':KNN(contamination=outliers_fraction),
                   'Local outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   'Minimum Covariance Determinant':MCD(contamination=outliers_fraction,random_state=random_state),
                   'one-class SVM (OCSVM)':OCSVM(contamination=outliers_fraction),
                   'principal Component Analysis (PCA)': PCA(contamination=outliers_fraction,random_state=random_state)
                  }

    for clf_name,clf in classifiers.items():
        # The timed span covers fitting plus scoring of the test set.
        t0 = time()
        clf.fit(x_train_norm)
        test_score =  clf.decision_function(x_test_norm)
        t1=time()
        duration = round(t1-t0,ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test,test_score),ndigits=4)
        prn = round(precision_n_scores(y_test,test_score),ndigits=4)

        print('{clf_name} ROC:{roc},precision @ rank n:{prn},execution time:{duration}s'.format(clf_name=clf_name,roc=roc,prn=prn,duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    time_rows.append(time_list)
    roc_rows.append(roc_list)
    prn_rows.append(prn_list)

# Build each table in one step; rows get a unique 0..n-1 index.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)


...... Processing arrhythmia.mat ...
Angle-based Outlier Dedector (ABOD) ROC:0.7403,precision @ rank n:0.2963,execution time:0.192s
cluster based Local Outlier Factor ROC:0.7083,precision @ rank n:0.2593,execution time:0.13s
Feature Bagging ROC:0.695,precision @ rank n:0.2593,execution time:0.519s
Histogram-base outlier Detection (HBOS) ROC:0.791,precision @ rank n:0.5185,execution time:0.084s
Isolation Forest ROC:0.8016,precision @ rank n:0.5185,execution time:0.47s
K- Nearest Neibour(KNN) ROC:0.7208,precision @ rank n:0.3333,execution time:0.088s
Local outlier Factor (LOF) ROC:0.708,precision @ rank n:0.2963,execution time:0.066s
Minimum Covariance Determinant ROC:0.7097,precision @ rank n:0.3333,execution time:0.615s
one-class SVM (OCSVM) ROC:0.7211,precision @ rank n:0.3333,execution time:0.036s
principal Component Analysis (PCA) ROC:0.7229,precision @ rank n:0.3704,execution time:0.053s

...... Processing cardio.mat ...
Angle-based Outlier Dedector (ABOD) ROC:0.5463,precision @ r

Feature Bagging ROC:0.5336,precision @ rank n:0.0519,execution time:4.2412s
Histogram-base outlier Detection (HBOS) ROC:0.9202,precision @ rank n:0.2987,execution time:0.01s
Isolation Forest ROC:0.9631,precision @ rank n:0.4286,execution time:0.73s
K- Nearest Neibour(KNN) ROC:0.802,precision @ rank n:0.1688,execution time:0.675s
Local outlier Factor (LOF) ROC:0.5552,precision @ rank n:0.0779,execution time:0.63s
Minimum Covariance Determinant ROC:0.8408,precision @ rank n:0.0779,execution time:2.2791s
one-class SVM (OCSVM) ROC:0.9386,precision @ rank n:0.3766,execution time:1.0111s
principal Component Analysis (PCA) ROC:0.9357,precision @ rank n:0.3247,execution time:0.009s

...... Processing pima.mat ...
Angle-based Outlier Dedector (ABOD) ROC:0.6772,precision @ rank n:0.5304,execution time:0.214s
cluster based Local Outlier Factor ROC:0.6815,precision @ rank n:0.5217,execution time:0.085s
Feature Bagging ROC:0.5991,precision @ rank n:0.4696,execution time:0.097s
Histogram-base outlie

In [94]:
# Display the ROC results table (one row per dataset, one column per detector).
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7675,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.8
0,cardio,1831,21,9.6122,0.5763,0.8221,0.5144,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7104,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8895,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.8705,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9636,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7476,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5371,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4617,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4244,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [95]:
# Display the execution-time results table (seconds spent on fit + scoring).
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.186,0.145,0.52,0.081,0.438,0.081,0.068,0.592,0.035,0.052
0,cardio,1831,21,9.6122,0.552,0.162,0.823,0.008,0.423,0.161,0.099,0.583,0.079,0.004
0,glass,214,9,4.2056,0.058,0.051,0.047,0.004,0.316,0.011,0.003,0.038,0.002,0.002
0,ionosphere,351,33,35.8974,0.099,0.076,0.073,0.011,0.347,0.021,0.007,0.071,0.004,0.003
0,letter,1600,32,6.25,0.513,0.76,0.756,0.013,0.453,0.142,0.088,1.3201,0.096,0.016
0,lympho,148,18,4.0541,0.053,0.058,0.034,0.006,0.316,0.008,0.003,0.034,0.002,0.001
0,mnist,7603,100,9.2069,6.9604,1.1241,44.7566,0.061,2.0521,5.5213,5.4903,3.3422,3.9572,0.132
0,musk,3062,166,3.1679,2.2351,0.343,11.7107,0.069,1.3021,1.5251,1.4341,19.1091,0.9801,0.119
0,optdigits,5216,64,2.8758,2.8372,0.555,13.1678,0.034,0.9621,1.8431,1.5301,1.2901,1.2561,0.039
0,pendigits,6870,16,2.2707,2.2231,0.3,5.0193,0.011,0.703,0.662,0.583,2.1521,1.0381,0.009


In [96]:
# Display the precision @ rank n results table.
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.4643,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.125,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.7679,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.375,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3493,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.3,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0323,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385
