In [1]:
import random
import numpy as np
import pandas as pd


from evaluator import evaluate

from data_loader import load_kdd_cup_urc, load_yahoo_A1, load_yahoo_A2, load_yahoo_A3, load_yahoo_A4, load_power_demand # Univariate Datasets
from data_loader import load_nasa, load_ecg, load_gesture, load_smd # Multivariate Datasets

from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest as ISF

# Reproducibility: seed every random number generator imported above with one value.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
# tf.random.set_seed(SEED)  # left disabled: TensorFlow is not imported in this cell

### Yahoo S5

In [2]:
# Fresh per-series score accumulator for the Yahoo S5 section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = {column: [] for column in ('dataset', 'f1', 'pr_auc', 'roc_auc')}

In [3]:
# Isolation Forest baseline on the four Yahoo S5 benchmarks: one model per series,
# fitted on that series' training split and scored on its test split.
for loader in (load_yahoo_A1, load_yahoo_A2, load_yahoo_A3, load_yahoo_A4):
    benchmark = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains = datasets['x_train']
    x_tests = datasets['x_test']
    y_tests = datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train, X_test = x_trains[i], x_tests[i]

        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        anomaly_scores = -clf.decision_function(X_test)
        anomaly_scores = anomaly_scores.reshape(y_tests[i].shape)

        scores = evaluate(X_test, anomaly_scores, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        record = {
            'dataset': benchmark,
            'f1': best_f1,
            'pr_auc': scores['pr_auc'],
            'roc_auc': scores['roc_auc'],
        }
        for column, value in record.items():
            total_scores[column].append(value)
        print(benchmark, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

yahoo_A1 0.14814813333333407 0.0 0.4599056372736083
yahoo_A1 0.9629629058984938 0.0 0.1483253576752258
yahoo_A1 0.9729729177501851 0.056783529418183425 0.8170669346939724
yahoo_A1 0.9999999000000052 0.0 0.2750582612412459
yahoo_A1 0.9090908429752097 0.5138729188363166 0.32921568071011925
yahoo_A1 0.999999930000003 0.5999999693333345 0.30751173086678135
yahoo_A1 0.5999999460000028 0.0 0.31264366766759183
yahoo_A1 0.9082125099302228 0.8063588440897093 0.13508257691877498
yahoo_A1 0.9107142349170947 0.9038292054717418 0.11451151827561118
yahoo_A1 0.7692307168967815 0.3089846907177446 0.6867469834941573
yahoo_A1 0.3636363272727291 0.12243248771369242 0.10444977232892078
yahoo_A1 0.7999999413333363 0.0 0.3285714230170069
yahoo_A1 0.9999999484127009 0.2380952372856116 0.027548209315074828
yahoo_A1 0.9999999250000032 0.7499999447916703 0.12559241389203976
yahoo_A1 0.999999943333336 0.06666666576190483 0.18203309567012613
yahoo_A1 0.999999947674421 0.0 0.051679586429768505
yahoo_A1 0.961538410

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

yahoo_A2 0.03448275190249817 0.008018948323825187 0.6406455701972145
yahoo_A2 0.3999999440000059 0.009203150801995824 0.4996706066782387
yahoo_A2 0.01162790581733378 0.0 0.10408431885980451
yahoo_A2 0.05696201974643535 0.0 0.2661596928264268
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.03652967674568947 0.0 0.28722001913211875
yahoo_A2 0.023622044903590014 0.0 0.05703421989137722
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.01797752630021475 0.0 0.3386034170502501
yahoo_A2 0.4285713818877595 0.018970502955849126 0.4575411862397146
yahoo_A2 0.011976046699415543 0.0 0.4914610513298984
yahoo_A2 0.3809523464852629 0.0 0.5309617784873571
yahoo_A2 0.07594935624899998 0.008092662208835125 0.6077312986203411
yahoo_A2 0.05232557626926761 0.0 0.2648922657177295
yahoo_A2 0.004065040243076211 0.0 0.06261858955168592
yahoo_A2 0.022222217851852605 0.0028409090182722127 0.1442687710778303
yahoo_A2 0.03333333004321016 0.0 0.2940430892177668
yahoo_A2 0.01694915083309394 0.004273503842501322 0.10056924988604102
yahoo_A2 0.0173

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.02597402289222497 0.0 0.20749664977984228
yahoo_A3 0.7999999200000041 0.04254385801365809 0.23020624643885906
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.8888888197530895 0.5999999580000026 0.20040079755502999
yahoo_A3 0.01851851666666676 0.004854243294746385 0.026892428928905948
yahoo_A3 0.019646363084904188 0.002721554278964139 0.0803212837857454
yahoo_A3 0.03389829531744 0.004274133973255668 0.33133731423858903
yahoo_A3 0.08695649829868218 0.0 0.28276778825582405
yahoo_A3 0.03208555625840105 0.0024257496622937607 0.3983967855393996
yahoo_A3 0.11764704067666523 0.014284544228795172 0.22985971479561934
yahoo_A3 0.030456847813652193 0.0 0.3703406738816954
yahoo_A3 0.01047120313587895 0.0 0.5606361267275133
yahoo_A3 0.02352940943021934 0.001820426615768418 0.016733600788696963
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.9090908429752097 0.7633332859111138 0.23607213951982528
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.006872851544029947 0.0 0.3836977746752129
yahoo_A3 0.0050125303283272

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

yahoo_A4 0.7999999200000041 0.3488818641102865 0.1457085779482952
yahoo_A4 0.00826446198005601 0.0 0.39761427427483204
yahoo_A4 0.01149425058792477 0.0017956925422823108 0.40737049747861254
yahoo_A4 0.007905137549407151 0.0014890813099579132 0.06573704849288126
yahoo_A4 0.049586770384537265 0.0027380485200336874 0.3229999918604002
yahoo_A4 0.6153845585798854 0.005308040266622258 0.4505009929037074
yahoo_A4 0.4999999375000028 0.03155447553424615 0.15701928947852814
yahoo_A4 0.23529408581315192 0.07027416748823953 0.11077843939904633
yahoo_A4 0.01860464839372666 0.002251051185992595 0.2624750410986413
yahoo_A4 0.009237874364896119 0.0 0.1454183193933434
yahoo_A4 0.2499999437500102 0.05496548502974741 0.17649999555220006
yahoo_A4 0.008695651304347869 0.001638748926333002 0.18326692307106288
yahoo_A4 0.009523807628118195 0.0012019230162490786 0.13894421613268082
yahoo_A4 0.08333331076389416 0.012903504163333884 0.13672654232134548
yahoo_A4 0.0 0.0 -0.0
yahoo_A4 0.0689655101070155 0.0178571

In [4]:
# One row per Yahoo series; average the three metrics within each benchmark (A1-A4).
yahoo_results = pd.DataFrame(data=total_scores)
yahoo_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yahoo_A1,0.726054,0.206912,0.27232
yahoo_A2,0.063139,0.00144,0.261625
yahoo_A3,0.274491,0.110465,0.230882
yahoo_A4,0.226731,0.046313,0.232913


### NASA

In [5]:
# Fresh per-series score accumulator for the NASA section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = dict(dataset=[], f1=[], pr_auc=[], roc_auc=[])

In [6]:
# Isolation Forest baseline on the NASA data: one model per series, fitted on the
# training split and scored on the test split. Series are labelled D1, D2, ...
for loader in [load_nasa]:
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train = x_trains[i]
        X_test = x_tests[i]
        series_name = f'D{i+1}'

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine
        # (same setting as the Yahoo S5 cell). random_state stays fixed at 0.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(series_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(series_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

D1 0.06848287760161073 0.005548290568631348 0.0031908091270212158
D2 0.019487891861075166 0.00395832984383277 0.00018187091979471773


In [7]:
# One row per NASA series (D1, D2, ...); the group-wise mean gives a table indexed by series.
nasa_results = pd.DataFrame(data=total_scores)
nasa_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D1,0.068483,0.005548,0.003191
D2,0.019488,0.003958,0.000182


### SMD

In [8]:
# Fresh per-series score accumulator for the SMD section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = {
    'dataset': [],
    'f1': [],
    'pr_auc': [],
    'roc_auc': [],
}

In [9]:
# Isolation Forest baseline on SMD: one model per series, fitted on the training
# split and scored on the test split. Every row is recorded under the label 'smd'.
for loader in [load_smd]:
    dataset_name = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine
        # (same setting as the Yahoo S5 cell). random_state stays fixed at 0.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

smd 0.3634000984646172 0.35753598146641374 0.19750503815584164
smd 0.359840904503404 0.1999533024838473 0.022424242033625048
smd 0.27199995173448793 0.2181642778703282 0.1376678253200118
smd 0.29657789717475425 0.23185024360060297 0.21727344075424937
smd 0.18666661711803775 0.07105388819824249 0.1252101159464445
smd 0.5950919774490859 0.5525943420100534 0.6777964888973863
smd 0.2224052218271501 0.09772661441517907 0.011974703397400232
smd 0.15885017809548307 0.08296819615736704 0.134176483316161
smd 0.21385657813409287 0.08264733901409321 0.034229110977498554
smd 0.2183340322118948 0.06039447115395553 0.02148440298294967
smd 0.4901960283898551 0.45828959399590985 0.16388139321264217
smd 0.3918574586821599 0.27205601968657334 0.028591302679670676
smd 0.25497798128280486 0.1550873757593434 0.028312563297880212
smd 0.2657342272825132 0.2225511329831861 0.18531074039458817
smd 0.7613940523683088 0.7007891996166609 0.06957516591668128
smd 0.7790261687665729 0.7400354081884956 0.433035631566

In [10]:
# All SMD series share the label 'smd', so this collapses to one row of mean metrics.
smd_results = pd.DataFrame(data=total_scores)
smd_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smd,0.319941,0.246773,0.178762


### ECG

In [11]:
# Fresh per-series score accumulator for the ECG section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = {column: [] for column in ('dataset', 'f1', 'pr_auc', 'roc_auc')}

In [12]:
# Isolation Forest baseline on ECG: one model per series, fitted on the training
# split and scored on the test split. Series are labelled D1, D2, ...
for loader in [load_ecg]:
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train = x_trains[i]
        X_test = x_tests[i]
        series_name = f'D{i+1}'

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine
        # (same setting as the Yahoo S5 cell). random_state stays fixed at 0.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(series_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        # Print the same label that is recorded, so each progress line can be
        # matched to its row in the results table.
        print(series_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

ecg 0.4055727066899003 0.23917122329662105 0.10284296750065411
ecg 0.43065688528158685 0.2773996965673287 0.10794694224824497
ecg 0.2103003855928547 0.06156505270421019 0.023599027364044227
ecg 0.2409638077605697 0.054086108372760594 0.02262284319098508
ecg 0.3151969594500363 0.21862348835085452 0.4237839083156143
ecg 0.09318992038644923 0.010701520909631754 0.00923236885917654
ecg 0.06499994996878845 0.010469930670933646 0.050152653150311685
ecg 0.11123981891514258 0.009052028190697446 0.009958316209143997
ecg 0.17037032053438433 0.02172895565609409 0.019481378003458758


In [13]:
# One row per ECG series (D1, D2, ...); the group-wise mean gives a table indexed by series.
ecg_results = pd.DataFrame(data=total_scores)
ecg_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D1,0.405573,0.239171,0.102843
D2,0.430657,0.2774,0.107947
D3,0.2103,0.061565,0.023599
D4,0.240964,0.054086,0.022623
D5,0.315197,0.218623,0.423784
D6,0.09319,0.010702,0.009232
D7,0.065,0.01047,0.050153
D8,0.11124,0.009052,0.009958
D9,0.17037,0.021729,0.019481


### Power Demand

In [14]:
# Fresh per-series score accumulator for the Power Demand section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = dict(dataset=[], f1=[], pr_auc=[], roc_auc=[])

In [15]:
# Isolation Forest baseline on Power Demand: one model per series, fitted on the
# training split and scored on the test split, recorded under 'power_demand'.
for loader in [load_power_demand]:
    dataset_name = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine
        # (same setting as the Yahoo S5 cell). random_state stays fixed at 0.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/1 [00:00<?, ?it/s]

power_demand 0.18263806967535026 0.03232192777830734 0.05657091036655871


In [16]:
# Collect the Power Demand scores into a frame and show the mean metrics per dataset label.
power_results = pd.DataFrame(data=total_scores)
power_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
power_demand,0.182638,0.032322,0.056571


### 2D Gesture

In [17]:
# Fresh per-series score accumulator for the 2D Gesture section. The loop cell below
# appends to it, so re-run this cell first to avoid piling up duplicate rows.
total_scores = {
    'dataset': [],
    'f1': [],
    'pr_auc': [],
    'roc_auc': [],
}

In [18]:
# Isolation Forest baseline on 2D Gesture: one model per series, fitted on the
# training split and scored on the test split, recorded under 'gesture'.
for loader in [load_gesture]:
    dataset_name = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine
        # (same setting as the Yahoo S5 cell). random_state stays fixed at 0.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function is negative for outliers and positive for inliers;
        # flip the sign so that a larger value means "more anomalous".
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/1 [00:00<?, ?it/s]

gesture 0.5091092651253853 0.3882724357241408 0.3604293797215846


In [19]:
# Collect the 2D Gesture scores into a frame and show the mean metrics per dataset label.
gesture_results = pd.DataFrame(data=total_scores)
gesture_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gesture,0.509109,0.388272,0.360429
