In [1]:
import random
import numpy as np
import pandas as pd


from evaluator import evaluate

from data_loader import load_kdd_cup_urc, load_yahoo_A1, load_yahoo_A2, load_yahoo_A3, load_yahoo_A4, load_power_demand # Univariate Datasets
from data_loader import load_nasa, load_ecg, load_gesture, load_smd # Multivariate Datasets

from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.ensemble import IsolationForest as ISF

# Reproducibility: seed every global RNG used by this notebook from a single
# constant so the seed only has to be changed in one place.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
# If a TensorFlow model is ever added to this notebook, seed it here as well
# with tf.random.set_seed(SEED).

### Yahoo S5

In [2]:
# Accumulator for the Yahoo S5 runs: one independent list per result column.
total_scores = {column: [] for column in ('dataset', 'f1', 'pr_auc', 'roc_auc')}

In [3]:
# Fit one Isolation Forest per Yahoo S5 series and record its detection metrics.
#
# The accumulator is (re)created here so the cell is idempotent: re-running it,
# e.g. after an interrupted run, must not leave stale or duplicated rows behind
# that would skew the per-dataset means computed from `total_scores`.
total_scores = {'dataset': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}

for loader in [load_yahoo_A1, load_yahoo_A2, load_yahoo_A3, load_yahoo_A4]:
    # 'load_yahoo_A1' -> 'yahoo_A1'; used as the grouping key in the results table.
    dataset_name = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains)), desc=dataset_name):
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function() is lower (negative) for more abnormal samples, so it is
        # negated to obtain a continuous anomaly score where higher means more anomalous.
        # (Despite the name, X_test_rec holds anomaly scores, not a reconstruction.)
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        # NOTE(review): is_reconstructed=False presumably tells evaluate() that the second
        # argument is already an anomaly score -- confirm against the evaluator module.
        # NOTE(review): previously recorded roc_auc values are often far below 0.5 even when
        # F1 is close to 1; verify how evaluate() computes roc_auc and which score
        # orientation it expects.
        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        # Best F1 over the values returned by evaluate() (presumably one per threshold).
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

yahoo_A1 0.14814813333333407 0.0 0.4599056372736083
yahoo_A1 0.9629629058984938 0.0 0.1483253576752258
yahoo_A1 0.9729729177501851 0.056783529418183425 0.8170669346939724
yahoo_A1 0.9999999000000052 0.0 0.2750582612412459
yahoo_A1 0.9090908429752097 0.5138729188363166 0.32921568071011925
yahoo_A1 0.0 0.0 -0.0
yahoo_A1 0.999999930000003 0.5999999693333345 0.30751173086678135
yahoo_A1 0.5999999460000028 0.0 0.31264366766759183
yahoo_A1 0.9082125099302228 0.8063588440897093 0.13508257691877498
yahoo_A1 0.0 0.0 -0.0
yahoo_A1 0.9107142349170947 0.9038292054717418 0.11451151827561118
yahoo_A1 0.7692307168967815 0.3089846907177446 0.6867469834941573
yahoo_A1 0.3636363272727291 0.12243248771369242 0.10444977232892078
yahoo_A1 0.7999999413333363 0.0 0.3285714230170069
yahoo_A1 0.9999999484127009 0.2380952372856116 0.027548209315074828
yahoo_A1 0.9999999250000032 0.7499999447916703 0.12559241389203976
yahoo_A1 0.999999943333336 0.06666666576190483 0.18203309567012613
yahoo_A1 0.999999947674421 0

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0.0 0.0 -0.0
yahoo_A2 0

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.02597402289222497 0.0 0.20749664977984228
yahoo_A3 0.7999999200000041 0.04254385801365809 0.23020624643885906
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.8888888197530895 0.5999999580000026 0.20040079755502999
yahoo_A3 0.01851851666666676 0.004854243294746385 0.026892428928905948
yahoo_A3 0.019646363084904188 0.002721554278964139 0.0803212837857454
yahoo_A3 0.03389829531744 0.004274133973255668 0.33133731423858903
yahoo_A3 0.08695649829868218 0.0 0.28276778825582405
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.03208555625840105 0.0024257496622937607 0.3983967855393996
yahoo_A3 0.11764704067666523 0.014284544228795172 0.22985971479561934
yahoo_A3 0.030456847813652193 0.0 0.3703406738816954
yahoo_A3 0.01047120313587895 0.0 0.5606361267275133
yahoo_A3 0.02352940943021934 0.001820426615768418 0.016733600788696963
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.9090908429752097 0.7633332859111138 0.23607213951982528
yahoo_A3 0.0 0.0 -0.0
yahoo_A3 0.006872851544029947 0.0 0.3836977746752129
yahoo

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

yahoo_A4 0.7999999200000041 0.3488818641102865 0.1457085779482952
yahoo_A4 0.0 0.0 -0.0
yahoo_A4 0.00826446198005601 0.0 0.39761427427483204
yahoo_A4 0.01149425058792477 0.0017956925422823108 0.40737049747861254
yahoo_A4 0.007905137549407151 0.0014890813099579132 0.06573704849288126
yahoo_A4 0.049586770384537265 0.0027380485200336874 0.3229999918604002
yahoo_A4 0.6153845585798854 0.005308040266622258 0.4505009929037074
yahoo_A4 0.4999999375000028 0.03155447553424615 0.15701928947852814
yahoo_A4 0.0 0.0 -0.0
yahoo_A4 0.23529408581315192 0.07027416748823953 0.11077843939904633
yahoo_A4 0.0 0.0 -0.0
yahoo_A4 0.01860464839372666 0.002251051185992595 0.2624750410986413
yahoo_A4 0.009237874364896119 0.0 0.1454183193933434
yahoo_A4 0.2499999437500102 0.05496548502974741 0.17649999555220006
yahoo_A4 0.008695651304347869 0.001638748926333002 0.18326692307106288
yahoo_A4 0.009523807628118195 0.0012019230162490786 0.13894421613268082
yahoo_A4 0.0 0.0 -0.0
yahoo_A4 0.08333331076389416 0.0129035041

In [4]:
# One row per evaluated Yahoo S5 series; the dict keys become the columns.
yahoo_results = pd.DataFrame.from_dict(total_scores)

In [5]:
# Average each metric over all series belonging to the same Yahoo S5 subset.
yahoo_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yahoo_A1,0.606851,0.172942,0.227611
yahoo_A2,0.0,0.0,0.0
yahoo_A3,0.252532,0.101628,0.212411
yahoo_A4,0.185919,0.037976,0.190988


### NASA

In [6]:
# Fresh accumulator for the NASA runs: one independent list per result column.
total_scores = {column: [] for column in ('dataset', 'f1', 'pr_auc', 'roc_auc')}

In [7]:
# Fit one Isolation Forest per NASA entry and record its detection metrics.
#
# The accumulator is (re)created here so the cell is idempotent: re-running it,
# e.g. after an interrupted run, must not leave stale or duplicated rows behind
# in `total_scores`.
total_scores = {'dataset': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}

for loader in [load_nasa]:
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains))):
        # Entries are labelled D1, D2, ... in the order the loader returns them.
        dataset_name = f'D{i+1}'
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function() returns negative scores for outliers and positive scores for
        # inliers (the lower, the more abnormal). Negating it yields an anomaly score where
        # higher means more anomalous.
        # (Despite the name, X_test_rec holds anomaly scores, not a reconstruction.)
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        # NOTE(review): is_reconstructed=False presumably tells evaluate() that the second
        # argument is already an anomaly score -- confirm against the evaluator module.
        # NOTE(review): previously recorded roc_auc values for this section are close to 0
        # (about 0.003 and 0.0002); verify how evaluate() computes roc_auc and which score
        # orientation it expects.
        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        # Best F1 over the values returned by evaluate() (presumably one per threshold).
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

D1 0.06848287760161073 0.005548290568631348 0.0031908091270212158
D2 0.019487891861075166 0.00395832984383277 0.00018187091979471773


In [8]:
# One row per evaluated NASA entry; the dict keys become the columns.
nasa_results = pd.DataFrame.from_dict(total_scores)

In [9]:
# Per-dataset means. Each NASA entry was stored under its own label (D1, D2, ...),
# so every group contains one row and this simply indexes the metrics by dataset.
nasa_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D1,0.068483,0.005548,0.003191
D2,0.019488,0.003958,0.000182


### SMD

In [10]:
# Fresh accumulator for the SMD runs: one independent list per result column.
total_scores = {column: [] for column in ('dataset', 'f1', 'pr_auc', 'roc_auc')}

In [11]:
# Fit one Isolation Forest per SMD entry and record its detection metrics.
#
# The accumulator is (re)created here so the cell is idempotent: re-running it,
# e.g. after an interrupted run, must not leave stale or duplicated rows behind
# that would skew the mean computed from `total_scores`.
total_scores = {'dataset': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}

for loader in [load_smd]:
    # 'load_smd' -> 'smd'; used as the grouping key in the results table.
    dataset_name = loader.__name__.replace('load_', '')
    datasets = loader()
    x_trains, x_tests, y_tests = datasets['x_train'], datasets['x_test'], datasets['y_test']

    for i in tqdm(range(len(x_trains)), desc=dataset_name):
        X_train = x_trains[i]
        X_test = x_tests[i]

        # n_jobs=-1 uses every available core instead of assuming a 32-core machine.
        clf = ISF(random_state=0, n_jobs=-1).fit(X_train)
        # decision_function() is lower (negative) for more abnormal samples, so it is
        # negated to obtain a continuous anomaly score where higher means more anomalous.
        # (Despite the name, X_test_rec holds anomaly scores, not a reconstruction.)
        X_test_rec = -clf.decision_function(X_test)
        X_test_rec = X_test_rec.reshape(y_tests[i].shape)

        # NOTE(review): is_reconstructed=False presumably tells evaluate() that the second
        # argument is already an anomaly score -- confirm against the evaluator module.
        # NOTE(review): several previously recorded roc_auc values are far below 0.5 while
        # pr_auc is much higher; verify how evaluate() computes roc_auc.
        scores = evaluate(X_test, X_test_rec, y_tests[i], is_reconstructed=False)
        # Best F1 over the values returned by evaluate() (presumably one per threshold).
        best_f1 = np.max(scores['f1'])

        total_scores['dataset'].append(dataset_name)
        total_scores['f1'].append(best_f1)
        total_scores['pr_auc'].append(scores['pr_auc'])
        total_scores['roc_auc'].append(scores['roc_auc'])
        print(dataset_name, best_f1, scores['pr_auc'], scores['roc_auc'])

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

smd 0.3634000984646172 0.35753598146641374 0.19750503815584164
smd 0.359840904503404 0.1999533024838473 0.022424242033625048
smd 0.27199995173448793 0.2181642778703282 0.1376678253200118
smd 0.29657789717475425 0.23185024360060297 0.21727344075424937
smd 0.18666661711803775 0.07105388819824249 0.1252101159464445
smd 0.5950919774490859 0.5525943420100534 0.6777964888973863
smd 0.2224052218271501 0.09772661441517907 0.011974703397400232
smd 0.15885017809548307 0.08296819615736704 0.134176483316161
smd 0.21385657813409287 0.08264733901409321 0.034229110977498554
smd 0.2183340322118948 0.06039447115395553 0.02148440298294967
smd 0.4901960283898551 0.45828959399590985 0.16388139321264217
smd 0.3918574586821599 0.27205601968657334 0.028591302679670676
smd 0.25497798128280486 0.1550873757593434 0.028312563297880212
smd 0.2657342272825132 0.2225511329831861 0.18531074039458817
smd 0.7613940523683088 0.7007891996166609 0.06957516591668128
smd 0.7790261687665729 0.7400354081884956 0.433035631566

In [12]:
# One row per evaluated SMD entry; the dict keys become the columns.
smd_results = pd.DataFrame.from_dict(total_scores)

In [13]:
# Average each metric over all SMD entries (they all share the 'smd' label).
smd_results.groupby(by='dataset').mean()

Unnamed: 0_level_0,f1,pr_auc,roc_auc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smd,0.319941,0.246773,0.178762
