In [1]:
import arrow
import socket
from sqlalchemy.orm import Session
from tqdm import tqdm

from april import Evaluator
from april.anomalydetection import *
from april.database import EventLog
from april.database import Model
from april.database import get_engine
from april.dataset import Dataset
from april.fs import DATE_FORMAT
from april.fs import get_event_log_files

This is the train method that can be called in parallel.

In [2]:
def fit_and_save(dataset_name, ad, ad_kwargs=None, fit_kwargs=None):
    if ad_kwargs is None:
        ad_kwargs = {}
    if fit_kwargs is None:
        fit_kwargs = {}

    # Save start time
    start_time = arrow.now()

    # Dataset
    dataset = Dataset(dataset_name)

    # AD
    ad = ad(**ad_kwargs)

    # Train and save
    ad.fit(dataset, **fit_kwargs)
    file_name = f'{dataset_name}_{ad.abbreviation}_{start_time.format(DATE_FORMAT)}'
    model_file = ad.save(file_name)

    # Save end time
    end_time = arrow.now()

    # Cache result
    #print(model_file.str_path)
    Evaluator(model_file.str_path).cache_result()

    # Calculate training time in seconds
    training_time = (end_time - start_time).total_seconds()

    # Write to database
    engine = get_engine()
    session = Session(engine)

    session.add(Model(creation_date=end_time.datetime,
                      algorithm=ad.name,
                      training_duration=training_time,
                      file_name=model_file.file,
                      training_event_log_id=EventLog.get_id_by_name(dataset_name),
                      training_host=socket.gethostname(),
                      hyperparameters=str(dict(**ad_kwargs, **fit_kwargs))))
    session.commit()
    session.close()

    if isinstance(ad, NNAnomalyDetector):
        from keras.backend import clear_session
        clear_session()

Now, we can run the training for the anomaly detection methods.

In [None]:
datasets = sorted([e.name for e in get_event_log_files() if e.p == 0.3])
ads = [
    #dict(ad=RandomAnomalyDetector),
    #dict(ad=TStidePlus, ad_kwargs=dict(k=2)),
    dict(ad=OneClassSVM),
    #dict(ad=LikelihoodPlusAnomalyDetector),
    #dict(ad=BoehmerLikelihoodAnomalyDetector),
    dict(ad=NaiveAnomalyDetector),
    #dict(ad=NaivePlusAnomalyDetector),
    dict(ad=SamplingAnomalyDetector),
    dict(ad=DAE, fit_kwargs=dict(epochs=50, batch_size=500)),
    dict(ad=BINetv1, fit_kwargs=dict(epochs=20, batch_size=500)),
    #dict(ad=BINetv2, fit_kwargs=dict(epochs=20, batch_size=500)),
    #dict(ad=BINetv3, fit_kwargs=dict(epochs=20, batch_size=500))
]
for ad in ads:
    [fit_and_save(d, **ad) for d in tqdm(datasets, desc=ad['ad'].name)]



OC-SVM: 100%|██████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.66s/it]


Naive: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.81it/s]


Sampling: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.46it/s]
DAE:   0%|                                                                                      | 0/10 [00:00<?, ?it/s]Using TensorFlow backend.
W0201 16:55:56.930886 15908 deprecation_wrapper.py:119] From C:\Users\Jonghyeon\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0201 16:55:56.962802 15908 deprecation_wrapper.py:119] From C:\Users\Jonghyeon\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0201 16:55:56.986757 15908 deprecation_wrapper.py:119] From C:\Users\Jonghyeon\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0201 16:55:56.987735 159

Train on 4500 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50


Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


DAE:  10%|███████▊                                                                      | 1/10 [00:30<04:34, 30.45s/it]

Train on 4500 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50


In [3]:
datasets = sorted([e.name for e in get_event_log_files() if e.p == 0.3])
datasets

['huge-0.3-1',
 'huge-0.3-2',
 'large-0.3-1',
 'large-0.3-2',
 'medium-0.3-1',
 'medium-0.3-2',
 'small-0.3-1',
 'small-0.3-2',
 'wide-0.3-1',
 'wide-0.3-2']

In [7]:
datasets = sorted([e.name for e in get_event_log_files() if e.p == 0.3])
ads = [
    #dict(ad=RandomAnomalyDetector),
    #dict(ad=TStidePlus, ad_kwargs=dict(k=2)),
    #dict(ad=OneClassSVM),
    #dict(ad=LikelihoodPlusAnomalyDetector),
    #dict(ad=BoehmerLikelihoodAnomalyDetector),
    dict(ad=NaiveAnomalyDetector),
    #dict(ad=NaivePlusAnomalyDetector),
    #dict(ad=SamplingAnomalyDetector),
    #dict(ad=DAE, fit_kwargs=dict(epochs=50, batch_size=500)),
    #dict(ad=BINetv1, fit_kwargs=dict(epochs=20, batch_size=500)),
    #dict(ad=BINetv2, fit_kwargs=dict(epochs=20, batch_size=500)),
    #dict(ad=BINetv3, fit_kwargs=dict(epochs=20, batch_size=500))
]
for ad in ads:
    for d in tqdm(datasets, desc=ad['ad'].name):
        dataset=Dataset(d)
        dataset.id
        


Naive:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]

AttributeError: 'Dataset' object has no attribute 'id'