In [1]:
import itertools
from multiprocessing.pool import Pool

import pandas as pd
from sklearn import metrics
from sqlalchemy.orm import Session
from tqdm import tqdm

from april.anomalydetection import BINet
from april.anomalydetection.utils import label_collapse
from april.database import Evaluation
from april.database import Model
from april.database import get_engine
from april.enums import Base
from april.enums import Heuristic
from april.enums import Strategy
from april.evaluator import Evaluator
from april.fs import get_model_files
from april.fs import PLOT_DIR

In [2]:
heuristics = [h for h in Heuristic.keys() if h not in [Heuristic.DEFAULT, Heuristic.MANUAL, Heuristic.RATIO,
                                                       Heuristic.MEDIAN, Heuristic.MEAN]]
params = [(Base.SCORES, Heuristic.DEFAULT, Strategy.SINGLE), *itertools.product([Base.SCORES], heuristics, Strategy.keys())]

In [3]:
def _evaluate(params):
    e, base, heuristic, strategy = params

    session = Session(get_engine())
    model = session.query(Model).filter_by(file_name=e.model_file.name).first()
    session.close()

    # Generate evaluation frames
    y_pred = e.binarizer.binarize(base=base, heuristic=heuristic, strategy=strategy, go_backwards=False)
    y_true = e.binarizer.get_targets()

    evaluations = []
    for axis in [0, 1, 2]:
        for i, attribute_name in enumerate(e.dataset.attribute_keys):
            def get_evaluation(label, precision, recall, f1):
                return Evaluation(model_id=model.id, file_name=model.file_name,
                                  label=label, perspective=perspective, attribute_name=attribute_name,
                                  axis=axis, base=base, heuristic=heuristic, strategy=strategy,
                                  precision=precision, recall=recall, f1=f1)

            perspective = 'Control Flow' if i == 0 else 'Data'
            if i > 0  and not e.ad_.supports_attributes:
                evaluations.append(get_evaluation('Normal', 0.0, 0.0, 0.0))
                evaluations.append(get_evaluation('Anomaly', 0.0, 0.0, 0.0))
            else:
                yp = label_collapse(y_pred[:, :, i:i + 1], axis=axis).compressed()
                yt = label_collapse(y_true[:, :, i:i + 1], axis=axis).compressed()
                p, r, f, _ = metrics.precision_recall_fscore_support(yt, yp, labels=[0, 1])
                evaluations.append(get_evaluation('Normal', p[0], r[0], f[0]))
                evaluations.append(get_evaluation('Anomaly', p[1], r[1], f[1]))

    return evaluations

def evaluate(model_name):
    e = Evaluator(model_name)

    _params = []
    for base, heuristic, strategy in params:
        if e.dataset.num_attributes == 1 and strategy in [Strategy.ATTRIBUTE, Strategy.POSITION_ATTRIBUTE]:
            continue
        if isinstance(e.ad_, BINet) and e.ad_.version == 0:
            continue
        if heuristic is not None and heuristic not in e.ad_.supported_heuristics:
            continue
        if strategy is not None and strategy not in e.ad_.supported_strategies:
            continue
        if base is not None and base not in e.ad_.supported_bases:
            continue
        _params.append([e, base, heuristic, strategy])

    return [_e for p in _params for _e in _evaluate(p)]

In [5]:
models = sorted([m.name for m in get_model_files() ])
print(models)
len(models)





['huge-0.3-1_binetv1_20200201-165817.760446', 'huge-0.3-1_dae_20200201-165551.164305', 'huge-0.3-1_naive_20200201-165545.655035', 'huge-0.3-1_one-class-svm_20200201-165527.425465', 'huge-0.3-1_sampling_20200201-165546.769057', 'huge-0.3-2_binetv1_20200201-165842.812655', 'huge-0.3-2_dae_20200201-165621.611852', 'huge-0.3-2_naive_20200201-165545.752775', 'huge-0.3-2_one-class-svm_20200201-165529.878821', 'huge-0.3-2_sampling_20200201-165547.263735', 'large-0.3-1_binetv1_20200201-165908.448222', 'large-0.3-1_dae_20200201-165645.650875', 'large-0.3-1_naive_20200201-165545.869461', 'large-0.3-1_one-class-svm_20200201-165532.189176', 'large-0.3-1_sampling_20200201-165547.809274', 'large-0.3-2_binetv1_20200201-165935.101205', 'large-0.3-2_dae_20200201-165704.080263', 'large-0.3-2_naive_20200201-165545.979168', 'large-0.3-2_one-class-svm_20200201-165534.597318', 'large-0.3-2_sampling_20200201-165548.323899', 'medium-0.3-1_binetv1_20200201-170001.879912', 'medium-0.3-1_dae_20200201-165723.6461

50

In [6]:
models = sorted([m.name for m in get_model_files() ])

evaluations = []
for i in range(len(models)):
    e= evaluate(models[i])
    evaluations.append(e)

# Write to database
session = Session(get_engine())
for e in evaluations:
    session.bulk_save_objects(e)
    session.commit()
session.close()









In [None]:
#Original : error
models = sorted([m.name for m in get_model_files() ])

evaluations = []
with Pool() as p:
    for e in tqdm(p.imap(evaluate, models), total=len(models), desc='Evaluate'):
        evaluations.append(e)

# Write to database
session = Session(get_engine())
for e in evaluations:
    session.bulk_save_objects(e)
    session.commit()
session.close()

Evaluate:   0%|                                                                                  | 0/3 [00:00<?, ?it/s]

## Pickle the results

In [7]:
out_dir = PLOT_DIR / 'isj-2019'
eval_file = out_dir / 'eval.pkl'

session = Session(get_engine())
evaluations = session.query(Evaluation).all()
rows = []
for ev in tqdm(evaluations):
    m = ev.model
    el = ev.model.training_event_log
    rows.append([m.file_name, m.creation_date, m.hyperparameters, m.training_duration, m.training_host, m.algorithm, 
                 m.file_name, m.file_name, m.creation_date, m.hyperparameters,
                 ev.axis, ev.base, ev.heuristic, ev.strategy, ev.label, ev.attribute_name, ev.perspective, ev.precision, ev.recall, ev.f1])
session.close()
columns = ['file_name', 'date', 'hyperparameters', 'training_duration', 'training_host', 'ad',
           'dataset_name', 'process_model', 'noise', 'dataset_id',
           'axis', 'base', 'heuristic', 'strategy', 'label', 'attribute_name', 'perspective', 'precision', 'recall', 'f1']

evaluation = pd.DataFrame(rows, columns=columns)
evaluation.to_pickle(eval_file)

100%|███████████████████████████████████████████████████████████████████████████| 5196/5196 [00:00<00:00, 27133.71it/s]


In [34]:
out_dir = PLOT_DIR / 'isj-2019'
eval_file = out_dir / 'eval.pkl'

session = Session(get_engine())
evaluations = session.query(Evaluation).all()
rows = []
for ev in tqdm(evaluations):
    m = ev.model
    el = ev.model.training_event_log
    print(el.name)
session.close()


  0%|                                                                                          | 0/366 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'name'