In [15]:
import math
import funcy
import json
import struct
import numpy as np

from lib.Autoencoder import Autoencoder
from lib.DatasetLoader import DatasetLoader
from lib.helpers.TimeLogger import TimeLogger

In [68]:
# --- Input locations ---
dataset_file = 'dataset/dataset.csv'
files_map_file = 'dataset/files_map.json'

# --- Output locations ---
output_file = 'anomalies_results/differences.json'
anomalies_output_file = 'anomalies_results/anomalies.json'

# --- Pipeline settings ---
# Forwarded to DatasetLoader.load() when the dataset is read.
split_percent = 0.9
# Encoding size expressed as a fraction of the number of features.
encoding_dim_percent = 0.5
# Falsy value: passed as `full_differences` to calc_differences() and, being
# falsy, makes the notebook sort the differences itself.
use_dbscan = None

In [60]:
def ascii_read(differences_file):
    """Load saved differences from a JSON file and split them into two lists.

    The file must contain a JSON sequence whose entries each hold an index at
    position 0 and a difference value at position 1.

    Args:
        differences_file: Path of the JSON file to read.

    Returns:
        A ``(difference_indexes, difference_values)`` tuple of parallel lists,
        in the order the entries appear in the file.
    """
    read_timer = TimeLogger(task_name='Differences read')

    with open(differences_file) as source:
        entries = json.load(source)
        # Only the first two fields of every entry are used.
        pairs = [(entry[0], entry[1]) for entry in entries]

    difference_indexes = [pair[0] for pair in pairs]
    difference_values = [pair[1] for pair in pairs]

    read_timer.finish()

    return difference_indexes, difference_values

def ascii_write(output_file, differences):
    """Serialize ``differences`` to JSON and store the text in ``output_file``.

    The payload is encoded *before* the file is opened. Opening with mode
    ``'w'`` truncates the target immediately, so encoding first guarantees that
    a payload ``json`` cannot handle raises without destroying a previously
    written results file.

    Args:
        output_file: Path of the file to (over)write.
        differences: JSON-serializable object to store.

    Raises:
        TypeError: If ``differences`` contains a value ``json`` cannot encode.
        ValueError: If ``differences`` contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    serialized = json.dumps(differences)
    with open(output_file, 'w') as f:
        f.write(serialized)

In [61]:
def deviation_anomaly_selection(differences, sigma_deviation_bound=3):
    """Select the entries whose value lies outside ``mean +/- k * std``.

    Args:
        differences: ``(difference_indexes, difference_values)`` pair of
            parallel sequences.
        sigma_deviation_bound: Number of standard deviations (``k``) a value
            may stray from the mean before it is reported as an anomaly.

    Returns:
        List of ``(index, value)`` tuples for every value strictly below the
        lower bound or strictly above the upper bound, in input order.
    """
    selection_timer = TimeLogger('%s-sigma anomaly selection' % sigma_deviation_bound)

    difference_indexes, difference_values = differences

    center = np.mean(difference_values)
    tolerance = sigma_deviation_bound * np.std(difference_values)
    lower_limit = center - tolerance
    upper_limit = center + tolerance

    # Strict comparisons: values sitting exactly on a bound are not anomalies.
    anomalies = [
        (difference_indexes[position], value)
        for position, value in enumerate(difference_values)
        if value < lower_limit or value > upper_limit
    ]

    selection_timer.finish()

    return anomalies

In [34]:
# Load the dataset from `dataset_file`, handing `split_percent` to the loader.
# NOTE(review): the recorded fit output reports "Train on 1490 samples, validate
# on 14901 samples", which does not look like a 0.9 training share. Confirm how
# DatasetLoader.load interprets `split_percent`, or whether the config cell was
# edited after this cell last ran (its execution count is higher).
data = DatasetLoader(dataset_file).load(split_percent)

In [33]:
# `data` unpacks into three items; only the last one (the feature count) is
# used here.
# NOTE(review): binding `_` in a notebook shadows IPython's last-output shortcut.
(_, _, features_number) = data
print(type(data))
print(features_number)
# Encoding size = feature count scaled by `encoding_dim_percent`, rounded up.
encoding_dim = math.ceil(features_number * encoding_dim_percent)

<class 'tuple'>
2568


In [35]:
# Build the autoencoder from the feature count, the encoding size and the
# loaded data, print its architecture, then train it.
# The TimeLogger brackets the whole cell, so the reported time also covers
# model construction and the summary printout, not only fit().
time_logger = TimeLogger(task_name='Autoencoder fit')
autoencoder = Autoencoder(features_number, encoding_dim, data)
autoencoder.print_model_summary()
autoencoder.fit()
time_logger.finish()

Start Autoencoder fit
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2568)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1284)              3298596   
_________________________________________________________________
dense_2 (Dense)              (None, 2568)              3299880   
Total params: 6,598,476
Trainable params: 6,598,476
Non-trainable params: 0
_________________________________________________________________
Train on 1490 samples, validate on 14901 samples
Epoch 1/5
 - 7s - loss: 0.7094 - val_loss: 0.7010
Epoch 2/5
 - 5s - loss: 0.6939 - val_loss: 0.6964
Epoch 3/5
 - 5s - loss: 0.6834 - val_loss: 0.6922
Epoch 4/5
 - 6s - loss: 0.6744 - val_loss: 0.6887
Epoch 5/5
 - 6s - loss: 0.6667 - val_loss: 0.6852
Autoencoder fit finished. Time: 0:00:29.061000


In [36]:
# Time the prediction pass. The return value of predict() is not captured.
# NOTE(review): presumably the Autoencoder keeps the predictions internally for
# the later calc_differences() call — confirm in lib.Autoencoder.
time_logger = TimeLogger(task_name='Autoencoder predict')
autoencoder.predict()
time_logger.finish()

Start Autoencoder predict
Autoencoder predict finished. Time: 0:00:07.551000


In [64]:
# Time the difference computation; `use_dbscan` is forwarded unchanged as the
# `full_differences` argument.
time_logger = TimeLogger(task_name='Calculate differences')
differences = autoencoder.calc_differences(full_differences=use_dbscan)
time_logger.finish()

# When `use_dbscan` is falsy, pair each difference with its position and order
# the pairs by difference, largest first. This sort runs after the logger has
# finished, so it is not included in the reported time.
if not use_dbscan:
    differences = sorted(enumerate(differences), key=lambda tup: tup[1], reverse=True)

Start Calculate differences
Calculate differences finished. Time: 0:00:00.854000


In [65]:
# Persist the differences as JSON in `output_file`.
# NOTE(review): a later cell rebinds `differences` to the (indexes, values)
# tuple returned by ascii_read(); re-running this cell after that one would
# write the tuple instead of the (index, value) pairs. Run cells top to bottom.
ascii_write(output_file, differences)

In [70]:
# Read the persisted differences back and keep the outliers selected with the
# default sigma bound.
differences = ascii_read(output_file)
anomalies = deviation_anomaly_selection(differences)

anomalies_write_time_logger = TimeLogger('Anomaly list write')

# Each anomaly index is used to look up its entry in the files map.
with open(files_map_file) as files_map_source:
    files_map = json.load(files_map_source)

anomaly_files = [
    (files_map[anomaly_index], anomaly_value)
    for anomaly_index, anomaly_value in anomalies
]

# Same JSON dump the differences use, via the shared helper.
ascii_write(anomalies_output_file, anomaly_files)

anomalies_write_time_logger.finish()

print(len(anomaly_files))

Start Differences read
Differences read finished. Time: 0:00:00.080000
Start 3-sigma anomaly selection
3-sigma anomaly selection finished. Time: 0:00:00.021000
Start Anomaly list write
Anomaly list write finished. Time: 0:00:00.098000
64
