In [1]:
from common.data_frame_columns import TIMESTAMP
from common.data_frame_columns import PM10, PM2_5, PM1
from common.date_time_helper import convert_to_datetime
from common.endpoints_urls import endpoints_config
from data_management.data_crawler import DataManager
from data_management.labeled_data_generator import LabeledDataGenerator, DataLabel

In [2]:
# Date ranges given as [start, end] strings in 'DD.MM.YYYY HH:MM' form.
# NOTE(review): the first range ends on 28.02.2024 while the test range starts on
# 01.02.2024, so the two ranges overlap - confirm this is intended before the
# disabled training sets below are re-enabled.
date_strings = ['01.01.2021 00:00', '28.02.2024 23:59']
test_dates_string = ['01.02.2024 00:00', '25.04.2024 23:59']

# Parsed counterparts of the string ranges above, in the same [start, end] order.
training_dates = [convert_to_datetime(text) for text in date_strings]
test_dates = [convert_to_datetime(text) for text in test_dates_string]

# Data for every configured endpoint; update=False is passed to the loader as-is.
datas = DataManager(True).get_all_endpoints_data(
    endpoints_config,
    update=False,
)

# Column used for labeled-data generation.
column = PM10
L = LabeledDataGenerator(column)

# Labeled test samples built from the first endpoint only, over the test date range.
test_data = L.generate_labeled_data(datas[:1], *test_dates, 50)

# Labeled sets over the training date range (currently disabled):
# single_data = L.generate_labeled_data(datas[:1], training_dates[0], training_dates[1], 40)
# multi_data = L.generate_labeled_data(datas, training_dates[0], training_dates[1], 40)

Loading station data: Gronie  https://datahub.ki.agh.edu.pl/api/endpoints/70/data/
    # Minimal data: 2022-07-13 23:38:02+00:00
    # Maximal data: 2024-04-25 12:36:38+00:00
Loading station data: Urząd Gminy  https://datahub.ki.agh.edu.pl/api/endpoints/71/data/


  df = pd.read_csv(filename)


    # Minimal data: 2021-10-07 18:51:17+00:00
    # Maximal data: 2024-04-25 12:43:51+00:00
Loading station data: Młynne  https://datahub.ki.agh.edu.pl/api/endpoints/72/data/
    # Minimal data: 2021-10-07 19:17:59+00:00
    # Maximal data: 2024-04-25 12:33:14+00:00
Loading station data: Sucharskiego  https://datahub.ki.agh.edu.pl/api/endpoints/73/data/
    # Minimal data: 2021-10-07 19:41:43+00:00
    # Maximal data: 2024-04-25 12:36:38+00:00
Loading station data: Twardowskiego  https://datahub.ki.agh.edu.pl/api/endpoints/74/data/
    # Minimal data: 2021-10-07 20:59:56+00:00
    # Maximal data: 2024-04-25 12:38:24+00:00
Loading station data: Konopnickiej  https://datahub.ki.agh.edu.pl/api/endpoints/75/data/
    # Minimal data: 2021-10-07 21:07:07+00:00
    # Maximal data: 2024-04-25 12:38:42+00:00
Finished loading data 
 
Daily datas: 85
Generated anomalies: 39
    NOISE: 8
    RANDOM_ZEROS: 11
    NORMAL: 46
    SCALED: 7
    EXTINCTION: 7
    ZEROS_IN_RANGE: 6


In [3]:
from detectors.z_score_detector import ZScoreDetector
import pandas as pd

column = PM10
thresholds = [1.0, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Not used by the evaluation below (each call builds its own detector, as before);
# kept only so that any other cell referencing `zscore` keeps working.
zscore = ZScoreDetector()


def report_zscore_accuracy(title, detect, labeled_samples, threshold_values, min_outliers=5):
    """Print, for every threshold, how well a detector separates anomalous from normal samples.

    Args:
        title: Header line printed before the per-threshold rows.
        detect: Callable ``(dataframe, start_time, end_time, threshold)`` returning a
            sized collection of outliers for one sample.
        labeled_samples: Iterable of ``(dataframe, label)`` pairs. A sample counts as
            an anomaly when ``label.value > 0`` and as normal when ``label.value == 0``.
            Empty dataframes are skipped.
        threshold_values: Detector thresholds to evaluate, one output row each.
        min_outliers: A sample is predicted anomalous when the detector returns more
            than this many outliers, and predicted normal otherwise.

    Notes:
        The previous version treated ``len(outliers) > 5`` as "anomaly" and
        ``len(outliers) < 5`` as "normal", so a sample with exactly 5 outliers was
        scored wrong whatever its label. The prediction is now a single boolean, so
        exactly ``min_outliers`` outliers counts as a "normal" prediction.
        Ratios with an empty denominator are printed as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """

    def ratio(numerator, denominator):
        # Avoid ZeroDivisionError when a class has no samples at all.
        return numerator / denominator if denominator else float("nan")

    print(title)
    for threshold in threshold_values:
        correct_anomaly = 0
        correct_normal = 0
        anomaly = 0
        normal = 0
        for dataframe, label in labeled_samples:
            if dataframe.empty:
                continue
            # Detection runs over the full time span covered by the sample.
            start_time = dataframe[TIMESTAMP].min()
            end_time = dataframe[TIMESTAMP].max()
            outliers = detect(dataframe, start_time, end_time, threshold)
            predicted_anomaly = len(outliers) > min_outliers
            if label.value > 0:
                anomaly += 1
                if predicted_anomaly:
                    correct_anomaly += 1
            elif label.value == 0:
                normal += 1
                if not predicted_anomaly:
                    correct_normal += 1

        anomaly_ratio = round(ratio(correct_anomaly, anomaly), 2)
        normal_percent = round(ratio(correct_normal, normal) * 100, 2)
        overall_percent = round(ratio(correct_anomaly + correct_normal, anomaly + normal) * 100, 2)
        print(
            f'    [Threshold: {threshold}] Anomaly: {correct_anomaly}/{anomaly} ({anomaly_ratio}) | Normal {normal_percent} | {overall_percent}%')


# (title, detector call) pairs; a fresh ZScoreDetector is created for every call,
# matching the original cell.
zscore_runs = [
    ("MAD sensor level:",
     lambda frame, start, end, thr: ZScoreDetector().detect_by_mad(frame, column, start, end, thr)),
    ("AVG sensor level:",
     lambda frame, start, end, thr: ZScoreDetector().detect_by_avg(frame, column, start, end, thr)),
    ("MAD network level:",
     lambda frame, start, end, thr: ZScoreDetector().detect_by_mad_network_level(datas, frame, column, start, end,
                                                                                 thr)),
    ("AVG network level:",
     lambda frame, start, end, thr: ZScoreDetector().detect_by_avg_network_level(datas, frame, column, start, end,
                                                                                 thr)),
]

for run_title, run_detect in zscore_runs:
    report_zscore_accuracy(run_title, run_detect, test_data, thresholds)




MAD sensor level:
    [Threshold: 1.0] Anomaly: 39/39 (1.0) | Normal 0.0 | 45.88%
    [Threshold: 1.5] Anomaly: 31/39 (0.79) | Normal 4.35 | 38.82%
    [Threshold: 2] Anomaly: 27/39 (0.69) | Normal 13.04 | 38.82%
    [Threshold: 3] Anomaly: 19/39 (0.49) | Normal 30.43 | 38.82%
    [Threshold: 4] Anomaly: 15/39 (0.38) | Normal 52.17 | 45.88%
    [Threshold: 5] Anomaly: 13/39 (0.33) | Normal 71.74 | 54.12%
    [Threshold: 6] Anomaly: 11/39 (0.28) | Normal 78.26 | 55.29%
    [Threshold: 7] Anomaly: 9/39 (0.23) | Normal 84.78 | 56.47%
    [Threshold: 8] Anomaly: 8/39 (0.21) | Normal 89.13 | 57.65%
    [Threshold: 9] Anomaly: 8/39 (0.21) | Normal 93.48 | 60.0%
    [Threshold: 10] Anomaly: 8/39 (0.21) | Normal 93.48 | 60.0%
AVG sensor level:
    [Threshold: 1.0] Anomaly: 39/39 (1.0) | Normal 0.0 | 45.88%
    [Threshold: 1.5] Anomaly: 37/39 (0.95) | Normal 6.52 | 47.06%
    [Threshold: 2] Anomaly: 16/39 (0.41) | Normal 39.13 | 40.0%
    [Threshold: 3] Anomaly: 6/39 (0.15) | Normal 100.0 | 61.

In [4]:
from detectors.pseudo_periodic import PseudoPeriodicDetector
import pandas as pd

column = PM10
sensor_data = datas[0]
thresholds = [1.0, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10]


def report_periodic_accuracy(title, detect, labeled_samples, reference_frame, threshold_values, min_outliers=5):
    """Print, for every threshold, how well a pseudo-periodic detector classifies samples.

    For each non-empty sample a copy of ``reference_frame`` is made, the sample's
    values are written into it with ``DataFrame.update``, and the detector is run on
    that copy over the sample's own time span.

    Args:
        title: Header line printed before the per-threshold rows.
        detect: Callable ``(frame, start_time, end_time, threshold)`` returning a
            sized collection of outliers.
        labeled_samples: Iterable of ``(dataframe, label)`` pairs. A sample counts as
            an anomaly when ``label.value > 0`` and as normal when ``label.value == 0``.
            Empty dataframes are skipped.
        reference_frame: Full sensor dataframe that each sample is patched into; it
            is copied per sample and never modified itself.
        threshold_values: Detector thresholds to evaluate, one output row each.
        min_outliers: A sample is predicted anomalous when the detector returns more
            than this many outliers, and predicted normal otherwise.

    Notes:
        The previous version treated ``len(outliers) > 5`` as "anomaly" and
        ``len(outliers) < 5`` as "normal", so a sample with exactly 5 outliers was
        scored wrong whatever its label. The prediction is now a single boolean, so
        exactly ``min_outliers`` outliers counts as a "normal" prediction.
        Ratios with an empty denominator are printed as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """

    def ratio(numerator, denominator):
        # Avoid ZeroDivisionError when a class (or the whole set) has no samples.
        return numerator / denominator if denominator else float("nan")

    print(title)
    for threshold in threshold_values:
        data_length = 0
        correct_anomaly = 0
        correct_normal = 0
        anomaly = 0
        normal = 0
        for dataframe, label in labeled_samples:
            if dataframe.empty:
                continue
            data_length += 1
            start_time = dataframe[TIMESTAMP].min()
            end_time = dataframe[TIMESTAMP].max()
            # Rebuilt for every evaluation (as in the original cell) so each detector
            # call gets its own copy of the reference data.
            destroyed = reference_frame.copy()
            destroyed.name = reference_frame.name
            destroyed.update(dataframe)
            outliers = detect(destroyed, start_time, end_time, threshold)
            predicted_anomaly = len(outliers) > min_outliers
            if label.value > 0:
                anomaly += 1
                if predicted_anomaly:
                    correct_anomaly += 1
            elif label.value == 0:
                normal += 1
                if not predicted_anomaly:
                    correct_normal += 1

        anomaly_ratio = round(ratio(correct_anomaly, anomaly), 2)
        normal_percent = round(ratio(correct_normal, normal) * 100, 2)
        # Overall accuracy is taken over all evaluated (non-empty) samples.
        overall_percent = round(ratio(correct_anomaly + correct_normal, data_length) * 100, 2)
        print(
            f'    [Threshold: {threshold}] Anomaly: {correct_anomaly}/{anomaly} ({anomaly_ratio}) | Normal {normal_percent} | {overall_percent}%')


# (title, detector call) pairs; a fresh PseudoPeriodicDetector is created for every
# call, matching the original cell.
periodic_runs = [
    ("PERIODIC_MAD sensor level:",
     lambda frame, start, end, thr: PseudoPeriodicDetector().detect_by_periodic_mad(
         frame, column, start, end, threshold=thr)),
    ("PERIODIC_AVG sensor level:",
     lambda frame, start, end, thr: PseudoPeriodicDetector().detect_by_periodic_avg(
         frame, column, start, end, threshold=thr)),
    ("PERIODIC_MAD network level:",
     lambda frame, start, end, thr: PseudoPeriodicDetector().detect_by_periodic_mad_network_level(
         datas, frame, column, start, end, threshold=thr)),
    ("PERIODIC_AVG network level:",
     lambda frame, start, end, thr: PseudoPeriodicDetector().detect_by_periodic_avg_network_level(
         datas, frame, column, start, end, threshold=thr)),
]

for run_title, run_detect in periodic_runs:
    report_periodic_accuracy(run_title, run_detect, test_data, sensor_data, thresholds)




PERIODIC_MAD sensor level:
    [Threshold: 1.0] Anomaly: 39/39 (1.0) | Normal 0.0 | 45.88%
    [Threshold: 1.5] Anomaly: 33/39 (0.85) | Normal 13.04 | 45.88%
    [Threshold: 2] Anomaly: 25/39 (0.64) | Normal 26.09 | 43.53%
    [Threshold: 3] Anomaly: 19/39 (0.49) | Normal 60.87 | 55.29%
    [Threshold: 4] Anomaly: 16/39 (0.41) | Normal 78.26 | 61.18%
    [Threshold: 5] Anomaly: 15/39 (0.38) | Normal 89.13 | 65.88%
    [Threshold: 6] Anomaly: 14/39 (0.36) | Normal 93.48 | 67.06%
    [Threshold: 7] Anomaly: 13/39 (0.33) | Normal 93.48 | 65.88%
    [Threshold: 8] Anomaly: 13/39 (0.33) | Normal 97.83 | 68.24%
    [Threshold: 9] Anomaly: 13/39 (0.33) | Normal 100.0 | 69.41%
    [Threshold: 10] Anomaly: 12/39 (0.31) | Normal 100.0 | 68.24%
PERIODIC_AVG sensor level:
    [Threshold: 1.0] Anomaly: 39/39 (1.0) | Normal 0.0 | 45.88%
    [Threshold: 1.5] Anomaly: 30/39 (0.77) | Normal 23.91 | 48.24%
    [Threshold: 2] Anomaly: 20/39 (0.51) | Normal 63.04 | 57.65%
    [Threshold: 3] Anomaly: 9/39 