In [3]:
import sys

from pathlib import Path
from datetime import timedelta

import dateutil
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import trange
from TaPR_pkg import etapr

ModuleNotFoundError: No module named 'TaPR_pkg'

## 데이터 전처리
학습 데이터와 테스트 데이터는 CSV로 제공
HAI 2.0은 단일파일이 아니라 여러 파일로 제공되기 때문에 디렉토리 안에 있는 모든 CSV를 읽는다.

In [5]:
# Every CSV directly under data/training/, ordered by path.
TRAIN_DATASET = sorted(Path("data/training/").glob("*.csv"))
TRAIN_DATASET

[WindowsPath('data/training/train1.csv'),
 WindowsPath('data/training/train2.csv'),
 WindowsPath('data/training/train3.csv')]

In [6]:
# Every CSV directly under data/testing/, ordered by path.
TEST_DATASET = sorted(Path("data/testing/").glob("*.csv"))
TEST_DATASET

[WindowsPath('data/testing/test1.csv'),
 WindowsPath('data/testing/test2.csv'),
 WindowsPath('data/testing/test3.csv'),
 WindowsPath('data/testing/test4.csv')]

In [7]:
# Every CSV directly under data/validation/, ordered by path.
VALIDATION_DATASET = sorted(Path("data/validation/").glob("*.csv"))
VALIDATION_DATASET

[WindowsPath('data/validation/validation.csv')]

In [8]:
def dataframe_from_csv(target):
    """Read one CSV into a DataFrame and trim whitespace around every column name.

    Args:
        target: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        A DataFrame whose column labels have leading/trailing whitespace removed.
    """
    frame = pd.read_csv(target)
    # Map each header to its stripped form.
    trimmed_names = {name: name.strip() for name in frame.columns}
    return frame.rename(columns=trimmed_names)

def dataframe_from_csvs(targets):
    return pd.concat([dataframe_from_csv(x) for x in targets])

TRAIN_DF_RAW는 공격을 받지 않는 평상시 데이터
시간을 나타내는 필드인 time
나머지는 모두 비식별화된 센서/액추에이터의 값
    -> 정규화는 센서/액추에이터 값만 대상으로


In [9]:
# Load every training CSV and stack them into one raw DataFrame.
TRAIN_DF_RAW = dataframe_from_csvs(TRAIN_DATASET)
# Preview the first rows.
TRAIN_DF_RAW.head()

Unnamed: 0,time,C01,C02,C03,C04,C05,C06,C07,C08,C09,...,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79
0,2020-07-11 00:00:00,395.19528,12,10,52.80456,-1.2648,-1.87531,779.59595,28.02645,10832.0,...,808.2962,0.0,1.3681,8.79882,35.437,12.01782,305.03113,301.35992,33.6555,6.0951
1,2020-07-11 00:00:01,395.1442,12,10,52.78931,-1.3147,-1.88294,780.67328,28.02473,10984.0,...,819.16809,0.0,1.3681,8.78811,35.45227,12.01782,304.27161,297.43567,33.6555,5.9262
2,2020-07-11 00:00:02,395.1442,12,10,52.79694,-1.4032,-1.88294,780.06574,28.02817,11120.0,...,823.51697,0.0,1.36734,8.81787,35.45227,12.01782,303.89179,298.66534,33.6555,5.8101
3,2020-07-11 00:00:03,395.19528,12,10,52.79694,-1.6074,-1.88294,780.15265,28.02301,11256.0,...,823.95172,0.0,1.36734,8.87493,35.437,12.01782,303.67474,298.0686,33.6555,5.7509
4,2020-07-11 00:00:04,395.34866,12,10,52.79694,-1.7811,-1.88294,781.8316,28.03595,11384.0,...,827.8656,0.0,1.3681,8.83838,35.45227,12.01782,303.22266,296.53137,33.6555,5.8547


In [10]:
# Show the column labels of the raw training frame.
TRAIN_DF_RAW.columns

Index(['time', 'C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09',
       'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19',
       'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29',
       'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39',
       'C40', 'C41', 'C42', 'C43', 'C44', 'C45', 'C46', 'C47', 'C48', 'C49',
       'C50', 'C51', 'C52', 'C53', 'C54', 'C55', 'C56', 'C57', 'C58', 'C59',
       'C60', 'C61', 'C62', 'C63', 'C64', 'C65', 'C66', 'C67', 'C68', 'C69',
       'C70', 'C71', 'C72', 'C73', 'C74', 'C75', 'C76', 'C77', 'C78', 'C79'],
      dtype='object')

해당 문서에서는 전체 데이터를 대상으로 이상을 탐지하므로 "attack" 필드만 사용한다.

VALID_COLUMNS_IN_TRAIN_DATASET은 학습 데이터셋에 있는 모든 센서/액추에이터 필드를 담고 있다.

In [11]:
# Name of the timestamp column; it is excluded from the feature columns below.
TIMESTAMP_FIELD = "time"
# NOTE(review): IDSTAMP_FIELD and ATTACK_FIELD are not referenced in the cells
# visible here — presumably used further down; confirm.
IDSTAMP_FIELD = 'id'
ATTACK_FIELD = "attack"
# Every column of the training data except the timestamp.
VALID_COLUMNS_IN_TRAIN_DATASET = TRAIN_DF_RAW.columns.drop([TIMESTAMP_FIELD])
VALID_COLUMNS_IN_TRAIN_DATASET

Index(['C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09', 'C10',
       'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
       'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29', 'C30',
       'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39', 'C40',
       'C41', 'C42', 'C43', 'C44', 'C45', 'C46', 'C47', 'C48', 'C49', 'C50',
       'C51', 'C52', 'C53', 'C54', 'C55', 'C56', 'C57', 'C58', 'C59', 'C60',
       'C61', 'C62', 'C63', 'C64', 'C65', 'C66', 'C67', 'C68', 'C69', 'C70',
       'C71', 'C72', 'C73', 'C74', 'C75', 'C76', 'C77', 'C78', 'C79'],
      dtype='object')

TAG_MIN, TAG_MAX는 학습 데이터셋에서 필드별 최솟값, 최댓값을 얻은 결과

In [12]:
# Per-column minimum and maximum over the training data; normalize() uses these
# as its scaling bounds.
TAG_MIN = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].min()
TAG_MAX = TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET].max()

### DATAFRAME 정규화

최솟값, 최댓값을 이용하여 0~1의 범위에 들어오도록
값이 전혀 변하지 않는 필드는 최솟값과 최댓값이 같다. 본 문서에서는 이런 필드를 0으로 만든다(0으로 나누는 것을 피하기 위해 최솟값만 뺀다).

In [13]:
def normalize(df, tag_min=None, tag_max=None):
    """Min-max scale every column of ``df`` using per-column bounds.

    Args:
        df: DataFrame whose columns are all present in the bounds.
        tag_min: Per-column minimums, indexable by column name. Defaults to the
            notebook-level ``TAG_MIN`` when omitted.
        tag_max: Per-column maximums, indexable by column name. Defaults to the
            notebook-level ``TAG_MAX`` when omitted.

    Returns:
        A new DataFrame (``df`` is left untouched) where each column is
        ``(value - min) / (max - min)``. Columns whose min equals max are only
        shifted by the minimum, which avoids dividing by zero.

    Raises:
        KeyError: If a column of ``df`` is missing from the bounds.
    """
    # Fall back to the notebook globals so existing normalize(df) calls keep working.
    if tag_min is None:
        tag_min = TAG_MIN
    if tag_max is None:
        tag_max = TAG_MAX

    ndf = df.copy()
    for c in df.columns:
        lo, hi = tag_min[c], tag_max[c]
        shifted = df[c] - lo
        # Constant column in the reference data: keep the plain offset from that constant.
        ndf[c] = shifted if lo == hi else shifted / (hi - lo)
    return ndf

TRAIN_DF 정규화를 마치고 exponential weighted function을 통과시킴
센서에서 발생하는 noise를 smoothing 시켜주기를 기대했다고 함

In [14]:
# Min-max normalize the feature columns, then smooth each column with an
# exponentially weighted mean (alpha=0.9).
# NOTE(review): TRAIN_DF_RAW is a concatenation of several files, so the
# smoothing carries over across file boundaries — confirm this is intended.
TRAIN_DF = normalize(TRAIN_DF_RAW[VALID_COLUMNS_IN_TRAIN_DATASET]).ewm(alpha=0.9).mean()
TRAIN_DF

Unnamed: 0,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,...,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79
0,0.378953,0.0,0.0,0.227071,0.372380,0.000230,0.386721,0.410567,0.784144,0.508049,...,0.584892,0.0,0.326835,0.254687,0.331076,0.916661,0.269393,0.265017,1.00000,0.567254
1,0.378504,0.0,0.0,0.226596,0.353516,0.000161,0.399074,0.364415,0.794139,0.540538,...,0.592044,0.0,0.326835,0.254315,0.337223,0.916661,0.266791,0.251792,1.00000,0.512135
2,0.378463,0.0,0.0,0.226789,0.318663,0.000154,0.393283,0.451729,0.803903,0.538802,...,0.595523,0.0,0.326387,0.255304,0.337777,0.916661,0.265266,0.254707,1.00000,0.469622
3,0.378904,0.0,0.0,0.226808,0.238782,0.000154,0.393697,0.323289,0.813725,0.459532,...,0.596151,0.0,0.326343,0.257362,0.331746,0.916661,0.264379,0.253005,1.00000,0.446285
4,0.380282,0.0,0.0,0.226810,0.165794,0.000154,0.412796,0.654203,0.823039,0.333541,...,0.598763,0.0,0.326786,0.256312,0.337229,0.916661,0.262757,0.247706,1.00000,0.477489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478796,0.302372,0.0,0.0,0.703684,0.509016,0.000538,0.420902,0.355242,0.064622,0.639006,...,0.683154,0.0,0.300845,0.497901,0.202699,0.111119,0.315343,0.248152,0.26162,0.757619
478797,0.304595,0.0,0.0,0.703684,0.485295,0.000538,0.481569,0.335967,0.061671,0.685457,...,0.680815,0.0,0.300850,0.497868,0.196619,0.111119,0.309817,0.245953,0.26162,0.681373
478798,0.304373,0.0,0.0,0.703684,0.386965,0.000538,0.497306,0.519990,0.058250,0.607073,...,0.677466,0.0,0.301298,0.497555,0.196011,0.111119,0.303620,0.236562,0.26162,0.631425
478799,0.306574,0.0,0.0,0.703684,0.289108,0.000538,0.468238,0.422572,0.055304,0.430538,...,0.683078,0.0,0.300895,0.494969,0.202036,0.111119,0.308706,0.266275,0.26162,0.677024


boundary_check 함수는 값이 0이상 1이하인지, NaN이 있는지 점검

In [16]:
def boundary_check(df):
    """Report whether ``df`` holds out-of-range or missing values.

    The input is converted to a float32 array before checking.

    Returns:
        A 3-tuple of booleans: (any value > 1.0, any value < 0, any NaN).
    """
    values = np.asarray(df, dtype=np.float32)
    above_one = (values > 1.0).any()
    below_zero = (values < 0).any()
    has_nan = np.isnan(values).any()
    return above_one, below_zero, has_nan
# Expect (False, False, False): nothing above 1, nothing below 0, and no NaN.
boundary_check(TRAIN_DF)

(False, False, False)