# 1　データ整形：モデル前データまで

## 準備

In [1]:
from pathlib import Path
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from xml.etree.ElementTree import fromstring, ElementTree
from Evtx.Evtx import Evtx
import csv

In [2]:
%load_ext autoreload
%autoreload 2
import preprocess
import main

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
INTERIM_DIR = Path('../data/interim')
PROCESSED_DIR = Path('../data/processed')
RAW_DIR = Path('../data/raw')
NO_MEANING_DIR = Path('../data/no_meaning')

## 1-1 AtomicRT（T1105）

### processed 作成（使用特徴量：EventId, deltaT）

In [None]:
input_dir = INTERIM_DIR/project_name
output_dir = PROCESSED_DIR/project_name

In [None]:
df = pd.read_csv(input_dir / f'{file_name}_prepared_structured.csv')

# 諸操作
df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
df["datetime"] = pd.to_datetime(df["Timestamp"], format="%Y-%m-%d %H:%M:%S.%f %z")
df["timestamp"] = df["datetime"].view("int64") // 10**9  
df["deltaT"] = df["datetime"].diff().dt.total_seconds().fillna(0)

  df["timestamp"] = df["datetime"].view("int64") // 10**9


In [None]:
def sliding_window(raw_data, para):
    """
    split logs into sliding windows/session
    :param raw_data: dataframe columns=[timestamp, label, eventid, time duration]
    :param para:{window_size: seconds, step_size: seconds}
    :return: dataframe columns=[eventids, time durations, label]
    """
    log_size = raw_data.shape[0]
    label_data, time_data = raw_data.iloc[:, 1], raw_data.iloc[:, 0]
    logkey_data, deltaT_data = raw_data.iloc[:, 2], raw_data.iloc[:, 3]
    new_data = []
    start_end_index_pair = set()

    start_time = time_data[0]
    end_time = start_time + para["window_size"]
    start_index = 0
    end_index = 0

    # get the first start, end index, end time
    for cur_time in time_data:
        if cur_time < end_time:
            end_index += 1
        else:
            break

    start_end_index_pair.add(tuple([start_index, end_index]))

    # move the start and end index until next sliding window
    num_session = 1
    while end_index < log_size:
        start_time = start_time + para["step_size"]
        end_time = start_time + para["window_size"]
        for i in range(start_index, log_size):
            if time_data[i] < start_time:
                i += 1
            else:
                break
        for j in range(end_index, log_size):
            if time_data[j] < end_time:
                j += 1
            else:
                break
        start_index = i
        end_index = j

        # when start_index == end_index, there is no value in the window
        if start_index != end_index:
            start_end_index_pair.add(tuple([start_index, end_index]))

        num_session += 1
        if num_session % 1000 == 0:
            print("process {} time window".format(num_session), end="\r")

    for start_index, end_index in start_end_index_pair:
        dt = deltaT_data[start_index:end_index].values
        dt[0] = 0
        new_data.append(
            [
                time_data[start_index:end_index].values,
                max(label_data[start_index:end_index]),
                logkey_data[start_index:end_index].values,
                dt,
            ]
        )

    assert len(start_end_index_pair) == len(new_data)
    print(
        "there are %d instances (sliding windows) in this dataset\n"
        % len(start_end_index_pair)
    )
    return pd.DataFrame(new_data, columns=raw_data.columns)

# dfをdeeplog_fileに変換
def deeplog_file_generator(filename, df, features):
    with open(filename, "w") as f:
        for _, row in df.iterrows():
            for val in zip(*row[features]):
                f.write(",".join([str(v) for v in val]) + " ")
            f.write("\n")

In [None]:
window_size = 5
step_size = 1

# sampling with sliding window
deeplog_df = sliding_window(
    df[["timestamp", "Label", "EventId", "deltaT"]],
    para={"window_size": int(window_size) * 60, "step_size": int(step_size) * 60},
)

there are 820 instances (sliding windows) in this dataset



In [None]:
# normalとabnormalを切り分け
df_normal = deeplog_df[deeplog_df["Label"] == 0]
df_abnormal = deeplog_df[deeplog_df["Label"] == 1]

df_normal = df_normal.sample(frac=1, random_state=12).reset_index(
    drop=True
)  # shuffle
normal_len = len(df_normal)

In [None]:
df_abnormal

Unnamed: 0,timestamp,Label,EventId,deltaT


##### train_ratio = 40%, 60%, 80%

In [None]:
# 案１（ファイル名を分けない）
train_ratio_list = [0.4, 0.6, 0.8]
for train_ratio in train_ratio_list:

    train_len = int(normal_len * train_ratio)
    save_dir = output_dir/f'ratio_{train_ratio}'

    os.makedirs(save_dir, exist_ok=True)

    # train
    train = df_normal[:train_len]
    deeplog_file_generator(
        filename = str(save_dir) + '/train',
        df = train,
        features = ["EventId", "deltaT"],
    )
    print("training size {}".format(train_len))

    # test(normal)
    test_normal = df_normal[train_len:]
    deeplog_file_generator(
        filename = str(save_dir) + '/test_normal',
        df = test_normal,
        features = ["EventId", "deltaT"],
    )
    print("test normal size {}".format(normal_len - train_len))

    # abnormal
    
    # 必要なら EventId のマッピングを復活
    # df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(
    #     lambda e: event_index_map[e] if event_index_map.get(e) else UNK
    # )

    deeplog_file_generator(
        filename = str(save_dir) + '/test_abnormal',
        df = df_abnormal,
        features = ["EventId", "deltaT"], 
    )
    print("test abnormal size {}".format(len(df_abnormal)))

training size 328
test normal size 492
test abnormal size 0
training size 492
test normal size 328
test abnormal size 0
training size 656
test normal size 164
test abnormal size 0


In [None]:
# 案２（それぞれファイル名を分ける）

# train
train = df_normal[:train_len]
deeplog_file_generator(
    filename = str(save_dir) + f'/train_{train_ratio}',
    df = train,
    features = ["EventId", "deltaT"],
)
print("training size {}".format(train_len))

# test
test_normal = df_normal[train_len:]
deeplog_file_generator(
    filename = str(save_dir) + f'/test_{(1.0 - train_ratio):.1f}',
    df = test_normal,
    features = ["EventId", "deltaT"],
)

## 1-1 T1105
- interim下にあるsecurity.csvは10日間を抽出したものであることに注意（コードに反映していない）

### 準備

In [5]:
parent_dir = "ScenarioData"
project_name = 'T1105'
ver_number = "2"

### evtx → csv

In [8]:
# test
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath = input_dir/"20240927120753_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2_test",
)

Processing records:   0%|          | 4900/1027696 [00:00<02:01, 8434.40it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 71453.22it/s]


In [9]:
# 実行
preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath = input_dir/"20240927120753_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2",
)

Processing records: 100%|██████████| 1027696/1027696 [1:57:49<00:00, 145.36it/s] 
Writing to CSV: 100%|██████████| 1027696/1027696 [00:13<00:00, 74554.24it/s]


### アノテーション

In [40]:
project_name = 'T1105'
ver_number = "2"
ano = pd.read_excel(RAW_DIR/parent_dir/project_name/"取りまとめ後参考"/"AtomicRedTeam_Analyze.xlsx", sheet_name="解析結果")
df = pd.DataFrame()
df["Channel"] = ano["Unnamed: 9"].iloc[1:].values
df["EventRecordID"] = ano["Unnamed: 14"].iloc[1:].values

labeled = preprocess.anotate_csv(
    csv_filepath = INTERIM_DIR/project_name/"security2.csv",
    ano_df = df,
    output_dir = INTERIM_DIR/project_name,
)

#### 異常期間の同定
- **期間**：9/26 17:57 ~ 9/26 18:55 (73件)

In [15]:
labeled[labeled["Label"] == "anomaly"]

Unnamed: 0,Level,EventID,EventRecordID,TimeCreated_SystemTime,Channel,Task,Provider_Name,Correlation_RelatedActivityID,Execution_ProcessID,Execution_ThreadID,Security_UserID,Keywords,Content,Correlation_ActivityID,Provider_Guid,Computer,Opcode,Version,Label,date
170778,0,4688,4718654,2024-09-26 17:57:13.401667+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,300,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26
170779,0,4688,4718655,2024-09-26 17:57:13.409899+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,32,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26
170780,0,4688,4718656,2024-09-26 17:57:13.442478+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,32,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26
170834,0,4688,4718710,2024-09-26 17:59:47.880032+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,8064,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26
170835,0,4688,4718711,2024-09-26 17:59:47.883730+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,8744,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171622,0,5156,4719498,2024-09-26 18:48:50.349300+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,6840,,0x8020000000000000,ProcessID=1284|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly,2024-09-26
171624,0,5156,4719500,2024-09-26 18:48:50.512531+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,6840,,0x8020000000000000,ProcessID=6912|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly,2024-09-26
171627,0,5156,4719503,2024-09-26 18:48:50.929255+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,300,,0x8020000000000000,ProcessID=6912|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly,2024-09-26
171912,0,4688,4719788,2024-09-26 18:55:46.865633+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,4972,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly,2024-09-26


### パース(Drain)
- ここからのsecurity.csvは10日間を抽出したものを想定！！！！！！！！！！！！！！！！

In [41]:
project_name = 'T1105'
ver_number = "2"
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name

preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name=f'security{ver_number}', parser_type="drain")

Parsing file: ../data/interim/T1105/buffer.csv
Total size after reading CSV: 177827
Parsing done. [Time taken: 0:00:20.552472]


### モデル前データ作成

In [None]:
ver_number = "2"
input_dir = INTERIM_DIR/project_name
output_dir = PROCESSED_DIR/project_name/f"ver_{ver_number}"
output_dir.mkdir(exist_ok=True)


preprocess.prepare_model_data(
    logdata_filepath = Path(input_dir/"security2_clean_structured.csv"),
    output_dir = output_dir,
    ano_df = df,
)

  


there are 10396 instances (sliding windows) in this dataset

training size 4137
test normal size 6206
test abnormal size 53
training size 6205
test normal size 4138
test abnormal size 53
training size 8274
test normal size 2069
test abnormal size 53
training size 10343


### training

In [5]:
# train
main.main_cli([
    "train",
    "bert/test3",           # conf/bert_config_name.yaml
    #"default.device_id=0",        # ここに好きな key=value を並べる
    #"default.epochs=10",
])

setup
Building Vocab


100%|██████████| 10343/10343 [00:00<00:00, 65955.78it/s]


VOCAB SIZE: 55
before filtering short session
train size  7447
valid size  827


100%|██████████| 8274/8274 [00:02<00:00, 3951.56it/s]


Num of train seqs 7412
Num of valid seqs 827


231it [00:18, 12.77it/s]
25it [00:00, 39.89it/s]


epoch 1 || TRAIN_Loss:8.8102 ||VAL_Loss:8.6666
Best epoch = 1


231it [00:17, 13.34it/s]
25it [00:00, 40.58it/s]


epoch 2 || TRAIN_Loss:7.5329 ||VAL_Loss:6.1900
Best epoch = 2


231it [00:17, 13.27it/s]
25it [00:00, 39.40it/s]


epoch 3 || TRAIN_Loss:3.8475 ||VAL_Loss:1.8928
Best epoch = 3


231it [00:17, 13.20it/s]
25it [00:00, 38.52it/s]


epoch 4 || TRAIN_Loss:1.7370 ||VAL_Loss:1.4548
Best epoch = 4


231it [00:17, 13.16it/s]
25it [00:00, 38.36it/s]


epoch 5 || TRAIN_Loss:1.3618 ||VAL_Loss:1.2298
Best epoch = 5


231it [00:17, 13.12it/s]
25it [00:00, 37.99it/s]


epoch 6 || TRAIN_Loss:1.2244 ||VAL_Loss:1.1169
Best epoch = 6


231it [00:17, 13.07it/s]
25it [00:00, 37.32it/s]


epoch 7 || TRAIN_Loss:1.1664 ||VAL_Loss:1.0889
Best epoch = 7


231it [00:17, 13.03it/s]
25it [00:00, 36.66it/s]


epoch 8 || TRAIN_Loss:1.1496 ||VAL_Loss:1.0899
Best epoch = 7


231it [00:17, 13.05it/s]
25it [00:00, 36.43it/s]


epoch 9 || TRAIN_Loss:1.1446 ||VAL_Loss:1.0874
Best epoch = 9


231it [00:17, 13.01it/s]
25it [00:00, 37.78it/s]


epoch 10 || TRAIN_Loss:1.1349 ||VAL_Loss:1.0863
Best epoch = 10


231it [00:17, 12.98it/s]
25it [00:00, 37.01it/s]


epoch 11 || TRAIN_Loss:1.1314 ||VAL_Loss:1.0823
Best epoch = 11


231it [00:17, 12.93it/s]
25it [00:00, 36.50it/s]


epoch 12 || TRAIN_Loss:1.1311 ||VAL_Loss:1.0786
Best epoch = 12


231it [00:17, 12.94it/s]
25it [00:00, 36.18it/s]


epoch 13 || TRAIN_Loss:1.1240 ||VAL_Loss:1.0795
Best epoch = 12


231it [00:17, 12.92it/s]
25it [00:00, 37.06it/s]


epoch 14 || TRAIN_Loss:1.1211 ||VAL_Loss:1.0725
Best epoch = 14


231it [00:17, 12.89it/s]
25it [00:00, 35.72it/s]


epoch 15 || TRAIN_Loss:1.1178 ||VAL_Loss:1.0759
Best epoch = 14


231it [00:17, 12.88it/s]
25it [00:00, 36.56it/s]


epoch 16 || TRAIN_Loss:1.1142 ||VAL_Loss:1.0724
Best epoch = 16


231it [00:17, 12.85it/s]
25it [00:00, 37.32it/s]


epoch 17 || TRAIN_Loss:1.1144 ||VAL_Loss:1.0701
Best epoch = 17


231it [00:17, 12.86it/s]
25it [00:00, 36.24it/s]


epoch 18 || TRAIN_Loss:1.1112 ||VAL_Loss:1.0689
Best epoch = 18


231it [00:17, 12.87it/s]
25it [00:00, 37.49it/s]


epoch 19 || TRAIN_Loss:1.1065 ||VAL_Loss:1.0633
Best epoch = 19


231it [00:17, 12.85it/s]
25it [00:00, 36.78it/s]


epoch 20 || TRAIN_Loss:1.0999 ||VAL_Loss:1.0614
Best epoch = 20


231it [00:17, 12.87it/s]
25it [00:00, 36.61it/s]


epoch 21 || TRAIN_Loss:1.1013 ||VAL_Loss:1.0558
Best epoch = 21


231it [00:17, 12.84it/s]
25it [00:00, 36.22it/s]


epoch 22 || TRAIN_Loss:1.0985 ||VAL_Loss:1.0562
Best epoch = 21


231it [00:18, 12.83it/s]
25it [00:00, 35.66it/s]


epoch 23 || TRAIN_Loss:1.0920 ||VAL_Loss:1.0579
Best epoch = 21


231it [00:17, 12.87it/s]
25it [00:00, 37.35it/s]


epoch 24 || TRAIN_Loss:1.0902 ||VAL_Loss:1.0556
Best epoch = 24


231it [00:17, 12.86it/s]
25it [00:00, 37.01it/s]


epoch 25 || TRAIN_Loss:1.0868 ||VAL_Loss:1.0605
Best epoch = 24


231it [00:17, 12.87it/s]
25it [00:00, 36.62it/s]


epoch 26 || TRAIN_Loss:1.0803 ||VAL_Loss:1.0512
Best epoch = 26


231it [00:17, 12.87it/s]
25it [00:00, 36.68it/s]


epoch 27 || TRAIN_Loss:1.0773 ||VAL_Loss:1.0432
Best epoch = 27


231it [00:17, 12.89it/s]
25it [00:00, 38.11it/s]


epoch 28 || TRAIN_Loss:1.0729 ||VAL_Loss:1.0485
Best epoch = 27


231it [00:17, 12.93it/s]
25it [00:00, 35.83it/s]


epoch 29 || TRAIN_Loss:1.0688 ||VAL_Loss:1.0502
Best epoch = 27


231it [00:17, 12.87it/s]
25it [00:00, 36.81it/s]


epoch 30 || TRAIN_Loss:1.0653 ||VAL_Loss:1.0387
Best epoch = 30


231it [00:17, 12.90it/s]
25it [00:00, 36.07it/s]


epoch 31 || TRAIN_Loss:1.0611 ||VAL_Loss:1.0340
Best epoch = 31


231it [00:17, 12.89it/s]
25it [00:00, 36.55it/s]


epoch 32 || TRAIN_Loss:1.0598 ||VAL_Loss:1.0394
Best epoch = 31


231it [00:17, 12.90it/s]
25it [00:00, 37.19it/s]


epoch 33 || TRAIN_Loss:1.0574 ||VAL_Loss:1.0304
Best epoch = 33


231it [00:17, 12.87it/s]
25it [00:00, 36.89it/s]


epoch 34 || TRAIN_Loss:1.0504 ||VAL_Loss:1.0419
Best epoch = 33


231it [00:17, 12.88it/s]
25it [00:00, 36.11it/s]


epoch 35 || TRAIN_Loss:1.0516 ||VAL_Loss:1.0369
Best epoch = 33


231it [00:17, 12.88it/s]
25it [00:00, 36.49it/s]


epoch 36 || TRAIN_Loss:1.0481 ||VAL_Loss:1.0269
Best epoch = 36


231it [00:17, 12.88it/s]
25it [00:00, 36.65it/s]


epoch 37 || TRAIN_Loss:1.0498 ||VAL_Loss:1.0296
Best epoch = 36


231it [00:17, 12.83it/s]
25it [00:00, 37.85it/s]


epoch 38 || TRAIN_Loss:1.0454 ||VAL_Loss:1.0240
Best epoch = 38


231it [00:17, 12.84it/s]
25it [00:00, 36.37it/s]


epoch 39 || TRAIN_Loss:1.0427 ||VAL_Loss:1.0225
Best epoch = 39


231it [00:17, 12.92it/s]
25it [00:00, 36.90it/s]


epoch 40 || TRAIN_Loss:1.0382 ||VAL_Loss:1.0239
Best epoch = 39


231it [00:17, 12.87it/s]
25it [00:00, 35.86it/s]


epoch 41 || TRAIN_Loss:1.0413 ||VAL_Loss:1.0202
Best epoch = 41


231it [00:17, 12.89it/s]
25it [00:00, 36.49it/s]


epoch 42 || TRAIN_Loss:1.0399 ||VAL_Loss:1.0180
Best epoch = 42


231it [00:17, 12.89it/s]
25it [00:00, 36.84it/s]


epoch 43 || TRAIN_Loss:1.0373 ||VAL_Loss:1.0180
Best epoch = 42


231it [00:17, 12.86it/s]
25it [00:00, 37.31it/s]


epoch 44 || TRAIN_Loss:1.0376 ||VAL_Loss:1.0198
Best epoch = 42


231it [00:17, 12.87it/s]
25it [00:00, 37.61it/s]


epoch 45 || TRAIN_Loss:1.0349 ||VAL_Loss:1.0218
Best epoch = 42


231it [00:17, 12.86it/s]
25it [00:00, 38.01it/s]


epoch 46 || TRAIN_Loss:1.0321 ||VAL_Loss:1.0207
Best epoch = 42


231it [00:18, 12.83it/s]
25it [00:00, 37.08it/s]


epoch 47 || TRAIN_Loss:1.0334 ||VAL_Loss:1.0107
Best epoch = 47


231it [00:17, 12.87it/s]
25it [00:00, 36.07it/s]


epoch 48 || TRAIN_Loss:1.0340 ||VAL_Loss:1.0117
Best epoch = 47


231it [00:17, 12.84it/s]
25it [00:00, 36.62it/s]


epoch 49 || TRAIN_Loss:1.0325 ||VAL_Loss:1.0125
Best epoch = 47


231it [00:17, 12.84it/s]
25it [00:00, 36.92it/s]


epoch 50 || TRAIN_Loss:1.0314 ||VAL_Loss:1.0147
Best epoch = 47


231it [00:17, 12.85it/s]
25it [00:00, 37.45it/s]


epoch 51 || TRAIN_Loss:1.0316 ||VAL_Loss:1.0122
Best epoch = 47


231it [00:17, 12.84it/s]
25it [00:00, 36.31it/s]


epoch 52 || TRAIN_Loss:1.0293 ||VAL_Loss:1.0234
Best epoch = 47


231it [00:17, 12.86it/s]
25it [00:00, 37.46it/s]


epoch 53 || TRAIN_Loss:1.0254 ||VAL_Loss:1.0173
Best epoch = 47


231it [00:17, 12.90it/s]
25it [00:00, 36.79it/s]


epoch 54 || TRAIN_Loss:1.0285 ||VAL_Loss:1.0153
Best epoch = 47


231it [00:17, 12.86it/s]
25it [00:00, 36.80it/s]


epoch 55 || TRAIN_Loss:1.0262 ||VAL_Loss:1.0210
Best epoch = 47


231it [00:17, 12.85it/s]
25it [00:00, 37.91it/s]


epoch 56 || TRAIN_Loss:1.0297 ||VAL_Loss:1.0189
Best epoch = 47


231it [00:17, 12.87it/s]
25it [00:00, 37.20it/s]


epoch 57 || TRAIN_Loss:1.0305 ||VAL_Loss:1.0117
Best epoch = 47


231it [00:17, 12.88it/s]
25it [00:00, 37.49it/s]


epoch 58 || TRAIN_Loss:1.0246 ||VAL_Loss:1.0159
Best epoch = 47


231it [00:17, 12.91it/s]
25it [00:00, 37.85it/s]


epoch 59 || TRAIN_Loss:1.0245 ||VAL_Loss:1.0159
Best epoch = 47


231it [00:17, 12.88it/s]
25it [00:00, 37.64it/s]


epoch 60 || TRAIN_Loss:1.0264 ||VAL_Loss:1.0207
Best epoch = 47


231it [00:17, 12.89it/s]
25it [00:00, 37.49it/s]


epoch 61 || TRAIN_Loss:1.0256 ||VAL_Loss:1.0165
Best epoch = 47


231it [00:17, 12.91it/s]
25it [00:00, 36.77it/s]


epoch 62 || TRAIN_Loss:1.0257 ||VAL_Loss:1.0109
Best epoch = 47


231it [00:17, 12.90it/s]
25it [00:00, 36.73it/s]


epoch 63 || TRAIN_Loss:1.0275 ||VAL_Loss:1.0177
Best epoch = 47


231it [00:17, 12.88it/s]
25it [00:00, 37.00it/s]


epoch 64 || TRAIN_Loss:1.0226 ||VAL_Loss:1.0091
Best epoch = 64


231it [00:17, 12.88it/s]
25it [00:00, 37.98it/s]


epoch 65 || TRAIN_Loss:1.0265 ||VAL_Loss:1.0156
Best epoch = 64


231it [00:17, 12.91it/s]
25it [00:00, 37.10it/s]


epoch 66 || TRAIN_Loss:1.0238 ||VAL_Loss:1.0123
Best epoch = 64


231it [00:17, 12.89it/s]
25it [00:00, 37.21it/s]


epoch 67 || TRAIN_Loss:1.0262 ||VAL_Loss:1.0097
Best epoch = 64


231it [00:17, 12.89it/s]
25it [00:00, 35.61it/s]


epoch 68 || TRAIN_Loss:1.0246 ||VAL_Loss:1.0167
Best epoch = 64


231it [00:17, 12.89it/s]
25it [00:00, 37.35it/s]


epoch 69 || TRAIN_Loss:1.0231 ||VAL_Loss:1.0067
Best epoch = 69


231it [00:17, 12.93it/s]
25it [00:00, 37.19it/s]


epoch 70 || TRAIN_Loss:1.0240 ||VAL_Loss:1.0076
Best epoch = 69


231it [00:17, 12.91it/s]
25it [00:00, 36.07it/s]


epoch 71 || TRAIN_Loss:1.0262 ||VAL_Loss:1.0078
Best epoch = 69


231it [00:17, 12.92it/s]
25it [00:00, 36.18it/s]


epoch 72 || TRAIN_Loss:1.0245 ||VAL_Loss:1.0113
Best epoch = 69


231it [00:17, 12.91it/s]
25it [00:00, 37.51it/s]


epoch 73 || TRAIN_Loss:1.0245 ||VAL_Loss:1.0070
Best epoch = 69


231it [00:17, 12.92it/s]
25it [00:00, 37.23it/s]


epoch 74 || TRAIN_Loss:1.0232 ||VAL_Loss:1.0105
Best epoch = 69


231it [00:17, 12.91it/s]
25it [00:00, 36.49it/s]


epoch 75 || TRAIN_Loss:1.0203 ||VAL_Loss:1.0158
Best epoch = 69


231it [00:17, 12.91it/s]
25it [00:00, 38.48it/s]


epoch 76 || TRAIN_Loss:1.0222 ||VAL_Loss:1.0096
Best epoch = 69


231it [00:17, 12.91it/s]
25it [00:00, 37.63it/s]


epoch 77 || TRAIN_Loss:1.0234 ||VAL_Loss:1.0100
Best epoch = 69


231it [00:17, 12.93it/s]
25it [00:00, 36.70it/s]


epoch 78 || TRAIN_Loss:1.0205 ||VAL_Loss:1.0044
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 37.32it/s]


epoch 79 || TRAIN_Loss:1.0214 ||VAL_Loss:1.0118
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 37.27it/s]


epoch 80 || TRAIN_Loss:1.0225 ||VAL_Loss:1.0087
Best epoch = 78


231it [00:17, 12.90it/s]
25it [00:00, 36.85it/s]


epoch 81 || TRAIN_Loss:1.0217 ||VAL_Loss:1.0054
Best epoch = 78


231it [00:17, 12.92it/s]
25it [00:00, 37.12it/s]


epoch 82 || TRAIN_Loss:1.0189 ||VAL_Loss:1.0064
Best epoch = 78


231it [00:17, 12.92it/s]
25it [00:00, 37.38it/s]


epoch 83 || TRAIN_Loss:1.0212 ||VAL_Loss:1.0116
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 37.70it/s]


epoch 84 || TRAIN_Loss:1.0207 ||VAL_Loss:1.0051
Best epoch = 78


231it [00:17, 12.93it/s]
25it [00:00, 37.86it/s]


epoch 85 || TRAIN_Loss:1.0174 ||VAL_Loss:1.0156
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 37.81it/s]


epoch 86 || TRAIN_Loss:1.0188 ||VAL_Loss:1.0053
Best epoch = 78


231it [00:17, 12.92it/s]
25it [00:00, 36.90it/s]


epoch 87 || TRAIN_Loss:1.0192 ||VAL_Loss:1.0098
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 36.25it/s]


epoch 88 || TRAIN_Loss:1.0199 ||VAL_Loss:1.0137
Best epoch = 78


231it [00:17, 12.91it/s]
25it [00:00, 36.90it/s]


epoch 89 || TRAIN_Loss:1.0239 ||VAL_Loss:1.0140
Best epoch = 78


231it [00:17, 12.90it/s]
25it [00:00, 38.92it/s]


epoch 90 || TRAIN_Loss:1.0200 ||VAL_Loss:1.0040
Best epoch = 90


231it [00:17, 12.95it/s]
25it [00:00, 36.08it/s]


epoch 91 || TRAIN_Loss:1.0202 ||VAL_Loss:1.0041
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 37.80it/s]


epoch 92 || TRAIN_Loss:1.0173 ||VAL_Loss:1.0084
Best epoch = 90


231it [00:17, 12.91it/s]
25it [00:00, 37.02it/s]


epoch 93 || TRAIN_Loss:1.0212 ||VAL_Loss:1.0137
Best epoch = 90


231it [00:17, 12.92it/s]
25it [00:00, 36.95it/s]


epoch 94 || TRAIN_Loss:1.0201 ||VAL_Loss:1.0105
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 37.29it/s]


epoch 95 || TRAIN_Loss:1.0196 ||VAL_Loss:1.0055
Best epoch = 90


231it [00:17, 12.91it/s]
25it [00:00, 37.31it/s]


epoch 96 || TRAIN_Loss:1.0197 ||VAL_Loss:1.0110
Best epoch = 90


231it [00:17, 12.92it/s]
25it [00:00, 37.28it/s]


epoch 97 || TRAIN_Loss:1.0208 ||VAL_Loss:1.0043
Best epoch = 90


231it [00:17, 12.92it/s]
25it [00:00, 36.86it/s]


epoch 98 || TRAIN_Loss:1.0158 ||VAL_Loss:1.0043
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 35.06it/s]


epoch 99 || TRAIN_Loss:1.0173 ||VAL_Loss:1.0118
Best epoch = 90


231it [00:17, 12.94it/s]
25it [00:00, 37.75it/s]


epoch 100 || TRAIN_Loss:1.0169 ||VAL_Loss:1.0067
Best epoch = 90


231it [00:17, 12.90it/s]
25it [00:00, 35.94it/s]


epoch 101 || TRAIN_Loss:1.0172 ||VAL_Loss:1.0053
Best epoch = 90


231it [00:17, 12.92it/s]
25it [00:00, 37.85it/s]


epoch 102 || TRAIN_Loss:1.0206 ||VAL_Loss:1.0118
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 37.29it/s]


epoch 103 || TRAIN_Loss:1.0201 ||VAL_Loss:1.0156
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 36.67it/s]


epoch 104 || TRAIN_Loss:1.0157 ||VAL_Loss:1.0053
Best epoch = 90


231it [00:17, 12.94it/s]
25it [00:00, 35.33it/s]


epoch 105 || TRAIN_Loss:1.0198 ||VAL_Loss:1.0050
Best epoch = 90


231it [00:17, 12.93it/s]
25it [00:00, 37.49it/s]


epoch 106 || TRAIN_Loss:1.0179 ||VAL_Loss:1.0052
Best epoch = 90


231it [00:17, 12.91it/s]
25it [00:00, 37.90it/s]


epoch 107 || TRAIN_Loss:1.0169 ||VAL_Loss:1.0120
Best epoch = 90


231it [00:17, 12.95it/s]
25it [00:00, 36.66it/s]


epoch 108 || TRAIN_Loss:1.0165 ||VAL_Loss:1.0076
Best epoch = 90


231it [00:17, 12.90it/s]
25it [00:00, 37.53it/s]


epoch 109 || TRAIN_Loss:1.0159 ||VAL_Loss:1.0081
Best epoch = 90


231it [00:17, 12.89it/s]
25it [00:00, 37.43it/s]


epoch 110 || TRAIN_Loss:1.0168 ||VAL_Loss:1.0149
Early stopping at epoch 110 (best = 90)


In [7]:
# テスト
main.main_cli([
    "test",  # run_mode
    "outputs/logbert/bert/T1105/ver_1/ratio_0.8/seq_len_256/r_seed_31/weights/ValTotalbest.pth", # 重みファイルまでの相対パス
    "32",   # eval_batchsize
    "cuda:0",     # gpu
])

outputs/logbert/bert/T1105/ver_1/ratio_0.8/seq_len_256/r_seed_31
cuda:0 cuda:0
Building Vocab


100%|██████████| 34455/34455 [00:00<00:00, 78005.94it/s]


VOCAB SIZE: 60


Processing normal test data: 100%|██████████| 2069/2069 [00:00<00:00, 3986.61it/s]
Processing abnormal test data: 100%|██████████| 53/53 [00:00<00:00, 3463.34it/s]
100%|██████████| 64/64 [00:01<00:00, 44.75it/s]
100%|██████████| 53/53 [00:00<00:00, 70.81it/s]

Saving test normal results
Saving test abnormal results
[seq_th, FP, TP, TN, FN, Precision, Recall, F1, TPR, FPR]
threshold=0.000['18.0000', '8.0000', '2030.0000', '45.0000', '30.7692', '15.0943', '20.2532', '15.0943', '0.8789']
threshold=0.100['8.0000', '0.0000', '2040.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.3906']
threshold=0.200['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
threshold=0.300['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
threshold=0.400['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
threshold=0.500['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
threshold=0.600['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
threshold=0.700['0.0000', '0.0000', '2048.0000', '53.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
t




## 1-2 VSCode1

### 準備

In [9]:
parent_dir = "ScenarioData"
project_name = 'VSCode1'

### evtx → csv

In [10]:
# test
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath = input_dir/"20240706063537_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2_test",
)

Processing records:   5%|▌         | 4900/93912 [00:00<00:10, 8582.05it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 68556.78it/s]


In [11]:
# 実行
preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath = input_dir/"20240706063537_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2",
)

Processing records: 100%|██████████| 93912/93912 [09:13<00:00, 169.80it/s]
Writing to CSV: 100%|██████████| 93912/93912 [00:01<00:00, 67158.05it/s]


### パース(Drain)

In [None]:
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name

df = pd.read_csv(input_dir/"security.csv")
df = df.dropna(subset=["Content"]) # Contentカラムに欠損値をもつ行を削除
df.to_csv(input_dir / "security_clean.csv", index=False)

In [None]:
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security_clean', parser_type="drain")

## 1-3 WEB01

### 準備

In [20]:
parent_dir = "ScenarioData"
project_name = 'WEB1'
ver_number = "2"

### evtx → csv

In [13]:
# test
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath = input_dir/"20240317044842_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2_test",
)

Processing records:   2%|▏         | 4900/215806 [00:00<00:27, 7632.09it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 61230.72it/s]


In [14]:
# 実行
preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath = input_dir/"20240317044842_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = "security2",
)

Processing records: 100%|██████████| 215806/215806 [21:56<00:00, 163.89it/s]
Writing to CSV: 100%|██████████| 215806/215806 [00:02<00:00, 72085.06it/s]


### 余計な区間を削除（11/22~11/23）

In [21]:
project_name = 'WEB1'
ver_number = "2"
data = pd.read_csv(INTERIM_DIR/project_name/f"security{ver_number}.csv")
data["TimeCreated_SystemTime"] = pd.to_datetime(
    data["TimeCreated_SystemTime"], 
    errors="coerce",      # 変換できないものは NaT にする
)

data["date"] = data["TimeCreated_SystemTime"].dt.date

filtered = data[
    (data["date"] <= pd.to_datetime("2024-11-01").date()) &
    (data["date"] >= pd.to_datetime("2023-12-01").date())
]

# データ上書き注意！！！！！！！！！！！！！！
filtered.to_csv(INTERIM_DIR/project_name/f"security{ver_number}.csv", index=False)

### アノテーション

In [None]:
project_name = 'WEB1'
ver_number = "2"
ano = pd.read_excel(RAW_DIR/parent_dir/"_anotation"/project_name/"web1_labeled_incident_data.xlsx")
df = pd.DataFrame()
df["Channel"] = ano["Channel"].values
df["EventRecordID"] = ano["RecordID"].values

labeled = preprocess.anotate_csv(
    csv_filepath = INTERIM_DIR/project_name/"security2.csv",
    ano_df = df,
    output_dir = INTERIM_DIR/project_name,
)

#### 異常期間の同定
- **期間**：3/14 15:00 ~ 3/17 16:48 (21162件)

In [27]:
labeled[labeled["Label"] == "anomaly"]

Unnamed: 0,TimeCreated_SystemTime,Security_UserID,Correlation_ActivityID,Channel,Version,Content,Correlation_RelatedActivityID,Provider_Guid,Keywords,Opcode,EventID,Provider_Name,Computer,Task,EventRecordID,Level,Execution_ProcessID,Execution_ThreadID,Label
144356,2024-03-14 15:00:01.005144+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1511462,0,4,4488,anomaly
144358,2024-03-14 15:00:49.232059+00:00,,,Security,1,ProcessID=2572|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1511464,0,4,4488,anomaly
144361,2024-03-14 15:01:49.291647+00:00,,,Security,1,ProcessID=1264|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1511467,0,4,4488,anomaly
144363,2024-03-14 15:01:49.317078+00:00,,,Security,1,ProcessID=2572|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1511469,0,4,4488,anomaly
144365,2024-03-14 15:02:49.371119+00:00,,,Security,1,ProcessID=2572|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1511471,0,4,4488,anomaly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194059,2024-03-17 16:48:45.044725+00:00,,,Security,1,ProcessID=4|Application=System|Direction=%%145...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1561165,0,4,2028,anomaly
194060,2024-03-17 16:48:45.211576+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1561166,0,4,2656,anomaly
194061,2024-03-17 16:48:45.240389+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1561167,0,4,7440,anomaly
194071,2024-03-17 16:48:46.617451+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1561177,0,4,5188,anomaly


### 

### パース(Drain)

In [24]:
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security2', parser_type="drain")

Parsing file: ../data/interim/WEB1/buffer.csv
Total size after reading CSV: 194072
Parsing done. [Time taken: 0:00:22.991458]


## 1-4 WEB02

### 準備

In [28]:
parent_dir = "ScenarioData"
project_name = 'WEB2'
ver_number = "2"

### evtx → csv

In [None]:
# test
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath = input_dir/"20240317051629_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = f"security{ver_number}_test",
)

Processing records:   2%|▏         | 4900/215808 [00:00<00:33, 6286.26it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 46386.91it/s]


In [32]:
# 実行
preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath = input_dir/"20240317051629_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = f"security{ver_number}",
)

Processing records: 100%|██████████| 215808/215808 [26:05<00:00, 137.89it/s]
Writing to CSV: 100%|██████████| 215808/215808 [00:03<00:00, 64105.31it/s]


### アノテーション

In [29]:
project_name = 'WEB2'
ver_number = "2"
ano = pd.read_excel(RAW_DIR/parent_dir/"_anotation"/project_name/"web2_labeled_incident_data.xlsx")
df = pd.DataFrame()
df["Channel"] = ano["Channel"].values
df["EventRecordID"] = ano["RecordID"].values

labeled = preprocess.anotate_csv(
    csv_filepath = INTERIM_DIR/project_name/"security2.csv",
    ano_df = df,
    output_dir = INTERIM_DIR/project_name,
)

#### 異常期間の同定
- **期間**：3/14 15:00 ~ 3/17 17:17 (21683件)

In [39]:
labeled[labeled["Label"] == "anomaly"]

Unnamed: 0,Security_UserID,Level,Keywords,Channel,Content,EventID,Correlation_ActivityID,TimeCreated_SystemTime,Correlation_RelatedActivityID,Task,Execution_ProcessID,Execution_ThreadID,Provider_Name,Computer,Version,Opcode,EventRecordID,Provider_Guid,Label
144356,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-14 15:00:01.005144+00:00,,13312,4,4488,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1511462,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
144358,,0,0x8020000000000000,Security,ProcessID=2572|Application=\device\harddiskvol...,5156,,2024-03-14 15:00:49.232059+00:00,,12810,4,4488,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1511464,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
144361,,0,0x8020000000000000,Security,ProcessID=1264|Application=\device\harddiskvol...,5156,,2024-03-14 15:01:49.291647+00:00,,12810,4,4488,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1511467,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
144363,,0,0x8020000000000000,Security,ProcessID=2572|Application=\device\harddiskvol...,5156,,2024-03-14 15:01:49.317078+00:00,,12810,4,4488,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1511469,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
144365,,0,0x8020000000000000,Security,ProcessID=2572|Application=\device\harddiskvol...,5156,,2024-03-14 15:02:49.371119+00:00,,12810,4,4488,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1511471,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195225,,0,0x8020000000000000,Security,ProcessID=1264|Application=\device\harddiskvol...,5156,,2024-03-17 17:16:04.655096+00:00,,12810,4,6388,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1562331,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
195228,,0,0x8020000000000000,Security,ProcessID=2572|Application=\device\harddiskvol...,5156,,2024-03-17 17:16:28.634140+00:00,,12810,4,3960,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1562334,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
195229,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-17 17:16:28.766081+00:00,,13312,4,3960,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1562335,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly
195230,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-17 17:16:29.024967+00:00,,13312,4,8988,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1562336,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly


### パース(Drain)

In [30]:
project_name = 'WEB2'
ver_number = "2"
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security2', parser_type="drain")

Parsing file: ../data/interim/WEB2/buffer.csv
Total size after reading CSV: 215806
Parsing done. [Time taken: 0:00:25.667182]


## 1-5 WEB03

### 準備

In [33]:
parent_dir = "ScenarioData"
project_name = 'WEB3'
ver_number = "2"

### evtx → csv

In [34]:
# test
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath = input_dir/"20240317054607_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = f"security{ver_number}_test",
)

Processing records:   2%|▏         | 4900/215773 [00:00<00:36, 5798.09it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 46172.44it/s]


In [35]:
# 実行
preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath = input_dir/"20240317054607_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Security.evtx",
    output_dir = output_dir,
    output_filename = f"security{ver_number}",
)

Processing records: 100%|██████████| 215773/215773 [26:04<00:00, 137.94it/s]
Writing to CSV: 100%|██████████| 215773/215773 [00:03<00:00, 62537.91it/s]


### アノテーション

### パース(Drain)

In [17]:
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name

df = pd.read_csv(input_dir/"security.csv")
df = df.dropna(subset=["Content"]) # Contentカラムに欠損値をもつ行を削除
df.to_csv(input_dir / "security_clean.csv", index=False)

In [18]:
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security_clean', parser_type="drain")

Parsing file: ../data/interim/WEB3/security_clean.csv
Initial DataFrame:
    Task  Level                Computer  EventID  \
0  12804      0  SWAttckd.swtestnet.com     4702   
1  12804      0  SWAttckd.swtestnet.com     4702   
2  12804      0  SWAttckd.swtestnet.com     4702   
3  12804      0  SWAttckd.swtestnet.com     4702   
4  12810      0  SWAttckd.swtestnet.com     5158   

                   Correlation_ActivityID  \
0  {326660a5-67c0-0001-6861-6632c067da01}   
1  {326660a5-67c0-0001-6861-6632c067da01}   
2  {326660a5-67c0-0001-6861-6632c067da01}   
3  {326660a5-67c0-0001-6861-6632c067da01}   
4                                     NaN   

                                             Content  EventRecordID  \
0  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...        1367106   
1  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...        1367107   
2  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...        1367108   
3  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...       

## 1-6 総合データ（Integrated）

### csv ⇒ 統合csv

In [15]:
input_dir = output_dir = INTERIM_DIR/"Integrated"

project_list = ["T1105", "WEB1", "WEB2"]

# T1105 だけフルデータが別ディレクトリ下なので、別扱い
df = pd.read_csv(NO_MEANING_DIR/"T1105/security2_complement.csv")
df["project"] = "T1105_C"

for project in project_list:
    dir = Path(INTERIM_DIR / project / "security2.csv")
    temp_df = pd.read_csv(dir)
    temp_df["project"] = str(project)    
    df = pd.concat([df, temp_df], ignore_index=True)

df.to_csv(output_dir/"security_integrated.csv", index=False)

### パース(Drain)

In [16]:
input_dir = output_dir = INTERIM_DIR/"Integrated"
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security_integrated', parser_type="drain", integrated=True)

Parsing file: ../data/interim/Integrated/buffer.csv
Total size after reading CSV: 1416987
Parsing done. [Time taken: 0:03:10.633761]


### モデル前データ作成

In [17]:
preprocess.prepare_integrated_model_data(
    logdata_filepath = INTERIM_DIR/"Integrated"/"security_integrated_structured.csv", 
    output_dir = PROCESSED_DIR/"Integrated",
    project_list = ["T1105_C","T1105", "WEB1", "WEB2"]
)

  data["timestamp"] = data["datetime"].view("int64") // 10**9


there are 42619 instances (sliding windows) in this dataset

there are 10396 instances (sliding windows) in this dataset

there are 16485 instances (sliding windows) in this dataset

there are 16512 instances (sliding windows) in this dataset

there are 42619 instances (sliding windows) in this dataset

there are 10396 instances (sliding windows) in this dataset

there are 16485 instances (sliding windows) in this dataset

there are 16512 instances (sliding windows) in this dataset

there are 42619 instances (sliding windows) in this dataset

there are 10396 instances (sliding windows) in this dataset

there are 16485 instances (sliding windows) in this dataset

there are 16512 instances (sliding windows) in this dataset

vocab size 34455


### 実行

In [None]:
# train
main.main_cli([
    "train",
    "bert/test",           # conf/bert_config_name.yaml
    #"default.device_id=0",        # ここに好きな key=value を並べる
    #"default.epochs=10",
])

setup
Building Vocab


100%|██████████| 34455/34455 [00:00<00:00, 72592.75it/s]


VOCAB SIZE: 60
before filtering short session
train size  63163
valid size  7018


100%|██████████| 70181/70181 [00:18<00:00, 3767.41it/s]


Num of train seqs 63127
Num of valid seqs 7018


97it [00:06, 17.75it/s]

⚠️ NaN detected in batch 93, skipping...


1972it [01:58, 16.60it/s]

⚠️ Total batches with NaN: 1



219it [00:03, 72.93it/s]


epoch 1 || TRAIN_Loss:2.2597 ||VAL_Loss:1.1614
Best epoch = 1


1972it [01:57, 16.85it/s]
219it [00:03, 71.61it/s]


epoch 2 || TRAIN_Loss:1.1468 ||VAL_Loss:1.0919
Best epoch = 2


1972it [01:58, 16.60it/s]
219it [00:03, 71.58it/s]


epoch 3 || TRAIN_Loss:1.1078 ||VAL_Loss:1.0852
Best epoch = 3


1972it [01:59, 16.49it/s]
105it [00:01, 67.78it/s]


RuntimeError: CUDA error: an illegal instruction was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
"""
HyperSphereLossなしでテスト
Jupyter Notebookで実行してください
"""

import sys
sys.path.insert(0, '/home/siwamura/My_lad/src')

from main import main_cli

print("=" * 80)
print("🚀 HyperSphereLoss無効化テスト")
print("=" * 80)
print("HyperSphereLossを無効化してトレーニングします")
print("これでNaNが消えれば、HyperSphereLossが原因です")
print("=" * 80)

try:
    main_cli([
        "train",
        "bert/test",
        "dataset.sample.seq_len=128",
        "default.epochs=2",
        "loss.hypersphere.bias=0.0",  # HyperSphereLossを無効化
        "loss.mask.bias=1.0",  # MaskLossのみ使用
    ])
    
    print("\n" + "=" * 80)
    print("✅ トレーニング完了（HyperSphereLossなし）")
    print("=" * 80)
    print("\n結論:")
    print("  HyperSphereLossを無効化すれば正常に動作する")
    print("  → HyperSphereLossの実装に問題がある")

except Exception as e:
    print(f"\n❌ Error: {e}")
    import traceback
    traceback.print_exc()


## 補足

### "Channel="を削ぐ
- 一応過程として残す

In [14]:
def remove_leading_channel_security(s):
    # NaN はそのまま返す
    if pd.isna(s):
        return s

    text = str(s)
    parts = text.split(";")

    # 先頭がちょうど "Channel=Security" なら、それを落とす
    if parts and parts[0] == "Channel=Security":
        parts = parts[1:]

    # 空文字を除いて再結合
    parts = [p for p in parts if p != ""]
    return ";".join(parts)

In [30]:
project_name = "WEB1"
input_dir = INTERIM_DIR/project_name

df = pd.read_csv(input_dir/"security.csv")
# Content カラムに適用
df["Content"] = df["Content"].apply(remove_leading_channel_security)
# ""をNaNに変換
df.loc[df["Content"] == "", "Content"] = np.nan

In [31]:
# 保存
df.to_csv(input_dir/"security_.csv", index=False)

### 余計なデータを消す
scenario：削除対象 [正常期間]
- VSCode1：(2024)5/22 [(2024)6/27~7/6]
- WEB1：(2023)11/22 ~ 11/23 [(2024)3/6~3/17]
- WEB2：(2023)11/22 ~ 11/23 [(2024)3/6~3/17]


In [11]:
preprocess.delete_unwanted_logs(
    input_filepath=Path(INTERIM_DIR /"VSCode1"/ "security2.csv"),
    start_date="2024-06-01",
    end_date="2024-07-30",
)

Unnamed: 0,TimeCreated_SystemTime,Security_UserID,Correlation_ActivityID,Channel,Version,Content,Correlation_RelatedActivityID,Provider_Guid,Keywords,Opcode,EventID,Provider_Name,Computer,Task,EventRecordID,Level,Execution_ProcessID,Execution_ThreadID,date
7458,2024-06-27 05:12:47.146389+00:00,,,Security,1,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4616,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12288,2543604,0,4,6596,2024-06-27
7459,2024-06-27 05:12:48.635813+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2543605,0,4,6596,2024-06-27
7460,2024-06-27 05:12:50.183855+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2543606,0,4,2332,2024-06-27
7461,2024-06-27 05:12:50.618172+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2543607,0,4,2648,2024-06-27
7462,2024-06-27 05:12:50.619019+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2543608,0,4,2988,2024-06-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93907,2024-07-06 18:35:40.437977+00:00,,,Security,0,ProcessId=676|Application=\device\harddiskvolu...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5158,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,2630053,0,4,5552,2024-07-06
93908,2024-07-06 18:35:40.438011+00:00,,,Security,1,ProcessID=676|Application=\device\harddiskvolu...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5156,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,2630054,0,4,5552,2024-07-06
93909,2024-07-06 18:35:40.498657+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2630055,0,4,6436,2024-07-06
93910,2024-07-06 18:35:40.527555+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,2630056,0,4,6436,2024-07-06


In [12]:
preprocess.delete_unwanted_logs(
    input_filepath=Path(INTERIM_DIR /"WEB1"/ "security2.csv"),
    start_date="2024-03-01",
    end_date="2024-03-30",
)

Unnamed: 0,TimeCreated_SystemTime,Security_UserID,Correlation_ActivityID,Channel,Version,Content,Correlation_RelatedActivityID,Provider_Guid,Keywords,Opcode,EventID,Provider_Name,Computer,Task,EventRecordID,Level,Execution_ProcessID,Execution_ThreadID,date,Label
0,2024-03-06 05:48:33.170567+00:00,,{326660a5-67c0-0001-6861-6632c067da01},Security,1,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4702,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12804,1367106,0,692,6000,2024-03-06,-
1,2024-03-06 05:48:33.174620+00:00,,{326660a5-67c0-0001-6861-6632c067da01},Security,1,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4702,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12804,1367107,0,692,6000,2024-03-06,-
2,2024-03-06 05:48:33.180763+00:00,,{326660a5-67c0-0001-6861-6632c067da01},Security,1,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4702,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12804,1367108,0,692,6000,2024-03-06,-
3,2024-03-06 05:48:33.184887+00:00,,{326660a5-67c0-0001-6861-6632c067da01},Security,1,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4702,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12804,1367109,0,692,6000,2024-03-06,-
4,2024-03-06 05:48:38.115097+00:00,,,Security,0,ProcessId=8596|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,5158,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,12810,1367110,0,4,5136,2024-03-06,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194069,2024-03-17 16:48:46.498323+00:00,,{81f03b91-7558-0001-6c3c-f0815875da01},Security,0,TargetUserName=WDAGUtilityAccount|TargetDomain...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4798,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13824,1561175,0,680,8556,2024-03-17,-
194070,2024-03-17 16:48:46.498625+00:00,,{81f03b91-7558-0001-6c3c-f0815875da01},Security,0,TargetUserName=一般|TargetDomainName=SWATTCKD|Ta...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4798,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13824,1561176,0,680,8556,2024-03-17,-
194071,2024-03-17 16:48:46.617451+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1561177,0,4,5188,2024-03-17,anomaly
194072,2024-03-17 16:48:46.646482+00:00,,,Security,2,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,,{54849625-5478-4994-a5ba-3e3b0328c30d},0x8020000000000000,0,4688,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,13312,1561178,0,4,2656,2024-03-17,anomaly


In [14]:
preprocess.delete_unwanted_logs(
    input_filepath=Path(INTERIM_DIR /"WEB2"/ "security2.csv"),
    start_date="2024-03-01",
    end_date="2024-03-30",
)

Unnamed: 0,Security_UserID,Level,Keywords,Channel,Content,EventID,Correlation_ActivityID,TimeCreated_SystemTime,Correlation_RelatedActivityID,Task,Execution_ProcessID,Execution_ThreadID,Provider_Name,Computer,Version,Opcode,EventRecordID,Provider_Guid,Label,date
0,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4702,{326660a5-67c0-0001-6861-6632c067da01},2024-03-06 05:48:33.170567+00:00,,12804,692,6000,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1367106,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-06
1,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4702,{326660a5-67c0-0001-6861-6632c067da01},2024-03-06 05:48:33.174620+00:00,,12804,692,6000,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1367107,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-06
2,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4702,{326660a5-67c0-0001-6861-6632c067da01},2024-03-06 05:48:33.180763+00:00,,12804,692,6000,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1367108,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-06
3,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4702,{326660a5-67c0-0001-6861-6632c067da01},2024-03-06 05:48:33.184887+00:00,,12804,692,6000,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,1,0,1367109,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-06
4,,0,0x8020000000000000,Security,ProcessId=8596|Application=\device\harddiskvol...,5158,,2024-03-06 05:48:38.115097+00:00,,12810,4,5136,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,0,0,1367110,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195229,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-17 17:16:28.766081+00:00,,13312,4,3960,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1562335,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly,2024-03-17
195230,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-17 17:16:29.024967+00:00,,13312,4,8988,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1562336,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly,2024-03-17
195231,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4688,,2024-03-17 17:16:29.153864+00:00,,13312,4,3960,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,2,0,1562337,{54849625-5478-4994-a5ba-3e3b0328c30d},anomaly,2024-03-17
195232,,0,0x8020000000000000,Security,SubjectUserSid=S-1-5-18|SubjectUserName=SWATTC...,4673,,2024-03-17 17:16:29.450386+00:00,,13056,4,8988,Microsoft-Windows-Security-Auditing,SWAttckd.swtestnet.com,0,0,1562338,{54849625-5478-4994-a5ba-3e3b0328c30d},-,2024-03-17


### T1105のフルバージョンのstructuredを作成

In [8]:
preprocess.parse_log(
    input_dir=NO_MEANING_DIR/"T1105", 
    output_dir=NO_MEANING_DIR/"T1105", 
    logfile_name='security2', 
    parser_type="drain", 
)

Parsing file: ../data/no_meaning/T1105/buffer.csv
Total size after reading CSV: 1027687
Parsing done. [Time taken: 0:02:07.250112]


### HDFS