## 共通

In [2]:
from pathlib import Path
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from xml.etree.ElementTree import fromstring, ElementTree
from Evtx.Evtx import Evtx
import csv
import sys

INTERIM_DIR = Path('../data/interim')
PROCESSED_DIR = Path('../data/processed')
RAW_DIR = Path('../data/raw')
NO_MEANING_DIR = Path('../data/no_meaning')

In [3]:
%load_ext autoreload
%autoreload 2
import preprocess

In [4]:
parent_dir = "ScenarioData"

## 実験１：security.csvの欠損値等 把握

In [8]:
# Scenario (project) names whose interim security.csv files are inspected.
project_name_list = ['T1105', 'VSCode1', 'WEB1', 'WEB2', 'WEB3']

dfs = {}  # maps project name -> DataFrame loaded from that project's security.csv

for project_name in project_name_list:
    input_dir = INTERIM_DIR / project_name
    csv_path = input_dir / "security.csv"  # change to match the file you want to load

    df = pd.read_csv(csv_path)
    dfs[project_name] = df

In [9]:
# Missing-value counts per column, one entry per project.
null_dict = {name: frame.isna().sum() for name, frame in dfs.items()}

# Projects become the columns and CSV column names the rows; gaps left by the
# alignment are filled with 0 before the integer cast.
null_matrix = pd.DataFrame(null_dict).fillna(0).astype(int)

# Number of rows in each project's DataFrame.
row_counts = {name: frame.shape[0] for name, frame in dfs.items()}

# Append the row totals as the last row of the matrix.
null_matrix.loc["__total_rows__"] = pd.Series(row_counts)

In [10]:
null_matrix

Unnamed: 0,T1105,VSCode1,WEB1,WEB2,WEB3
Channel,0,0,0,0,0
Computer,0,0,0,0,0
Content,2,5,2,2,2
Correlation_ActivityID,162811,81906,197463,197448,197370
Correlation_RelatedActivityID,177829,93912,215806,215808,215773
EventID,0,0,0,0,0
EventRecordID,0,0,0,0,0
Execution_ProcessID,0,0,0,0,0
Execution_ThreadID,0,0,0,0,0
Keywords,0,0,0,0,0


In [11]:
null_matrix_T = null_matrix.T
null_matrix_T

Unnamed: 0,Channel,Computer,Content,Correlation_ActivityID,Correlation_RelatedActivityID,EventID,EventRecordID,Execution_ProcessID,Execution_ThreadID,Keywords,...,Level,Opcode,Provider_Guid,Provider_Name,Security_UserID,Task,TimeCreated_SystemTime,Version,date,__total_rows__
T1105,0,0,2,162811,177829,0,0,0,0,0,...,0,0,0,0,177829,0,0,0,0,177829
VSCode1,0,0,5,81906,93912,0,0,0,0,0,...,0,0,0,0,93912,0,0,0,0,93912
WEB1,0,0,2,197463,215806,0,0,0,0,0,...,0,0,0,0,215806,0,0,0,0,215806
WEB2,0,0,2,197448,215808,0,0,0,0,0,...,0,0,0,0,215808,0,0,0,0,215808
WEB3,0,0,2,197370,215773,0,0,0,0,0,...,0,0,0,0,215773,0,0,0,0,215773


結論：   
全シナリオについて、以下は全行で欠損。
- Correlation_RelatedActivityID
- Security_UserID  


In [12]:
time_line = pd.read_csv(RAW_DIR/parent_dir/"T1105/command-and-control_timeline_20240927.csv", encoding="cp932")

In [13]:
time_line.isnull().sum()

Timestamp         0
RuleTitle         0
Level             0
Computer          0
Channel           0
EventID           0
RecordID          0
Details           0
ExtraFieldInfo    0
dtype: int64

## 実験２：パースまで

TypeError: expected string or bytes-like object, got 'float'  
これはNaN（欠損値）によるエラー

In [38]:
project_name = 'WEB1'
input_dir = INTERIM_DIR/project_name
output_dir = INTERIM_DIR/project_name

In [33]:
df = pd.read_csv(input_dir/"security.csv")

In [34]:
# Contentカラムについて処理（欠損値を含む行を削除）
df = df.dropna(subset=["Content"])

In [35]:
df.to_csv(input_dir / "security_clean.csv", index=False)

In [36]:
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security_clean', parser_type="drain")

Parsing file: ../data/interim/WEB1/security_clean.csv
Initial DataFrame:
   Execution_ThreadID   Channel  Level  Opcode   Task  \
0                6000  Security      0       0  12804   
1                6000  Security      0       0  12804   
2                6000  Security      0       0  12804   
3                6000  Security      0       0  12804   
4                5136  Security      0       0  12810   

                   Correlation_ActivityID  \
0  {326660a5-67c0-0001-6861-6632c067da01}   
1  {326660a5-67c0-0001-6861-6632c067da01}   
2  {326660a5-67c0-0001-6861-6632c067da01}   
3  {326660a5-67c0-0001-6861-6632c067da01}   
4                                     NaN   

                                             Content  Security_UserID  \
0  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...              NaN   
1  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...              NaN   
2  SubjectUserSid=S-1-5-18;SubjectUserName=SWATTC...              NaN   
3  SubjectUserSid=S-1

## 実験３：行（ログ件数）の削減（T1105のみ）

方針：  
- 8/13~9/27までのログであり、量があまりにも膨大（CSVでおよそ500MB）。
- WEB系の100MB（10日間）を参考に、異常ログを含む形で適当に削減。
- 削減は"日"単位で行う。
- 入力ファイルは NO_MEANING_DIR から読み込む。**出力ファイル作成の際の上書きに注意**。

In [4]:
# Scenario to reduce, and the version suffix of its security<N>.csv file.
project_name = 'T1105'
ver_number = 2
# The full-size log is read from NO_MEANING_DIR; the reduced one goes to INTERIM_DIR.
input_dir = NO_MEANING_DIR/project_name
output_dir = INTERIM_DIR/project_name
data = pd.read_csv(input_dir/f"security{ver_number}.csv")
# Analysis workbook; later cells take the channel and EventRecordID columns from it.
ano = pd.read_excel(RAW_DIR/parent_dir/project_name/"取りまとめ後参考"/"AtomicRedTeam_Analyze.xlsx", sheet_name="解析結果")

In [52]:
 # ラベルを取得（インデックスとして扱う準備）
# Pull the channel and record-ID columns out of the analysis sheet, skipping
# its first row.
df = pd.DataFrame(
    {
        "Channel": ano["Unnamed: 9"].iloc[1:].values,
        "EventRecordID": ano["Unnamed: 14"].iloc[1:].values,
    }
)

# Keep only the rows whose channel is 'Sec'.
df = df.loc[df["Channel"] == 'Sec']

In [53]:
# Records whose EventRecordID appears in the annotated list are labelled
# "anomaly"; every other record gets "-".
mask = data["EventRecordID"].isin(df["EventRecordID"])
data["Label"] = mask.map({True: "anomaly", False: "-"})

In [54]:
data[data["Label"] == "anomaly"]

Unnamed: 0,Level,EventID,EventRecordID,TimeCreated_SystemTime,Channel,Task,Provider_Name,Correlation_RelatedActivityID,Execution_ProcessID,Execution_ThreadID,Security_UserID,Keywords,Content,Correlation_ActivityID,Provider_Guid,Computer,Opcode,Version,Label
1020645,0,4688,4718654,2024-09-26 17:57:13.401667+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,300,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly
1020646,0,4688,4718655,2024-09-26 17:57:13.409899+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,32,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly
1020647,0,4688,4718656,2024-09-26 17:57:13.442478+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,32,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly
1020701,0,4688,4718710,2024-09-26 17:59:47.880032+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,8064,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly
1020702,0,4688,4718711,2024-09-26 17:59:47.883730+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,8744,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1021489,0,5156,4719498,2024-09-26 18:48:50.349300+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,6840,,0x8020000000000000,ProcessID=1284|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly
1021491,0,5156,4719500,2024-09-26 18:48:50.512531+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,6840,,0x8020000000000000,ProcessID=6912|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly
1021494,0,5156,4719503,2024-09-26 18:48:50.929255+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,300,,0x8020000000000000,ProcessID=6912|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,anomaly
1021779,0,4688,4719788,2024-09-26 18:55:46.865633+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,4972,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,anomaly


- 異常ログは9/26に発生している
- よって、10日間(9/18~9/27)を切り出す。

In [5]:
# Parse the event timestamps; values that cannot be parsed become NaT.
data["TimeCreated_SystemTime"] = pd.to_datetime(
    data["TimeCreated_SystemTime"],
    errors="coerce",
)

# Calendar date of each event (the reduction is done per day).
data["date"] = data["TimeCreated_SystemTime"].dt.date

# Keep the 10-day window 2024-09-18 .. 2024-09-27, both ends inclusive, which
# contains the anomalous records of 2024-09-26. The upper bound used to be
# 2024-09-28, which contradicted the stated 10-day range.
window_start = pd.to_datetime("2024-09-18").date()
window_end = pd.to_datetime("2024-09-27").date()

filtered = data[
    (data["date"] >= window_start) &
    (data["date"] <= window_end)
]

In [56]:
filtered

Unnamed: 0,Level,EventID,EventRecordID,TimeCreated_SystemTime,Channel,Task,Provider_Name,Correlation_RelatedActivityID,Execution_ProcessID,Execution_ThreadID,Security_UserID,Keywords,Content,Correlation_ActivityID,Provider_Guid,Computer,Opcode,Version,Label,date
849867,0,5156,4547876,2024-09-18 00:00:14.836159+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2548,,0x8020000000000000,ProcessID=5236|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
849868,0,5158,4547877,2024-09-18 00:00:36.620955+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2548,,0x8020000000000000,ProcessId=2800|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,0,-,2024-09-18
849869,0,5156,4547878,2024-09-18 00:00:36.621227+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2548,,0x8020000000000000,ProcessID=2800|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
849870,0,5156,4547879,2024-09-18 00:00:55.269754+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2548,,0x8020000000000000,ProcessID=5236|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
849871,0,5156,4547880,2024-09-18 00:01:35.623026+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2548,,0x8020000000000000,ProcessID=5236|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027691,0,5156,4725700,2024-09-27 00:07:55.983658+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,2684,,0x8020000000000000,ProcessID=4|Application=System|Direction=%%145...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-27
1027692,0,5158,4725701,2024-09-27 00:07:55.987034+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,8524,,0x8020000000000000,ProcessId=4|Application=System|SourceAddress=0...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,0,-,2024-09-27
1027693,0,5156,4725702,2024-09-27 00:07:55.987099+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,8524,,0x8020000000000000,ProcessID=4|Application=System|Direction=%%145...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-27
1027694,0,4688,4725703,2024-09-27 00:07:56.035160+00:00,Security,13312,Microsoft-Windows-Security-Auditing,,4,10016,,0x8020000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,2,-,2024-09-27


In [6]:
# ---- CAUTION: this overwrites any existing file of the same name ---- #
filtered.to_csv(output_dir/f"security{ver_number}.csv", index=False)

## 実験４：モデル前データ作成！

### 準備

In [58]:
# Inputs are read from the interim T1105 directory; model-ready files are
# written under the processed directory.
project_name = 'T1105'
input_dir = INTERIM_DIR/project_name
output_dir = PROCESSED_DIR/project_name
# parents=True so that a missing ../data/processed does not raise
# FileNotFoundError (os.makedirs further down already behaves this way).
output_dir.mkdir(parents=True, exist_ok=True)

In [16]:
data = pd.read_csv(input_dir/"security_clean_structured.csv")
ano = pd.read_excel(RAW_DIR/parent_dir/project_name/"取りまとめ後参考"/"AtomicRedTeam_Analyze.xlsx", sheet_name="解析結果")

In [17]:
 # ラベルを取得（インデックスとして扱う準備）
# Pull the channel and record-ID columns out of the analysis sheet, skipping
# its first row.
df = pd.DataFrame(
    {
        "Channel": ano["Unnamed: 9"].iloc[1:].values,
        "EventRecordID": ano["Unnamed: 14"].iloc[1:].values,
    }
)

# Keep only the rows whose channel is 'Sec'.
df = df.loc[df["Channel"] == 'Sec']

# Records whose EventRecordID appears in the annotated list are labelled
# "anomaly"; every other record gets "-".
mask = data["EventRecordID"].isin(df["EventRecordID"])
data["Label"] = mask.map({True: "anomaly", False: "-"})

In [18]:
# Derive the model inputs: binary label, epoch seconds and inter-event gap.

# 0 for normal ("-"), 1 for anything else. A value that is already 0 stays 0,
# so re-running this cell no longer turns every row into an anomaly.
data["Label"] = data["Label"].map(lambda value: int(value not in (0, "-")))
data["datetime"] = pd.to_datetime(data["TimeCreated_SystemTime"])
# Series.view is deprecated; astype("int64") returns the same underlying
# integers (nanoseconds since the epoch for the default datetime64[ns] unit),
# which are floored to whole seconds here.
data["timestamp"] = data["datetime"].astype("int64") // 10**9
# Seconds elapsed since the previous row; the first row has no predecessor, so 0.
data["deltaT"] = data["datetime"].diff().dt.total_seconds().fillna(0)

  data["timestamp"] = data["datetime"].view("int64") // 10**9


In [19]:
data

Unnamed: 0,LineId,Execution_ThreadID,Level,Version,TimeCreated_SystemTime,Correlation_RelatedActivityID,Provider_Guid,Computer,EventRecordID,Provider_Name,...,Task,EventID,Execution_ProcessID,Label,date,EventId,EventTemplate,datetime,timestamp,deltaT
0,1,2548,0,1,2024-09-18 00:00:14.836159+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4547876,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-18,f469db94,ProcessID=<*>;Application=\device\harddiskvolu...,2024-09-18 00:00:14.836159+00:00,1726617614,0.000000
1,2,2548,0,0,2024-09-18 00:00:36.620955+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4547877,Microsoft-Windows-Security-Auditing,...,12810,5158,4,0,2024-09-18,1bd2065f,ProcessId=<*>;Application=\device\harddiskvolu...,2024-09-18 00:00:36.620955+00:00,1726617636,21.784796
2,3,2548,0,1,2024-09-18 00:00:36.621227+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4547878,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-18,dfb7a98c,ProcessID=<*>;Application=\device\harddiskvolu...,2024-09-18 00:00:36.621227+00:00,1726617636,0.000272
3,4,2548,0,1,2024-09-18 00:00:55.269754+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4547879,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-18,f469db94,ProcessID=<*>;Application=\device\harddiskvolu...,2024-09-18 00:00:55.269754+00:00,1726617655,18.648527
4,5,2548,0,1,2024-09-18 00:01:35.623026+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4547880,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-18,f469db94,ProcessID=<*>;Application=\device\harddiskvolu...,2024-09-18 00:01:35.623026+00:00,1726617695,40.353272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177822,177823,2684,0,1,2024-09-27 00:07:55.983658+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4725700,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-27,19d38fdc,ProcessID=<*>;Application=System;Direction=%%<...,2024-09-27 00:07:55.983658+00:00,1727395675,0.000048
177823,177824,8524,0,0,2024-09-27 00:07:55.987034+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4725701,Microsoft-Windows-Security-Auditing,...,12810,5158,4,0,2024-09-27,58b8a4e0,ProcessId=<*>;Application=System;SourceAddress...,2024-09-27 00:07:55.987034+00:00,1727395675,0.003376
177824,177825,8524,0,1,2024-09-27 00:07:55.987099+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4725702,Microsoft-Windows-Security-Auditing,...,12810,5156,4,0,2024-09-27,19d38fdc,ProcessID=<*>;Application=System;Direction=%%<...,2024-09-27 00:07:55.987099+00:00,1727395675,0.000065
177825,177826,10016,0,2,2024-09-27 00:07:56.035160+00:00,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,4725703,Microsoft-Windows-Security-Auditing,...,13312,4688,4,0,2024-09-27,5b31d60c,SubjectUserSid=S-<*>-<*>;SubjectUserName=taro;...,2024-09-27 00:07:56.035160+00:00,1727395676,0.048061


In [20]:
# Window length and stride; both are multiplied by 60 before being passed on
# (presumably minutes converted to seconds, since "timestamp" is in seconds).
window_size = 5
step_size = 1

window_para = {
    "window_size": int(window_size) * 60,
    "step_size": int(step_size) * 60,
}

# Sample the log with a sliding window over the selected columns.
deeplog_df = preprocess.sliding_window(
    data[["timestamp", "Label", "EventId", "deltaT"]],
    para=window_para,
)

there are 12742 instances (sliding windows) in this dataset



In [21]:
# Partition the windows by label: 0 = normal, 1 = abnormal.
df_normal = deeplog_df.loc[deeplog_df["Label"].eq(0)]
df_abnormal = deeplog_df.loc[deeplog_df["Label"].eq(1)]

# Shuffle the normal windows with a fixed seed and renumber them from 0.
df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True)
normal_len = df_normal.shape[0]

### 作成

In [99]:
train_ratio_list = [0.4, 0.6, 0.8]
for train_ratio in train_ratio_list:
    # The first train_len shuffled normal windows are used for training and
    # the remainder for the normal test set.
    train_len = int(normal_len * train_ratio)
    save_dir = output_dir/f'ratio_{train_ratio}'
    os.makedirs(save_dir, exist_ok=True)

    train = df_normal.iloc[:train_len]
    test_normal = df_normal.iloc[train_len:]

    # If needed, the EventId mapping can be restored before writing the
    # abnormal split:
    # df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(
    #     lambda e: event_index_map[e] if event_index_map.get(e) else UNK
    # )

    # (file stem, frame to write, message template, reported size)
    splits = [
        ("train", train, "training size {}", train_len),
        ("test_normal", test_normal, "test normal size {}", normal_len - train_len),
        ("test_abnormal", df_abnormal, "test abnormal size {}", len(df_abnormal)),
    ]
    for stem, frame, message, size in splits:
        preprocess.deeplog_file_generator(
            filename=f"{save_dir}/{stem}",
            df=frame,
            features=["EventId", "deltaT"],
        )
        print(message.format(size))

training size 5076
test normal size 7615
test abnormal size 51
test normal size 7615
test abnormal size 51
training size 7614
training size 7614
test normal size 5077
test abnormal size 51
test normal size 5077
test abnormal size 51
training size 10152
training size 10152
test normal size 2539
test abnormal size 51
test normal size 2539
test abnormal size 51


### 補助実験：vocab用データ作成
- 完全な正常データ（EventIdのみ）のファイルを作成する
- 保存先は project_name / vocab 下

In [None]:
# A ratio of 1.0 selects every normal window.
train_ratio = 1.0
train_len = int(normal_len * train_ratio)

save_dir = output_dir/'vocab'
os.makedirs(save_dir, exist_ok=True)

# Write the normal windows with the EventId feature only.
train = df_normal.iloc[:train_len]
preprocess.deeplog_file_generator(
    filename=f"{save_dir}/train",
    df=train,
    features=["EventId"],
)
print("training size {}".format(train_len))

training size 12691


## 実験５：モデル動作検証（training）
- confはとりあえず「art3.yaml」を参考にした「test.yaml」で指定

In [4]:
%load_ext autoreload
%autoreload 2
import main

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Run training through the project's CLI entry point.
main.main_cli([
    "train",
    "bert/test",           # conf/bert_config_name.yaml
    #"default.device_id=0",        # optional key=value overrides can be listed here
    #"default.epochs=10",
])

In [None]:
# Run evaluation through the project's CLI entry point.
main.main_cli([
    "test",  # run_mode
    "outputs/logbert/bert/T1105/ratio_0.6/seq_len_128/r_seed_31/weights/ValTotalbest.pth", # relative path to the weights file
    "64",   # eval_batchsize
    "cuda:0",     # gpu
])

outputs/logbert/bert/T1105/ratio_0.6/seq_len_128/r_seed_31
cuda:0 cuda:0
Building Vocab


100%|██████████| 12691/12691 [00:00<00:00, 61864.31it/s]


VOCAB SIZE: 990


Processing normal test data: 100%|██████████| 5077/5077 [00:01<00:00, 4061.19it/s]
Processing abnormal test data: 100%|██████████| 51/51 [00:00<00:00, 3650.21it/s]
100%|██████████| 78/78 [00:07<00:00, 10.81it/s]
100%|██████████| 51/51 [00:01<00:00, 45.68it/s]


Saving test normal results
Saving test abnormal results
[seq_th, FP, TP, TN, FN, Precision, Recall, F1, TPR, FPR]
thresholds=0.0['1730.0000', '50.0000', '3262.0000', '1.0000', '2.8090', '98.0392', '5.4615', '34.6554', '98.0392']
thresholds=0.1['578.0000', '43.0000', '4414.0000', '8.0000', '6.9243', '84.3137', '12.7976', '11.5785', '84.3137']
thresholds=0.2['189.0000', '33.0000', '4803.0000', '18.0000', '14.8649', '64.7059', '24.1758', '3.7861', '64.7059']
thresholds=0.3['83.0000', '16.0000', '4909.0000', '35.0000', '16.1616', '31.3725', '21.3333', '1.6627', '31.3725']
thresholds=0.4['35.0000', '7.0000', '4957.0000', '44.0000', '16.6667', '13.7255', '15.0538', '0.7011', '13.7255']
thresholds=0.5['14.0000', '0.0000', '4978.0000', '51.0000', '0.0000', '0.0000', '0.0000', '0.2804', '0.0000']
thresholds=0.6['9.0000', '0.0000', '4983.0000', '51.0000', '0.0000', '0.0000', '0.0000', '0.1803', '0.0000']
thresholds=0.7['4.0000', '0.0000', '4988.0000', '51.0000', '0.0000', '0.0000', '0.0000', '0.

## 余談

### T1105の余事象サブセット作成

In [8]:
t1105_dir = NO_MEANING_DIR/"T1105"
data = pd.read_csv(t1105_dir/"security2.csv")

# Parse the timestamps (unparseable values become NaT) and derive the date.
data["TimeCreated_SystemTime"] = pd.to_datetime(data["TimeCreated_SystemTime"], errors="coerce")
data["date"] = data["TimeCreated_SystemTime"].dt.date

# Keep everything dated strictly before 2024-09-18.
cutoff = pd.to_datetime("2024-09-18").date()
filtered = data.loc[data["date"] < cutoff]

filtered.to_csv(t1105_dir/"security2_complement.csv", index=False)

In [6]:
filtered

Unnamed: 0,Level,EventID,EventRecordID,TimeCreated_SystemTime,Channel,Task,Provider_Name,Correlation_RelatedActivityID,Execution_ProcessID,Execution_ThreadID,Security_UserID,Keywords,Content,Correlation_ActivityID,Provider_Guid,Computer,Opcode,Version,Label,date
0,4,1102,3698009,2024-08-13 09:13:51.656221+00:00,Security,104,Microsoft-Windows-Eventlog,,1276,10556,,0x4020000000000000,,,{fc65ddd8-d6ef-4962-83d5-6e5cfe9ce148},SWAttckd.swtestnet.com,0,1,-,2024-08-13
1,0,4673,3698010,2024-08-13 09:14:00.897017+00:00,Security,13056,Microsoft-Windows-Security-Auditing,,4,5320,,0x8010000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,0,-,2024-08-13
2,0,4673,3698011,2024-08-13 09:14:15.903051+00:00,Security,13056,Microsoft-Windows-Security-Auditing,,4,5320,,0x8010000000000000,SubjectUserSid=S-1-5-21-143320146-2996204461-2...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,0,-,2024-08-13
3,0,5156,3698012,2024-08-13 09:14:16.729042+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,5320,,0x8020000000000000,ProcessID=5216|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-08-13
4,0,5156,3698013,2024-08-13 09:14:17.731905+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,5320,,0x8020000000000000,ProcessID=5216|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-08-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868046,0,5156,4566055,2024-09-18 23:59:53.532356+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,4632,,0x8020000000000000,ProcessID=1176|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
868047,0,5156,4566056,2024-09-18 23:59:53.976131+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,4632,,0x8020000000000000,ProcessID=1176|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
868048,0,5156,4566057,2024-09-18 23:59:53.976151+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,4632,,0x8020000000000000,ProcessID=1176|Application=\device\harddiskvol...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18
868049,0,5156,4566058,2024-09-18 23:59:54.286472+00:00,Security,12810,Microsoft-Windows-Security-Auditing,,4,4632,,0x8020000000000000,ProcessID=4|Application=System|Direction=%%145...,,{54849625-5478-4994-a5ba-3e3b0328c30d},SWAttckd.swtestnet.com,0,1,-,2024-09-18


In [35]:
project_name = 'WEB1'
input_dir = INTERIM_DIR/project_name
output_dir = NO_MEANING_DIR/"Windows_5000"

In [36]:
# parents=True so the call also succeeds when the parent directory is missing.
output_dir.mkdir(parents=True, exist_ok=True)

In [37]:
# Load the log and discard rows without Content.
df = pd.read_csv(input_dir/"security2.csv").dropna(subset=["Content"])

# Take the final 5000 rows (all rows if there are fewer).
last_5000_rows = df.iloc[-5000:]

# Save the result.
last_5000_rows.to_csv(output_dir/"windows_events_5000_part3.csv", index=False)


### LLMParser 用の教師データ作成

In [30]:
# Source scenario and the directory that collects the 5000-row sample files.
project_name = 'VSCode1'
input_dir = INTERIM_DIR/project_name
output_dir = NO_MEANING_DIR/"Windows_5000"
# parents=True so the call also succeeds when the parent directory is missing.
output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Load the log and discard rows without Content.
df = pd.read_csv(input_dir / "security.csv").dropna(subset=["Content"])

# Cap the sample size at the number of available rows.
n_samples = len(df) if len(df) < 5000 else 5000

# Draw the rows at random with a fixed seed and save them.
sampled_rows = df.sample(n=n_samples, random_state=42)
sampled_rows.to_csv(output_dir / "security_5000.csv", index=False)

In [26]:
# security2.csv of the three scenarios, as plain strings.
input1, input2, input3 = (
    str(INTERIM_DIR/name/"security2.csv") for name in ("T1105", "VSCode1", "WEB1")
)

# Kept as a string because the output paths are built by concatenation.
output_dir = str(NO_MEANING_DIR/"Windows_5000")

sample1, sample2 = preprocess.stratified_sample_by_eventid_two_sets(
    input_files=[input1, input2, input3],
    output_file1=f"{output_dir}/windows_events_5000_part1.csv",
    output_file2=f"{output_dir}/windows_events_5000_part2.csv",
    target_n_each=5000,
    event_id_col="EventID",
)

結合後の全レコード数: 1337414
サンプル1件数: 5000
サンプル1除外後の残りレコード数: 1332414
サンプル2件数: 5000
サンプル1・2の重複インデックス数: 0
サンプル1を保存しました: ../data/no_meaning/Windows_5000/windows_events_5000_part1.csv
サンプル2を保存しました: ../data/no_meaning/Windows_5000/windows_events_5000_part2.csv


In [28]:
df1 = pd.read_csv(output_dir+"/windows_events_5000_part1.csv")
df2 = pd.read_csv(output_dir+"/windows_events_5000_part2.csv")

df1["EventID"].value_counts()

EventID
5156    2380
5158    1738
4673     264
4688     147
4702     115
        ... 
4614       1
4660       1
5382       1
4616       1
5381       1
Name: count, Length: 65, dtype: int64

### 正規表現調整してDrain

In [59]:
project_name = 'VSCode1'
# Input and output share the same interim directory.
input_dir = output_dir = INTERIM_DIR/project_name

# Remove rows whose Content is missing, then save the cleaned copy.
df = pd.read_csv(input_dir/"security2.csv").dropna(subset=["Content"])
df.to_csv(input_dir / "security2_clean.csv", index=False)

In [None]:
preprocess.parse_log(input_dir=input_dir, output_dir=output_dir, logfile_name='security2_clean', parser_type="drain")

なんかうまくいってそう...

In [60]:
df["EventID"].value_counts()

EventID
5156    25244
4673    22852
5158    21596
5447     6188
4688     5687
5154     2165
4985     2138
4702     1449
4907     1228
4945      626
5157      564
4627      498
4624      498
4672      488
5379      472
4797      223
4776      219
4625      209
4948      178
4946      176
4798      158
4799      144
4674      135
4957      126
4663      123
5061      114
4611       78
4662       59
6416       47
4697       43
4634       34
4648       21
4622       20
5142       13
5382       10
4698        9
4699        8
4657        8
4616        7
4947        6
4953        6
4701        4
4660        4
4614        4
5140        3
4902        2
4610        2
4826        2
4696        2
4944        2
4718        2
4700        2
4717        2
4779        2
4778        2
4800        2
4647        2
4732        1
Name: count, dtype: int64

- だいたいイベント数同じ...

In [77]:
df_ = pd.read_csv(input_dir/"security2_clean_structured.csv")

In [78]:
df_["pair_id"] = df_.groupby(["EventId", "EventID"]).ngroup() + 1

In [79]:
# index=False: otherwise the row index is written as an extra unnamed column,
# and because this overwrites the file that was just read, every re-run would
# add another "Unnamed: 0" column.
df_.to_csv(input_dir/"security2_clean_structured.csv", index=False)

### Sysmon のパース

In [4]:
parent_dir = "ScenarioData"
project_name = 'VSCode1'

In [6]:
# Trial conversion of the Sysmon log, written as "Sysmon_test"
# (the helper name suggests a sampling variant - confirm in preprocess).
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name
sysmon_evtx = input_dir/"20240706063537_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Microsoft-Windows-Sysmon%4Operational.evtx"

preprocess.evtx_to_csv_without_eventdata_columns_samplingver(
    evtx_filepath=sysmon_evtx,
    output_dir=output_dir,
    output_filename="Sysmon_test",
)

Processing records:   1%|          | 4900/839211 [00:00<02:16, 6111.97it/s]
Writing to CSV: 100%|██████████| 50/50 [00:00<00:00, 19790.05it/s]


In [None]:
# Actual conversion of the Sysmon log, written as "Sysmon".
input_dir = RAW_DIR/parent_dir/project_name
output_dir = INTERIM_DIR/project_name
sysmon_evtx = input_dir/"20240706063537_7EA74D56-6663-313B-2CC1-A7843FCD1AE6/Evtx/Microsoft-Windows-Sysmon%4Operational.evtx"

preprocess.evtx_to_csv_without_eventdata_columns(
    evtx_filepath=sysmon_evtx,
    output_dir=output_dir,
    output_filename="Sysmon",
)