In [13]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from datetime import datetime
import json

In [14]:
# Load the two JSON inputs: the records to preprocess and the lookup table
# that is later queried with each record's "web_upload_tag" value.
with open("learning_output_data.json", encoding="utf-8") as f:
    json_data = json.loads(f.read())
with open("Tag_Onehot.json", encoding="utf-8") as f:
    onehot_data = json.loads(f.read())

In [15]:
# Load the pretrained BERT tokenizer and model.
# Both are used as module-level globals by embed_text().
# NOTE(review): embed_text() hard-codes a 768-element zero vector for empty input,
# so changing model_name to a checkpoint with a different hidden size would make
# the empty-input vectors inconsistent with real embeddings — confirm before switching.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)


In [16]:
# 1. Timestamp processing
def process_timestamp(timestamp, tz=None):
    """Convert a timestamp string into POSIX epoch seconds.

    Two layouts are accepted: "%Y-%m-%d %H:%M:%S.%f" (with fractional seconds)
    and "%Y-%m-%d %H:%M:%S" (without).

    Args:
        timestamp: Timestamp string in one of the layouts above.
        tz: Optional ``datetime.tzinfo`` the parsed wall-clock time should be
            interpreted in (e.g. ``timezone.utc``). When ``None`` (the default)
            the parsed value stays naive and ``datetime.timestamp()`` interprets
            it in the machine's local timezone, so the result depends on where
            the notebook runs. Pass an explicit ``tz`` for reproducible output.

    Returns:
        float: Seconds since the Unix epoch.

    Raises:
        ValueError: If ``timestamp`` matches neither layout.
    """
    try:
        dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # No fractional part present: retry with the whole-second layout.
        dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    if tz is not None:
        # Attach the timezone without shifting the wall-clock fields.
        dt = dt.replace(tzinfo=tz)
    return dt.timestamp()

In [17]:
# 2. Text embedding
def embed_text(text):
    """Embed a string with the module-level BERT model.

    Args:
        text: Value to embed. Anything that is not a non-blank ``str``
            (``None``, other types, empty or whitespace-only strings) is
            treated as missing.

    Returns:
        list: For missing input, a list of 768 zeros. Otherwise the model's
        ``last_hidden_state`` averaged over dim 1, squeezed and converted to a
        Python list.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        # Missing values are represented by a 768-dimensional zero vector.
        return [0.0] * 768

    # Input is truncated to at most 512 tokens.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():  # inference only, no gradient tracking
        model_output = bert_model(**encoded)
        pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().tolist()


In [18]:
# One-hot vectors for the known service types; each name's position in the
# tuple below is the index of its 1.
service_type_mapping = {
    name: [1 if col == row else 0 for col in range(3)]
    for row, name in enumerate(("mail", "drive", "blog"))
}
# Fallback vector used when a web_upload_tag is not found in onehot_data.
# NOTE(review): 99 presumably matches the vector length in Tag_Onehot.json — confirm.
zero_list = [0 for _ in range(99)]

In [19]:
# 3. Data preprocessing: turn every input record into a flat feature dictionary.

# Key prefixes of the text fields that are embedded with BERT.
TEXT_FIELD_PREFIXES = (
    "usnjrnl_file_names_",
    "lnk_files_linked_paths_",
    "jumplist_data_entries_",
    "usnjrnl_reasons_",
    "usnjrnl_attributes_",
    "edge_internet_explorer_urls_",
    "locally_accessed_files_paths_",
    "mru_recent_files_folder_names_",
)

processed_data = []

for record in json_data:
    processed_record = {}

    # 3.1 Metadata is stored as-is (None when the key is absent).
    processed_record["incident_type"] = record.get("incident_type")

    # 3.2 Timestamp -> epoch seconds (only when the record has one).
    if "Timestamp" in record:
        processed_record["Timestamp_processed"] = process_timestamp(record["Timestamp"])

    # 3.3 Numeric fields are copied unchanged.
    for key, value in record.items():
        if key.startswith("jumplist_access_counts_"):
            processed_record[key] = value

    # 3.4 Text fields are embedded. This stays a separate pass so that all
    # count fields precede all embedding fields in the output key order.
    for key, value in record.items():
        if key.startswith(TEXT_FIELD_PREFIXES):
            processed_record[f"{key}_embedding"] = embed_text(value)

    # Additionally: embed file_name, one-hot encode service_type and web_upload_tag.
    if "file_name" in record:
        processed_record["file_name_embedding"] = embed_text(record["file_name"])
    if "service_type" in record:
        service_type = record["service_type"]
        # Values missing from the mapping become [0, 0, 0].
        one_hot_service_type = service_type_mapping.get(service_type, [0, 0, 0])
        processed_record["service_type_one_hot"] = one_hot_service_type
    if "web_upload_tag" in record:
        tag_type = record["web_upload_tag"]
        # Tags missing from the lookup table fall back to the all-zero vector.
        one_hot_tag_type = onehot_data.get(tag_type, zero_list)
        processed_record["tag_type_one_hot"] = one_hot_tag_type

    processed_data.append(processed_record)


In [25]:
# 5. Save the preprocessed records to a JSON file.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes;
# indent=4 pretty-prints the output.
with open("processed_data.json", "w", encoding="utf-8") as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=4)