In [None]:
# 분류 모델에 사용할 최종 train 파일 확인용
import pandas as pd

#df = pd.read_csv("../data/processed/5000_w_정답라벨_42.csv") # random_state=42
df = pd.read_csv("../data/processed/5000_w_정답라벨_final.csv") # 랜덤 추출 -> json 넘길때 사용한 파일
#count = df.groupby(["predicted_category", "predicted_MAIN_NAME"]).size().reset_index(name="count")
#count = df.groupby(["predicted_category", "MAIN_NAME"]).size().reset_index(name="count")
count = df.groupby(["category", "MAIN_NAME"]).size().reset_index(name="count")
print(count)

                                category       MAIN_NAME  count
0                         개인정보보호법,정보통신망법    개인정보 유출·보호조치    399
1                         개인정보보호법,정보통신망법      개인정보보호법 기타    329
2                         개인정보보호법,정보통신망법     법적 분쟁·정치 연루    496
3                         개인정보보호법,정보통신망법   온라인·플랫폼 관련 이슈    261
4                                  아동복지법     법·제도·사회적 이슈    144
5                                  아동복지법  아동 유기·방임·사망 사건    135
6                                  아동복지법       아동 학대·성범죄    311
7                                  아동복지법        아동복지법 기타    104
8   자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법       가상자산·규제정책    442
9   자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법     금융사고·소비자 피해    202
10  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법     금융소비자보호법 기타    304
11  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법     시장·기업 관련 사건    453
12  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법      특검정치 연루 사건    267
13                               중대재해처벌법         산업재해 사건    513
14                               중대재해처벌법

In [None]:
# train/test
import os
import pickle

import numpy as np
import pandas as pd
from collections import defaultdict

import torch
import torch.nn as nn
from torch.nn.functional import softmax

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModel,
    TrainingArguments, Trainer,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import shutil

FILE_PATH_WITH_FEATURES = "../data/processed/5000_w_label.csv"
MODEL_NAME = "klue/roberta-base"
MAX_LENGTH = 512
SLIDING_STRIDE = 256
OTHER_NAME = "기타"

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 로더(인코딩 견고)
def load_data_with_robust_encodings(file_path):
    encodings = ['utf-8', 'euc-kr', 'cp949']
    last_err = None
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            return df
        except UnicodeDecodeError as e:
            last_err = e
            continue
    raise Exception(f"파일 로드 실패: {file_path} / 마지막 오류: {last_err}")

# 데이터 로드
df = load_data_with_robust_encodings(FILE_PATH_WITH_FEATURES).copy()

# content/라벨 필수 컬럼 체크
required_cols = {"content", "category", "MAIN_NAME"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"필수 컬럼 누락: {missing}")

df["content"] = df["content"].astype(str)
df["category"] = df["category"].astype(str)
df["MAIN_NAME"] = df["MAIN_NAME"].astype(str)
df["__index"] = df.index  # 문서 식별자

print(df["category"].unique())
print(df["MAIN_NAME"].unique())

tmp_main = df["MAIN_NAME"].astype(str)
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=tmp_main
)

# 학습에서만 '기타' 제외
train_df = train_df[train_df["MAIN_NAME"] != OTHER_NAME]

# 라벨 맵 (기타 제외 기준)
category_list = sorted(train_df["category"].astype(str).unique().tolist())
mainname_list = sorted(train_df["MAIN_NAME"].astype(str).unique().tolist())

law2id = {l: i for i, l in enumerate(category_list)}
cat2id = {c: i for i, c in enumerate(mainname_list)}
id2law = {v: k for k, v in law2id.items()}
id2cat = {v: k for k, v in cat2id.items()}

# 학습 라벨 부여
train_df["labels_law"] = train_df["category"].map(law2id)
train_df["labels_cat"] = train_df["MAIN_NAME"].map(cat2id)

# 평가셋: 공정 비교 위해 '기타' 샘플 제외 권장
test_df_eval = test_df[test_df["MAIN_NAME"] != OTHER_NAME].copy()
test_df_eval["labels_law"] = test_df_eval["category"].map(law2id)
test_df_eval["labels_cat"] = test_df_eval["MAIN_NAME"].map(cat2id)

# 혹시 train엔 없고 test에만 있는 클래스가 있으면 NaN → 제거
before = len(test_df_eval)
test_df_eval = test_df_eval.dropna(subset=["labels_law", "labels_cat"])
test_df_eval["labels_law"] = test_df_eval["labels_law"].astype(int)
test_df_eval["labels_cat"] = test_df_eval["labels_cat"].astype(int)
if len(test_df_eval) < before:
    print(f"[주의] 평가셋에서 train에 없는 클래스 {before - len(test_df_eval)}개 제거됨")

num_laws = len(law2id)
num_cats = len(cat2id)

# 법안-중분류 허용 마스크(기타 제외 기준)
allowed_cats_by_law = {
    law2id[l]: set(
        train_df[train_df["category"] == l]["labels_cat"].unique().tolist()
    )
    for l in category_list
}
mask_mat = torch.full((num_laws, num_cats), fill_value=-1e9, dtype=torch.float)
for law_id, cats in allowed_cats_by_law.items():
    for c in cats:
        mask_mat[law_id, c] = 0.0


# 토크나이저 & 슬라이딩 토큰화
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_with_sliding(batch):
    tokenized_batch = tokenizer(
        batch["content"],
        truncation=True, padding=False, max_length=MAX_LENGTH,
        stride=SLIDING_STRIDE, return_overflowing_tokens=True, return_offsets_mapping=False,
    )
    labels_law_list, labels_cat_list, doc_index_list = [], [], []
    # 입력 배치의 각 샘플이 여러 청크로 늘어날 수 있으므로 매핑 사용
    for sample_index in tokenized_batch["overflow_to_sample_mapping"]:
        labels_law_list.append(batch["labels_law"][sample_index])
        labels_cat_list.append(batch["labels_cat"][sample_index])
        doc_index_list.append(batch["__index"][sample_index])

    tokenized_batch["labels_law"] = labels_law_list
    tokenized_batch["labels_cat"] = labels_cat_list
    tokenized_batch["doc_index"]  = doc_index_list
    del tokenized_batch["overflow_to_sample_mapping"]
    return tokenized_batch

# Dataset 생성 및 map 
train_dataset = Dataset.from_pandas(
    train_df[["content", "labels_law", "labels_cat", "__index"]].copy()
)
test_dataset = Dataset.from_pandas(
    test_df_eval[["content", "labels_law", "labels_cat", "__index"]].copy()
)

cols_remove_train = train_dataset.column_names
cols_remove_test  = test_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_with_sliding, batched=True,
    remove_columns=cols_remove_train, batch_size=256
)
test_dataset = test_dataset.map(
    tokenize_with_sliding, batched=True,
    remove_columns=cols_remove_test, batch_size=64
)


# 클래스 가중치(기타 제외 학습 분포 기준)
class_weights_cat_np = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df["labels_cat"]),
    y=train_df["labels_cat"]
)
class_weights_cat = torch.tensor(class_weights_cat_np, dtype=torch.float)

# 모델 정의
class HierarchicalClassifier(nn.Module):
    def __init__(self, model_name, num_laws, num_cats, mask_mat, class_weights_cat=None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size

       
        self.config = self.encoder.config

        self.dropout = nn.Dropout(p=0.3)
        self.law_head = nn.Linear(hidden, num_laws)
        self.cat_head = nn.Linear(hidden, num_cats)

        self.register_buffer("mask_mat", mask_mat)
        if class_weights_cat is not None:
            self.register_buffer("class_weights_cat", class_weights_cat)
        else:
            self.class_weights_cat = None

    
    def get_input_embeddings(self):
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.encoder.set_input_embeddings(new_embeddings)

    def forward(self, input_ids, attention_mask, labels_law=None, labels_cat=None, use_predicted_law_for_mask=False):
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = enc.last_hidden_state[:, 0]
        features = self.dropout(pooled)

        law_logits = self.law_head(features)
        cat_logits_raw = self.cat_head(features)

        if labels_law is not None and not use_predicted_law_for_mask:
            law_for_mask = labels_law
        else:
            law_for_mask = law_logits.argmax(dim=-1)

        # mask_selected = self.mask_mat.index_select(dim=0, index=law_for_mask)
        mask_selected = self.mask_mat[law_for_mask]  # index_select 대신 직접 인덱싱        
        cat_logits = cat_logits_raw + mask_selected

        loss = None
        if (labels_law is not None) and (labels_cat is not None):
            loss_law = nn.CrossEntropyLoss()(law_logits, labels_law)
            if self.class_weights_cat is not None:
                loss_cat = nn.CrossEntropyLoss(weight=self.class_weights_cat)(cat_logits, labels_cat)
            else:
                loss_cat = nn.CrossEntropyLoss()(cat_logits, labels_cat)
            loss = loss_law + loss_cat 

        return {"loss": loss, "law_logits": law_logits, "cat_logits": cat_logits, "cat_logits_raw": cat_logits_raw}

    
def custom_collator_dynamic(batch):
    text_inputs = [{k: v for k, v in item.items() if k in ["input_ids", "attention_mask"]} for item in batch]
    tokenized_batch = tokenizer.pad(text_inputs, padding=True, return_tensors="pt", max_length=MAX_LENGTH)
    return {
        **tokenized_batch,
        "labels_law": torch.tensor([b["labels_law"] for b in batch], dtype=torch.long),
        "labels_cat": torch.tensor([b["labels_cat"] for b in batch], dtype=torch.long),
    }

# 커스텀 Trainer (WeightedRandomSampler)
class HierTrainer(Trainer):
    def get_train_dataloader(self):
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        labels_cat = np.array(self.train_dataset["labels_cat"])

        unique_labels, counts = np.unique(labels_cat, return_counts=True)
        class_weights_map = {label: 1.0 / count for label, count in zip(unique_labels, counts)}
        sample_weights = np.array([class_weights_map[label] for label in labels_cat])
        sample_weights = torch.from_numpy(sample_weights).double()

        sampler = torch.utils.data.WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )

        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels_law = inputs.pop("labels_law")
        labels_cat = inputs.pop("labels_cat")
        outputs = model(**inputs, labels_law=labels_law, labels_cat=labels_cat, use_predicted_law_for_mask=False)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        labels_law = inputs.pop("labels_law")
        labels_cat = inputs.pop("labels_cat")
        with torch.no_grad():
            outputs = model(**inputs, labels_law=labels_law, labels_cat=labels_cat, use_predicted_law_for_mask=False)
        law_logits = outputs["law_logits"].detach().cpu()
        cat_logits = outputs["cat_logits"].detach().cpu()
        logits = torch.cat([law_logits, cat_logits], dim=1)
        labels = torch.stack([labels_law.cpu(), labels_cat.cpu()], dim=1)
        loss = outputs["loss"].detach().cpu()
        return (loss, logits, labels)


# 문서단위 메트릭 (청크 -> 문서 평균 후 Argmax)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    law_logits = logits[:, :num_laws]
    cat_logits = logits[:, num_laws:]

    law_probs = softmax(torch.tensor(law_logits), dim=-1).numpy()
    cat_probs = softmax(torch.tensor(cat_logits), dim=-1).numpy()

   
    doc_indices = test_dataset["doc_index"]

    doc_law_probs, doc_cat_probs = defaultdict(list), defaultdict(list)
    for pL, pC, idx in zip(law_probs, cat_probs, doc_indices):
        doc_law_probs[idx].append(pL)
        doc_cat_probs[idx].append(pC)

    final_law_preds, final_cat_preds = [], []
    final_law_labels, final_cat_labels = [], []

    for idx in doc_law_probs.keys():
        avg_law = np.mean(doc_law_probs[idx], axis=0)
        avg_cat = np.mean(doc_cat_probs[idx], axis=0)
        final_law_preds.append(int(np.argmax(avg_law)))
        final_cat_preds.append(int(np.argmax(avg_cat)))

        
        row = test_df_eval.loc[test_df_eval["__index"] == idx]
        if len(row) == 0:
            
            continue
        final_law_labels.append(int(row["labels_law"].iloc[0]))
        final_cat_labels.append(int(row["labels_cat"].iloc[0]))

    acc_law = accuracy_score(final_law_labels, final_law_preds)
    f1_law = f1_score(final_law_labels, final_law_preds, average="macro")
    acc_cat = accuracy_score(final_cat_labels, final_cat_preds)
    f1_cat = f1_score(final_cat_labels, final_cat_preds, average="macro")
    f1_cat_weighted = f1_score(final_cat_labels, final_cat_preds, average="weighted")

    return {
        "accuracy_law": acc_law,
        "f1_law": f1_law,
        "accuracy_cat": acc_cat,
        "f1_cat": f1_cat,
        "f1_cat_weighted": f1_cat_weighted
    }


# 모델/학습 세팅
model = HierarchicalClassifier(
    MODEL_NAME, num_laws=num_laws, num_cats=num_cats,
    mask_mat=mask_mat, class_weights_cat=class_weights_cat
).to(device)

training_args = TrainingArguments(
    output_dir="./results_hier_weighted",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),   
    logging_dir="./logs_hier_weighted",
    logging_steps=50,
    metric_for_best_model="f1_cat",
    greater_is_better=True,
    label_smoothing_factor=0.1,  
    dataloader_pin_memory=True,  # GPU 사용 시 메모리 고정
    dataloader_drop_last=True,   # 마지막 불완전한 배치 제거         
)

trainer = HierTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=custom_collator_dynamic,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 학습 시작
train_output = trainer.train()
print("\n[완료] 학습 종료. best checkpoint가 로드되었습니다.")

# best checkpoint를 ./results_hier_weighted/best_model 폴더에 복사
best_ckpt = trainer.state.best_model_checkpoint
best_model_dir = "./results_hier_weighted/best_model"

if best_ckpt is not None:
    shutil.copytree(best_ckpt, best_model_dir, dirs_exist_ok=True)
    print(f"[저장 완료] Best model을 {best_model_dir} 에 복사했습니다.")
else:
    print("[경고] best_model_checkpoint가 없습니다. TrainingArguments에 load_best_model_at_end=True 가 설정되었는지 확인하세요.")

# 검증 실행
eval_output = trainer.evaluate()
print("[검증 결과]", eval_output)

print("\n[알림] 학습은 '기타' 제외, 평가는 '기타' 제외 샘플로 점수 산정했습니다.")
print("[팁] 추후 추론 서비스에서 '기타'를 쓰고 싶다면, 확률 임계값 기반 reject-option을 추가하면 됩니다.")
save_dir = "./results_hier_weighted"
os.makedirs(save_dir, exist_ok=True)

label_info = {
    "law2id": law2id,
    "id2law": id2law,
    "cat2id": cat2id,
    "id2cat": id2cat,
    "mask_mat": mask_mat.cpu().numpy(),
    "category_list": category_list,
    "mainname_list": mainname_list,
}

with open(os.path.join(save_dir, "label_mapping.pkl"), "wb") as f:
    pickle.dump(label_info, f)

['개인정보보호법,정보통신망법' '아동복지법' '자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법' '중대재해처벌법']
['온라인·플랫폼 관련 이슈' '개인정보보호법 기타' '법적 분쟁·정치 연루' '개인정보 유출·보호조치'
 '아동 유기·방임·사망 사건' '법·제도·사회적 이슈' '아동 학대·성범죄' '아동복지법 기타' '시장·기업 관련 사건'
 '금융소비자보호법 기타' '가상자산·규제정책' '금융사고·소비자 피해' '특검정치 연루 사건' '중대재해처벌법 기타'
 '산업재해 사건' '제도·안전관리·정책' '중대시민재해']


Map: 100%|██████████| 4000/4000 [00:02<00:00, 1643.56 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1580.97 examples/s]
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = HierTrainer(
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy Law,F1 Law,Accuracy Cat,F1 Cat,F1 Cat Weighted
1,0.5133,0.947238,0.934542,0.9329,0.779456,0.77413,0.782385
2,0.288,0.925186,0.936556,0.935683,0.819738,0.807962,0.819538
3,0.2309,0.967945,0.944612,0.943785,0.833837,0.826106,0.836902
4,0.1328,1.024702,0.939577,0.936911,0.845921,0.831245,0.84671
5,0.0984,1.099728,0.940584,0.937363,0.841893,0.830014,0.843222
6,0.0503,1.128396,0.939577,0.937381,0.84995,0.838205,0.850365
7,0.0448,1.209476,0.93857,0.936095,0.854985,0.842089,0.855845
8,0.0251,1.266019,0.935549,0.93302,0.853978,0.840771,0.854119
9,0.0302,1.284363,0.936556,0.934742,0.852971,0.836668,0.852312





[완료] 학습 종료. best checkpoint가 로드되었습니다.
[저장 완료] Best model을 ./results_hier_weighted/best_model 에 복사했습니다.




[검증 결과] {'eval_loss': 1.2094762325286865, 'eval_accuracy_law': 0.9385699899295066, 'eval_f1_law': 0.9360954323686522, 'eval_accuracy_cat': 0.8549848942598187, 'eval_f1_cat': 0.8420890483875625, 'eval_f1_cat_weighted': 0.8558446291546842, 'eval_runtime': 12.8344, 'eval_samples_per_second': 158.013, 'eval_steps_per_second': 9.895, 'epoch': 9.0}

[알림] 학습은 '기타' 제외, 평가는 '기타' 제외 샘플로 점수 산정했습니다.
[팁] 추후 추론 서비스에서 '기타'를 쓰고 싶다면, 확률 임계값 기반 reject-option을 추가하면 됩니다.


In [None]:
# Predict - Hierarchical 

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from safetensors.torch import load_file
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import pickle

MODEL_NAME = "klue/roberta-base"
BEST_MODEL_PATH = "../models/news_model/model.safetensors"
INPUT_CSV = "../data/processed/news_pre.csv"   # content 컬럼 포함한 데이터
OUTPUT_CSV = "../data/processed/news_predict_result.csv"
LABEL_PATH = "../models/news_model/label_mapping.pkl"

MAX_LENGTH = 256
STRIDE = 128
BATCH_SIZE = 8
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open(LABEL_PATH, "rb") as f:
    label_info = pickle.load(f)

law2id = label_info["law2id"]
id2law = label_info["id2law"]
cat2id = label_info["cat2id"]
id2cat = label_info["id2cat"]
mask_mat = torch.tensor(label_info["mask_mat"])
category = label_info["category_list"]
MAIN_NAME = label_info["mainname_list"]

num_laws = len(category)
num_cats = len(MAIN_NAME)

# 모델 정의
class HierarchicalClassifier(nn.Module):
    def __init__(self, model_name, num_laws, num_cats):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.law_head = nn.Linear(hidden, num_laws)
        self.cat_head = nn.Linear(hidden, num_cats)

    def forward(self, input_ids, attention_mask):
        enc = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = enc.last_hidden_state[:, 0]
        law_logits = self.law_head(pooled)
        cat_logits = self.cat_head(pooled)   # mask 적용 X
        return {"law_logits": law_logits, "cat_logits": cat_logits, "embedding": pooled}

# 모델 로드
model = HierarchicalClassifier(MODEL_NAME, num_laws, num_cats).to(device)
state_dict = load_file(BEST_MODEL_PATH, device=str(device))
model.load_state_dict(state_dict, strict=False)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 데이터 로드
df = pd.read_csv(INPUT_CSV)
texts = df["content"].astype(str).tolist()

# 예측
final_law_preds, final_cat_preds = [], []

for text in tqdm(texts, desc="Hierarchical 추론"):
    tokens = tokenizer(text, truncation=False, padding=False)
    input_ids_full = tokens["input_ids"]

    # 긴 문서 → 여러 chunk
    chunks = []
    for start in range(0, len(input_ids_full), MAX_LENGTH-STRIDE):
        end = min(start+MAX_LENGTH, len(input_ids_full))
        chunks.append(input_ids_full[start:end])
        if end == len(input_ids_full): break

    # 문서 단위 logits 모으기
    all_law_logits, all_cat_logits = [], []

    for i in range(0, len(chunks), BATCH_SIZE):
        batch_chunks = chunks[i:i+BATCH_SIZE]
        enc = tokenizer.pad({"input_ids": batch_chunks}, padding=True, return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)

        with torch.no_grad():
            out = model(input_ids, attention_mask)
            all_law_logits.append(out["law_logits"].cpu())
            all_cat_logits.append(out["cat_logits"].cpu())

    # 문서 단위 평균 확률
    law_avg = torch.mean(torch.cat(all_law_logits, dim=0), dim=0)
    law_id = torch.argmax(law_avg).item()   # 최종 대분류 확정

    cat_avg = torch.mean(torch.cat(all_cat_logits, dim=0), dim=0)
    allowed_cats = (mask_mat[law_id] == 0).nonzero(as_tuple=True)[0]

    if len(allowed_cats) > 0:
        rel_idx = torch.argmax(cat_avg[allowed_cats]).item()
        cat_id = allowed_cats[rel_idx].item()
    else:
        cat_id = -1 

    final_law_preds.append(law_id)
    final_cat_preds.append(cat_id)

# 결과 저장
df["predicted_category"] = [id2law[i] for i in final_law_preds] # 대분류 예측결과
df["predicted_MAIN_NAME"] = [id2cat[i] for i in final_cat_preds] # 중분류 예측결과
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"저장 완료: {OUTPUT_CSV}")

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hierarchical 추론:   0%|          | 0/1000 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Hierarchical 추론:   0%|          | 2/1000 [00:00<02:07,  7.83it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
Hierarchical 추론: 100%|██████████| 1000/1000 [00:50<00:00, 19.61it/s]


저장 완료: ../data/processed/news_predict_result.csv


In [3]:
# predict 결과 확인용
import pandas as pd
# df = pd.read_csv("../../data/processed/news_pre.csv")
# count = df.groupby(["category"]).size().reset_index(name="count")
# print(count)

df_pred = pd.read_csv("../data/processed/news_predict_result.csv")
count_pred = df_pred.groupby(["predicted_category", "predicted_MAIN_NAME"]).size().reset_index(name="count")
print(count_pred)

                      predicted_category predicted_MAIN_NAME  count
0                         개인정보보호법,정보통신망법        개인정보 유출·보호조치     78
1                         개인정보보호법,정보통신망법          개인정보보호법 기타     80
2                         개인정보보호법,정보통신망법         법적 분쟁·정치 연루     86
3                         개인정보보호법,정보통신망법       온라인·플랫폼 관련 이슈     58
4                                  아동복지법         법·제도·사회적 이슈     31
5                                  아동복지법      아동 유기·방임·사망 사건     24
6                                  아동복지법           아동 학대·성범죄     57
7                                  아동복지법            아동복지법 기타     21
8   자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법           가상자산·규제정책     84
9   자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법         금융사고·소비자 피해     39
10  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법         금융소비자보호법 기타     50
11  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법         시장·기업 관련 사건     82
12  자본시장법,특정금융정보법,전자금융거래법,전자증권법,금융소비자보호법          특검정치 연루 사건     54
13                               중대재해처벌법        