In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import defaultdict

# --- 1. Environment and path setup ---
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-autotuned kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; this call is silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select different kernels from run to run, which
    # defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before any other module-level statement of this cell runs.
seed_everything(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face identifier of the pretrained encoder.
MODEL_NAME = "microsoft/deberta-v3-base"
# Paths are resolved against the parent of the current working directory.
BASE_DIR = os.path.abspath("..") 
OUTPUT_DIR = os.path.join(BASE_DIR, "after")
# Create the checkpoint directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Hyperparameters
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1e-5
MAX_LEN = 256  # maximum token length passed to the tokenizer
NUM_CLASSES = 531  # size of the per-sample target and mask vectors
HIDDEN_DIM = 768  # label-embedding width; presumably the encoder hidden size - TODO confirm

# --- 2. Hierarchy data loader ---
def load_hierarchy_maps(path):
    """Read the class-hierarchy edge list into parent/child lookup tables.

    Every non-blank line must hold exactly two whitespace-separated integer
    class ids in the order ``parent child``. Blank lines (for example a
    trailing newline at the end of the file) are skipped.

    Args:
        path: Location of the hierarchy file.

    Returns:
        A ``(parents, children)`` tuple of ``defaultdict(list)`` objects:
        ``parents[c]`` lists the parents of class ``c`` and ``children[p]``
        lists the children of class ``p``, both in file order.

    Raises:
        ValueError: If a non-blank line is not made of exactly two integers.
    """
    parents = defaultdict(list)
    children = defaultdict(list)
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{line_no}: expected 'parent child', got {line!r}"
                )
            p, c = map(int, fields)
            parents[c].append(p)
            children[p].append(c)
    return parents, children

# --- 3. Model definition (DeBERTa-GCN) ---
class GraphConvolution(nn.Module):
    """One graph-convolution layer: ``adj @ (input @ weight) + bias``.

    Args:
        in_features: Number of columns of the input matrix.
        out_features: Number of columns of the output matrix.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        raw_weight = torch.FloatTensor(in_features, out_features)
        raw_bias = torch.FloatTensor(out_features)
        self.weight = nn.Parameter(raw_weight)
        self.bias = nn.Parameter(raw_bias)
        # The tensors above are uninitialised memory; give them real values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input, adj):
        """Project ``input`` with the weight, mix rows with ``adj``, add bias.

        Both arguments must be 2-D tensors because ``mm`` is used.
        """
        projected = input.mm(self.weight)
        mixed = adj.mm(projected)
        return mixed + self.bias

class DebertaGCN(nn.Module):
    """Pretrained encoder scored against GCN-refined label embeddings.

    The document vector is the first-token hidden state of the encoder. The
    logit for each class is the dot product between that vector and the
    class's label embedding after one graph convolution and a ``tanh``.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of rows of the learnable label-embedding table.
        hidden_dim: Width of the label embeddings. It must equal the width of
            the encoder's hidden states for the final matrix product.
        adj_matrix: 2-D tensor applied on the left of the projected label
            embeddings inside the graph convolution.
    """

    def __init__(self, model_name, num_classes, hidden_dim, adj_matrix):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.label_embedding = nn.Parameter(torch.empty(num_classes, hidden_dim))
        nn.init.xavier_uniform_(self.label_embedding)
        self.gcn = GraphConvolution(hidden_dim, hidden_dim)
        # Registered as a buffer so the adjacency tensor follows the module
        # through .to()/.cuda()/.half(); a plain attribute would stay behind
        # and cause a device mismatch in forward(). persistent=False keeps the
        # state_dict keys exactly as they were before.
        self.register_buffer("adj_matrix", adj_matrix, persistent=False)

    def forward(self, input_ids, attention_mask):
        """Return one logit per class for every row of the batch."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state is used as the document representation.
        doc_embedding = self.dropout(outputs.last_hidden_state[:, 0, :])
        refined_label_emb = torch.tanh(self.gcn(self.label_embedding, self.adj_matrix))
        return torch.mm(doc_embedding, refined_label_emb.t())

# --- 4. Hierarchy-aware dataset ---
class HATDataset(Dataset):
    """Multi-label dataset whose targets and loss mask respect the hierarchy.

    For every sample the comma-separated core labels are expanded with all of
    their ancestors (positives). Direct children of a core label that are not
    themselves positive are excluded from the loss through the mask.

    Args:
        texts: Indexable collection of raw texts.
        silver_labels: Indexable collection of comma-separated class-id
            strings, aligned with ``texts``.
        parents_map: Mapping ``class id -> list of parent ids``.
        children_map: Mapping ``class id -> list of child ids``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every sample is padded or truncated to.
        num_classes: Size of the target and mask vectors; defaults to the
            module-level ``NUM_CLASSES``.
    """

    def __init__(self, texts, silver_labels, parents_map, children_map, tokenizer, max_len, num_classes=None):
        self.texts, self.labels = texts, silver_labels
        self.parents, self.children = parents_map, children_map
        self.tokenizer, self.max_len = tokenizer, max_len
        self.num_classes = NUM_CLASSES if num_classes is None else num_classes

    def __len__(self):
        return len(self.texts)

    def _ancestors(self, cid):
        """Return the set of all ancestors of ``cid`` across every parent."""
        seen = set()
        stack = [cid]
        while stack:
            node = stack.pop()
            # .get() avoids inserting keys when the map is a defaultdict.
            for parent in self.parents.get(node, ()):
                if parent not in seen:  # also protects against cycles
                    seen.add(parent)
                    stack.append(parent)
        return seen

    def __getitem__(self, item):
        """Build the model inputs, target vector and loss mask for one sample.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (flattened
            tokenizer output), ``targets`` (1.0 for core labels and their
            ancestors) and ``masks`` (loss weight, 1: train, 0: ignore).
        """
        core_cids = [int(x) for x in str(self.labels[item]).split(",") if x.strip()]

        pos_set = set(core_cids)
        ignore_set = set()
        for cid in core_cids:
            # Walk every parent branch, not only the first listed parent.
            pos_set |= self._ancestors(cid)
            ignore_set.update(self.children.get(cid, ()))
        # A child that is itself a positive label must keep contributing to
        # the loss; only children with unknown status are ignored.
        ignore_set -= pos_set

        target = torch.zeros(self.num_classes)
        mask = torch.ones(self.num_classes)  # loss weight (1: train, 0: ignore)
        if pos_set:
            target[sorted(pos_set)] = 1.0
        if ignore_set:
            mask[sorted(ignore_set)] = 0.0

        # Calling the tokenizer directly is the documented replacement for the
        # legacy encode_plus() entry point.
        enc = self.tokenizer(
            str(self.texts[item]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'targets': target,
            'masks': mask,
        }

# --- 5. Training loop ---
def _masked_bce(criterion, logits, targets, masks):
    """Average the per-label loss over the entries whose mask is non-zero."""
    per_label = criterion(logits, targets)
    # clamp() avoids a division by zero if a batch is fully masked out.
    return (per_label * masks).sum() / masks.sum().clamp(min=1.0)


def _evaluate(model, loader, criterion):
    """Return the masked loss of ``model`` over ``loader`` without gradients."""
    model.eval()
    loss_sum, weight_sum = 0.0, 0.0
    with torch.no_grad():
        for batch in loader:
            logits = model(batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE))
            masks = batch['masks'].to(DEVICE)
            per_label = criterion(logits, batch['targets'].to(DEVICE))
            loss_sum += (per_label * masks).sum().item()
            weight_sum += masks.sum().item()
    return loss_sum / max(weight_sum, 1.0)


def _save_checkpoint(model, path):
    """Write the pickled model to ``path`` without leaving a partial file."""
    tmp_path = path + ".tmp"
    torch.save(model, tmp_path)
    # os.replace swaps the finished file in one step, so an interrupted save
    # cannot leave a truncated checkpoint behind at ``path``.
    os.replace(tmp_path, path)


def train():
    """Fine-tune DebertaGCN with the hierarchy-masked BCE loss.

    Reads the class hierarchy, the labelled CSV and the training corpus from
    ``BASE_DIR``, holds out 10% of the rows for validation, trains for
    ``EPOCHS`` epochs and writes the model with the lowest validation loss to
    ``OUTPUT_DIR/best_deberta_hat.pt``.
    """
    hierarchy_path = os.path.join(BASE_DIR, "Amazon_products", "class_hierarchy.txt") # Corrected path
    parents, children = load_hierarchy_maps(hierarchy_path)

    train_csv_path = os.path.join(BASE_DIR, "final", "train_round_2.csv") # Corrected path
    df = pd.read_csv(train_csv_path)

    # Load the raw texts
    train_corpus_path = os.path.join(BASE_DIR, "Amazon_products", "train", "train_corpus.txt") # Corrected path
    pid2text = {}
    with open(train_corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            pid, sep, text = line.strip().partition("\t")
            if not sep:
                # No "<pid>\t<text>" structure (blank line or missing text).
                continue
            pid2text[int(pid)] = text
    df['text'] = df['pid'].map(pid2text)
    df = df.dropna()

    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

    train_loader = DataLoader(HATDataset(train_df.text.to_numpy(), train_df.labels.to_numpy(), parents, children, tokenizer, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(HATDataset(val_df.text.to_numpy(), val_df.labels.to_numpy(), parents, children, tokenizer, MAX_LEN), batch_size=BATCH_SIZE)

    # Adjacency matrix and model initialisation
    adj = torch.eye(NUM_CLASSES).to(DEVICE) # identity matrix used for simplicity
    model = DebertaGCN(MODEL_NAME, NUM_CLASSES, HIDDEN_DIM, adj).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss(reduction='none') # 'none' so the mask can be applied per label

    checkpoint_path = os.path.join(OUTPUT_DIR, "best_deberta_hat.pt")
    best_val_loss = float("inf")

    print("Starting Hierarchy-Aware Training...")
    for epoch in range(EPOCHS):
        model.train()
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            logits = model(batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE))
            masks = batch['masks'].to(DEVICE)
            masked_loss = _masked_bce(criterion, logits, batch['targets'].to(DEVICE), masks)
            masked_loss.backward()
            optimizer.step()

        # Keep only the epoch that does best on the held-out split, so the
        # file name "best_..." is accurate.
        val_loss = _evaluate(model, val_loader, criterion)
        print(f"Epoch {epoch + 1}/{EPOCHS} - val_loss: {val_loss:.4f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # The whole module is pickled (not just a state_dict).
            _save_checkpoint(model, checkpoint_path)
    print("Training Complete.")

# Entry point: start training only when this cell/file is executed directly.
if __name__ == "__main__":
    train()


Starting Hierarchy-Aware Training...


  0%|          | 0/1659 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:
import os
import csv
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, BertModel
from collections import defaultdict

# --- Settings and paths ---
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dataset root, resolved against the parent of the current working directory.
BASE_DIR = os.path.join(os.path.abspath(".."), "Amazon_products")
MAX_LEN = 256  # maximum token length passed to both tokenizers
BATCH_SIZE = 32  # number of test texts scored per forward pass
NUM_CLASSES = 531

# Model paths
PATH_BERT = os.path.join(os.path.abspath(".."), "final", "saved_model_gnn", "best_model_gnn.pt")
PATH_DEBERTA = os.path.join(os.path.abspath(".."), "after", "best_deberta_hat.pt")
# Prediction file, written relative to the current working directory.
FINAL_CSV = "1220.csv"

# --- 1. Model class definitions ---

class GraphConvolution(nn.Module):
    """Graph-convolution layer computing ``adj @ (input @ weight) + bias``.

    Args:
        in_features: Number of columns of the input matrix.
        out_features: Number of columns of the output matrix.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        weight_storage = torch.FloatTensor(in_features, out_features)
        bias_storage = torch.FloatTensor(out_features)
        self.weight = nn.Parameter(weight_storage)
        self.bias = nn.Parameter(bias_storage)
        # Overwrite the uninitialised storage with proper initial values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input, adj):
        """Apply the linear projection, then mix rows with ``adj``.

        Both arguments must be 2-D tensors because ``mm`` is used.
        """
        support = input.mm(self.weight)
        propagated = adj.mm(support)
        return propagated + self.bias

class DebertaGCN(nn.Module):
    """DeBERTa-based GNN model.

    Scores the first-token hidden state of the encoder against label
    embeddings that went through one graph convolution and a ``tanh``.
    """

    def __init__(self, model_name, num_classes, hidden_dim, adj_matrix):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.1)
        label_table = torch.empty(num_classes, hidden_dim)
        self.label_embedding = nn.Parameter(label_table)
        nn.init.xavier_uniform_(self.label_embedding)
        self.gcn = GraphConvolution(in_features=hidden_dim, out_features=hidden_dim)
        self.adj_matrix = adj_matrix

    def forward(self, input_ids, attention_mask):
        """Return one logit per class for every row of the batch."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # DeBERTa-v3 often has no pooler_output, so the [CLS] position of
        # last_hidden_state is used instead.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        doc_embedding = self.dropout(cls_vector)
        propagated = self.gcn(self.label_embedding, self.adj_matrix)
        label_matrix = torch.tanh(propagated)
        return doc_embedding.mm(label_matrix.t())

class BertGCN(nn.Module):
    """BERT-based GNN model - structure modified to resolve an error.

    (Changed to multiply with the label embeddings, the same way
    DebertaGCN does.)
    """

    def __init__(self, num_labels, hidden_dim=768, adj_matrix=None):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.1)
        label_table = torch.empty(num_labels, hidden_dim)
        self.label_embedding = nn.Parameter(label_table)
        nn.init.xavier_uniform_(self.label_embedding)
        self.gcn = GraphConvolution(in_features=hidden_dim, out_features=hidden_dim)
        self.adj_matrix = adj_matrix

    def forward(self, input_ids, attention_mask):
        """Return one logit per label for every row of the batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # [CLS] token embedding of BERT (last_hidden_state is used for
        # consistency with DebertaGCN).
        doc_embedding = self.dropout(encoded.last_hidden_state[:, 0, :])

        # Use the adjacency matrix stored on the model; getattr() covers
        # instances on which the attribute is missing altogether.
        adj = getattr(self, 'adj_matrix', None)
        if adj is not None:
            label_matrix = torch.tanh(self.gcn(self.label_embedding, adj))
        else:
            # Without an adjacency matrix the GCN is skipped and the raw
            # label embeddings are used.
            label_matrix = self.label_embedding

        return doc_embedding.mm(label_matrix.t())

# --- 2. Helper functions ---

def patch_model_config(model):
    """Work around library-compatibility problems on a loaded model.

    For the ``bert`` and ``encoder`` sub-models (whichever exist) that carry
    a ``config``, every output-flag attribute missing from that config is
    created with the value ``False``. Existing attributes are left untouched.

    Returns:
        The same ``model`` object, patched in place.
    """
    flag_names = ('_output_attentions', '_output_hidden_states', 'output_attentions', 'output_hidden_states')
    backbones = [getattr(model, name) for name in ('bert', 'encoder') if hasattr(model, name)]

    for backbone in backbones:
        if not hasattr(backbone, 'config'):
            continue
        for flag in flag_names:
            if hasattr(backbone.config, flag):
                continue
            setattr(backbone.config, flag, False)
    return model

def load_hierarchy_maps():
    """Build the child -> parents lookup from ``class_hierarchy.txt``.

    The file is looked up under ``BASE_DIR``. Lines that do not split into
    exactly two whitespace-separated fields are skipped; the two fields are
    read as integer ``parent child`` ids.

    Returns:
        A ``defaultdict(list)`` mapping each child id to its parent ids. It
        is empty when the hierarchy file does not exist.
    """
    child_to_parents = defaultdict(list)
    hierarchy_file = os.path.join(BASE_DIR, "class_hierarchy.txt")
    if os.path.exists(hierarchy_file):
        with open(hierarchy_file, "r") as handle:
            for raw_line in handle:
                fields = raw_line.split()
                if len(fields) != 2:
                    continue
                parent_id, child_id = (int(field) for field in fields)
                child_to_parents[child_id].append(parent_id)
    return child_to_parents

# --- 3. Execution ---

def run_ensemble_inference():
    """Score the test corpus with the BERT + DeBERTa ensemble and write a CSV.

    Steps:
      1. Load both pickled models and their tokenizers; print the error and
         return early if either model fails to load.
      2. Read ``test/test_corpus.txt`` under ``BASE_DIR`` (one
         ``<pid>\\t<text>`` record per line; lines without a tab are skipped).
      3. For each batch, blend the sigmoid probabilities of both models
         (BERT 0.4, DeBERTa 0.6).
      4. For each sample, take the 10 most probable classes, add all of their
         ancestors (an ancestor is scored with the probability of the top
         class it was reached from, keeping the maximum), and keep the 3
         best-scored candidates.
      5. Write ``id,labels`` rows to ``FINAL_CSV``.
    """
    print("1. Loading Models and Tokenizers...")

    try:
        # NOTE(review): weights_only=False unpickles arbitrary objects, so
        # only checkpoints from a trusted source should be loaded here.
        model_bert = torch.load(PATH_BERT, map_location=DEVICE, weights_only=False)
        model_bert = patch_model_config(model_bert)
        model_bert = model_bert.to(DEVICE).eval()
        print("✅ BERT model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading BERT model: {e}")
        return

    try:
        # NOTE(review): same trust requirement as for the BERT checkpoint.
        model_deberta = torch.load(PATH_DEBERTA, map_location=DEVICE, weights_only=False)
        model_deberta = patch_model_config(model_deberta)
        model_deberta = model_deberta.to(DEVICE).eval()
        print("✅ DeBERTa model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading DeBERTa model: {e}")
        return

    parents_map = load_hierarchy_maps()
    tokenizer_b = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer_d = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=False)

    # Load the test corpus
    test_pids, test_texts = [], []
    test_corpus_path = os.path.join(BASE_DIR, "test/test_corpus.txt")
    with open(test_corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            if "\t" in line:
                p, t = line.strip().split("\t", 1)
                test_pids.append(p)
                test_texts.append(t)

    print(f"2. Starting Inference (Total: {len(test_texts)} samples)...")
    all_preds = []

    for i in tqdm(range(0, len(test_texts), BATCH_SIZE)):
        batch_texts = test_texts[i:i+BATCH_SIZE]

        # BERT prediction
        enc_b = tokenizer_b(batch_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='pt').to(DEVICE)
        with torch.no_grad():
            logits_b = model_bert(enc_b['input_ids'], enc_b['attention_mask'])
            prob_b = torch.sigmoid(logits_b)

        # DeBERTa prediction
        enc_d = tokenizer_d(batch_texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='pt').to(DEVICE)
        with torch.no_grad():
            logits_d = model_deberta(enc_d['input_ids'], enc_d['attention_mask'])
            prob_d = torch.sigmoid(logits_d)

        # Ensemble (DeBERTa 0.6 : BERT 0.4)
        final_probs = (prob_b * 0.4 + prob_d * 0.6).cpu().numpy()

        for probs in final_probs:
            # Indices of the 10 highest probabilities, best first.
            top_idx = probs.argsort()[-10:][::-1]
            candidate_set = set()
            score_map = {}

            for cid in top_idx:
                candidate_set.add(cid)
                score_map[cid] = max(score_map.get(cid, 0), probs[cid])
                # Level-by-level walk up the hierarchy through every parent.
                curr_nodes = [cid]
                while curr_nodes:
                    next_nodes = []
                    for node in curr_nodes:
                        if node in parents_map:
                            for p in parents_map[node]:
                                candidate_set.add(p)
                                # Ancestors take the probability of the top
                                # class they were reached from, not their own.
                                score_map[p] = max(score_map.get(p, 0), probs[cid])
                                next_nodes.append(p)
                    curr_nodes = next_nodes

            # Keep the 3 best-scored candidates.
            final_labels = sorted(list(candidate_set), key=lambda x: score_map[x], reverse=True)[:3]
            # Fall back to the 2 most probable classes if fewer than 2 remain.
            if len(final_labels) < 2:
                final_labels = probs.argsort()[-2:][::-1].tolist()
            # Labels are stored in ascending id order.
            all_preds.append(sorted(final_labels))

    with open(FINAL_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "labels"])
        for pid, lbls in zip(test_pids, all_preds):
            writer.writerow([pid, ",".join(map(str, lbls))])
    print(f"✅ Success: {FINAL_CSV}")

# Entry point: run inference only when this cell/file is executed directly.
if __name__ == "__main__":
    run_ensemble_inference()

1. Loading Models and Tokenizers...
✅ BERT model loaded successfully
❌ Error loading DeBERTa model: Ran out of input
