In [1]:
# ================================================
# 1. REPRODUCIBILITY SETTINGS 
# ================================================
from pathlib import Path

import random
import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
import copy

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# ----- Reproducibility -----
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ----- Device -----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ================================================
# 2. DEFAULT PATHS FOR DATASET
# ================================================
ROOT = Path("Amazon_products")   # dataset root directory

# Main corpus
TRAIN_CORPUS_PATH = ROOT / "train" /  "train_corpus.txt"       # pid \t text
TEST_CORPUS_PATH  = ROOT / "test" / "test_corpus.txt"        # pid \t text

# Taxonomy & class meta
CLASSES_PATH      = ROOT / "classes.txt"            # class_id \t class_name
HIERARCHY_PATH    = ROOT / "class_hierarchy.txt"    # parent_id \t child_id
KEYWORDS_PATH     = ROOT / "class_related_keywords.txt"

# Constants
NUM_CLASSES = 531
MIN_LABELS = 2     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# Check paths
print("\n== Data path check ==")
for p in [TRAIN_CORPUS_PATH, TEST_CORPUS_PATH,
          CLASSES_PATH, HIERARCHY_PATH, KEYWORDS_PATH]:
    print(f"{p} -> {p.exists()}")

Device: cuda

== Data path check ==
Amazon_products/train/train_corpus.txt -> True
Amazon_products/test/test_corpus.txt -> True
Amazon_products/classes.txt -> True
Amazon_products/class_hierarchy.txt -> True
Amazon_products/class_related_keywords.txt -> True


In [2]:
# ================================================
# 2. DATA LOADING
# ================================================

def load_corpus(path):
    """
    Load corpus file (pid \\t text) as {pid: text} dictionary.
    """
    pid2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

# Load corpus
print("Loading train/test corpus...")

pid2text_train = load_corpus(TRAIN_CORPUS_PATH)
pid2text_test  = load_corpus(TEST_CORPUS_PATH)

pid_list_train = list(pid2text_train.keys())
pid_list_test  = list(pid2text_test.keys())

print("Train samples:", len(pid2text_train))
print("Test samples :", len(pid2text_test))

# Quick sample check
for i, (pid, text) in enumerate(pid2text_train.items()):
    print(f"Example train sample #{i}: pid={pid}, text={text[:80]}...")
    break

Loading train/test corpus...
Train samples: 29487
Test samples : 19658
Example train sample #0: pid=0, text=omron hem 790it automatic blood pressure monitor with advanced omron health mana...


In [3]:
# ================================================
# 3. CLASS METADATA LOADING
# ================================================

def load_classes(path):
    """
    classes.txt : class_id \\t class_name
    returns: id2label, label2id
    """
    id2label = {}
    label2id = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            cid, name = parts
            cid = int(cid)
            id2label[cid] = name
            label2id[name] = cid
    return id2label, label2id


def load_hierarchy(path):
    """
    class_hierarchy.txt : parent_id \\t child_id
    returns: edges (list of tuples)
    """
    edges = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            p, c = map(int, parts)
            edges.append((p, c))
    return edges


def load_keywords(path, label2id):
    """
    class_related_keywords.txt : CLASS_NAME: kw1, kw2,...
    returns: {class_id: [kws]}
    """
    d = {cid: [] for cid in label2id.values()}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue
            name, kws = line.strip().split(":", 1)
            kws = [k.strip() for k in kws.split(",") if k.strip()]
            if name in label2id:
                cid = label2id[name]
                d[cid] = kws
    return d


# ----------------- Load all class meta -----------------
print("Loading class metadata...")

id2label, label2id = load_classes(CLASSES_PATH)
edges = load_hierarchy(HIERARCHY_PATH)
label_keywords = load_keywords(KEYWORDS_PATH, label2id)

print("Num classes:", len(id2label))
print("Num edges in taxonomy:", len(edges))
print()

# Small check
example_id = 0
print("Example class id:", example_id)
print("Name:", id2label[example_id])
print("Keywords:", label_keywords[example_id])

Loading class metadata...
Num classes: 531
Num edges in taxonomy: 568

Example class id: 0
Name: grocery_gourmet_food
Keywords: ['snacks', 'condiments', 'beverages', 'specialty_foods', 'spices', 'cooking_oils', 'baking_ingredients', 'gourmet_chocolates', 'artisanal_cheeses', 'organic_foods']


In [None]:
# ================================================
# 4. TF-IDF EMBEDDING + DOC–CLASS SIMILARITY
#    (class name vs keyword 분리 활용)
# ================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np

# ------------------------------------------------
# (1) 클래스 이름 텍스트 / 키워드 텍스트 분리 생성
# ------------------------------------------------
def build_class_name_texts(id2label):
    """
    각 class_id에 대해 클래스 이름만 사용한 텍스트 리스트 생성
    """
    texts = []
    for cid in range(NUM_CLASSES):
        name = id2label[cid].replace("_", " ")
        texts.append(name)
    return texts

def build_class_keyword_texts(label_keywords):
    """
    각 class_id에 대해 키워드만 이어붙인 텍스트 리스트 생성
    """
    texts = []
    for cid in range(NUM_CLASSES):
        kws = " ".join(label_keywords.get(cid, []))
        texts.append(kws if kws else "")
    return texts

class_name_texts = build_class_name_texts(id2label)         # [C]
class_kw_texts   = build_class_keyword_texts(label_keywords) # [C]

print("Example class name text:", class_name_texts[0])
print("Example class keyword text:", class_kw_texts[0])


# ------------------------------------------------
# (2) TF-IDF vocabulary 학습
#     - train 문서 + test 문서 + class name + class keyword 모두 포함
# ------------------------------------------------
all_texts_for_vocab = (
    list(pid2text_train.values())
    + list(pid2text_test.values())
    + class_name_texts
    + class_kw_texts
)

print("\nFitting TF-IDF vectorizer on docs + class names + keywords...")
vectorizer = TfidfVectorizer(
    max_features=100_000,   # 필요에 따라 조정 가능
    ngram_range=(1, 2),
    min_df=1                # ★ 중요: class 단어 손실 방지
)
vectorizer.fit(all_texts_for_vocab)

print("Vocabulary size:", len(vectorizer.vocabulary_))


# ------------------------------------------------
# (3) 실제 TF-IDF 행렬 변환
# ------------------------------------------------
N_train = len(pid2text_train)
N_test  = len(pid2text_test)
C       = NUM_CLASSES

# 문서 TF-IDF
X_train_docs = vectorizer.transform(pid2text_train.values())   # [N_train, V]
X_test_docs  = vectorizer.transform(pid2text_test.values())    # [N_test, V]

# 클래스 이름 TF-IDF (GNN용 initial feature)
X_class_name = vectorizer.transform(class_name_texts)          # [C, V]

# 클래스 키워드 TF-IDF (silver label용 보조 sim)
X_class_kw   = vectorizer.transform(class_kw_texts)            # [C, V]

print("X_train_docs:", X_train_docs.shape)
print("X_test_docs :", X_test_docs.shape)
print("X_class_name:", X_class_name.shape)
print("X_class_kw  :", X_class_kw.shape)


# ------------------------------------------------
# (4) L2 정규화 (코사인 유사도용)
# ------------------------------------------------
print("\nNormalizing TF-IDF vectors...")
X_train_norm      = normalize(X_train_docs, axis=1)
X_class_name_norm = normalize(X_class_name, axis=1)
X_class_kw_norm   = normalize(X_class_kw, axis=1)


# ------------------------------------------------
# (5) 이름 기반 / 키워드 기반 doc–class 유사도 계산
# ------------------------------------------------
print("\nComputing doc–class cosine similarities...")

# 이름 기반 similarity
sims_name = (X_train_norm @ X_class_name_norm.T).toarray().astype("float32")  # [N_train, C]

# 키워드 기반 similarity
sims_kw   = (X_train_norm @ X_class_kw_norm.T).toarray().astype("float32")    # [N_train, C]

print("sims_name shape:", sims_name.shape, "| min/max:", sims_name.min(), sims_name.max())
print("sims_kw   shape:", sims_kw.shape,   "| min/max:", sims_kw.min(),   sims_kw.max())


# ------------------------------------------------
# (6) 두 채널을 가중합하여 최종 similarity 생성
#     - alpha: class name 비중
#     - beta : keyword 비중
# ------------------------------------------------
alpha = 0.9  # class name 중요도
beta  = 0.1  # keyword 중요도

sims = alpha * sims_name + beta * sims_kw   # [N_train, C]

print("\nFinal sims shape:", sims.shape)
print("Final sims min/max:", sims.min(), sims.max())


# ------------------------------------------------
# (7) Document-wise Normalization 
# ------------------------------------------------

def zscore_norm(sims):
    """
    Document-wise z-score normalization.
    sims: numpy array [N, C]
    """
    sims_mean = sims.mean(axis=1, keepdims=True)
    sims_std  = sims.std(axis=1, keepdims=True) + 1e-8
    return (sims - sims_mean) / sims_std

def minmax_norm(sims):
    """
    Document-wise min-max normalization.
    sims: numpy array [N, C]
    """
    sims_min = sims.min(axis=1, keepdims=True)
    sims_max = sims.max(axis=1, keepdims=True)
    denom = (sims_max - sims_min) + 1e-8
    return (sims - sims_min) / denom

# ---- Step A: z-score normalization
sims_z = zscore_norm(sims)

# ---- Step B: positive clipping
sims_z = np.maximum(sims_z, 0)

# ---- Step C: Min-Max normalization (0~1)
sims_norm = minmax_norm(sims_z)

print("\nNormalized sims:")
print("sims_norm shape:", sims_norm.shape)
print("sims_norm min/max:", sims_norm.min(), sims_norm.max())
print("Done.")

Example class name text: grocery gourmet food
Example class keyword text: snacks condiments beverages specialty_foods spices cooking_oils baking_ingredients gourmet_chocolates artisanal_cheeses organic_foods

Fitting TF-IDF vectorizer on docs + class names + keywords...
Vocabulary size: 100000
X_train_docs: (29487, 100000)
X_test_docs : (19658, 100000)
X_class_name: (531, 100000)
X_class_kw  : (531, 100000)

Normalizing TF-IDF vectors...

Computing doc–class cosine similarities...
sims_name shape: (29487, 531) | min/max: 0.0 0.6900341
sims_kw   shape: (29487, 531) | min/max: 0.0 0.5547385

Final sims shape: (29487, 531)
Final sims min/max: 0.0 0.6210307

Normalized sims:
sims_norm shape: (29487, 531)
sims_norm min/max: 0.0 1.0
Done.


In [None]:
# ================================================
# 5. SIMPLE TOP-K CANDIDATE SELECTION 
# ================================================

import numpy as np

# ------------------------------------------------
# sims_norm 기반 top-k 후보 선택
# ------------------------------------------------
def select_topk_candidates(sims_norm, min_k=2, max_k=3):
    """
    sims_norm: [C] shape similarity row for one document
    return: set of candidate class IDs
    """
    # 1) max_k개를 선택 (가장 높은 similarity부터)
    top_k_idx = sims_norm.argsort()[-max_k:][::-1]   # 내림차순

    # 2) min_k 보장
    if len(top_k_idx) < min_k:
        # sims_norm이 이상하게 모두 0일 때 등
        # 상위 min_k개를 추가로 선택
        top_k_idx = sims_norm.argsort()[-min_k:][::-1]

    # set으로 반환
    return set(top_k_idx)


# ------------------------------------------------
# 테스트 
# ------------------------------------------------
print("Simple Top-K candidate selection module loaded.")
print("Example (first document):", select_topk_candidates(sims_norm[0]))

Simple Top-K candidate selection module loaded.
Example (first document): {137, 179, 87}


In [7]:
# ================================================
# 6. SIMPLE SILVER LABEL GENERATION (Top-k 방식)
# ================================================
import numpy as np
from tqdm import tqdm

def build_silver_labels_topk(sims_norm, min_k=2, max_k=3):
    """
    sims_norm : (N_train, NUM_CLASSES) normalized similarity matrix
    return:
        silver_y : (N, C) float32 (0/1 labels)
    """
    N, C = sims_norm.shape
    silver_y = np.zeros((N, C), dtype=np.float32)

    for i in tqdm(range(N), desc="Building silver labels (Top-k)"):

        # --- A. 안전하게 row 복사 ---
        row = np.array(sims_norm[i], dtype=np.float32).copy()

        # --- B. shape 검사 (가장 중요한 체크) ---
        assert row.shape[0] == C, \
            f"Error: sims_norm row shape {row.shape}, expected ({C},). " \
             "Check sims_norm axis order."

        # --- C. row 값이 모두 0이면 Fallback ---
        if row.max() == 0:
            # similarity 정보가 전혀 없다면 top N classes를 fallback
            topk_idx = np.arange(C)[-max_k:]
        else:
            # --- D. 정상 top-k 선택 ---
            # ascending → 마지막 max_k → descending
            topk_idx = row.argsort()[-max_k:][::-1]

        # --- E. min_k 보장 ---
        if len(topk_idx) < min_k:
            topk_idx = row.argsort()[-min_k:][::-1]

        # --- F. silver label 기록 ---
        silver_y[i, topk_idx] = 1.0

    return silver_y


print("Generating silver labels using simple Top-k scheme...")

silver_labels_v1 = build_silver_labels_topk(
    sims_norm,
    min_k=2,
    max_k=3
)

print("silver_labels_v1 shape:", silver_labels_v1.shape)
print("avg positives per doc:", silver_labels_v1.mean(axis=1).mean())
print("Done.")

Generating silver labels using simple Top-k scheme...


Building silver labels (Top-k): 100%|██████████| 29487/29487 [00:00<00:00, 87613.21it/s]

silver_labels_v1 shape: (29487, 531)
avg positives per doc: 0.005649717
Done.





In [None]:
# ================================================
# 7. LABEL-GCN + DOCUMENT-CLASS CLASSIFIER
# ================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


# ------------------------------------------------
# (1) Build normalized adjacency A_hat for GCN
# ------------------------------------------------
def build_normalized_adj(num_classes, edges):
    """
    edges: [(parent, child), ...]
    출력: A_hat (torch.FloatTensor, [C,C])
    """
    import numpy as np

    A = np.zeros((num_classes, num_classes), dtype=np.float32)

    # parent-child 연결을 양방향으로 넣기
    for p, c in edges:
        A[p, c] = 1.0
        A[c, p] = 1.0

    # self-loop
    np.fill_diagonal(A, 1.0)

    # D^{-1/2} * A * D^{-1/2}
    deg = A.sum(axis=1)
    deg_inv_sqrt = np.power(deg, -0.5)
    deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.0
    D_inv_sqrt = np.diag(deg_inv_sqrt)

    A_hat = D_inv_sqrt @ A @ D_inv_sqrt
    return torch.from_numpy(A_hat).float()


A_hat = build_normalized_adj(NUM_CLASSES, edges).to(device)
print("A_hat built:", A_hat.shape)


# ------------------------------------------------
# (2) Label Encoder: GCN
# ------------------------------------------------
class LabelGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim=256, num_layers=2, dropout=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        
        dims = [in_dim] + [hidden_dim] * num_layers
        self.linears = nn.ModuleList()
        for i in range(num_layers):
            self.linears.append(nn.Linear(dims[i], dims[i+1]))

    def forward(self, A_hat, H):
        x = H  # [C, in_dim]
        for i, lin in enumerate(self.linears):
            x = A_hat @ x          # GCN aggregation
            x = lin(x)
            if i < self.num_layers - 1:
                x = F.relu(x)
                x = self.dropout(x)
        return x  # [C, hidden_dim]


# ------------------------------------------------
# (3) 전체 classifier: TF-IDF doc → projection → dot with label GNN
# ------------------------------------------------
class TaxonomyClassifier(nn.Module):
    def __init__(self, vocab_dim, hidden_dim=256):
        super().__init__()
        # 문서 임베딩 projection matrix: V → d
        self.doc_proj = nn.Linear(vocab_dim, hidden_dim, bias=False)
        
        # 라벨 GCN
        self.label_gcn = LabelGCN(
            in_dim=vocab_dim,     # label initial features = TF-IDF class-name vector
            hidden_dim=hidden_dim,
            num_layers=2,
            dropout=0.5
        )

    def forward(self, doc_feats, label_feats, A_hat):
        """
        doc_feats: [N, V]   TF-IDF 문서 벡터
        label_feats: [C, V] TF-IDF 클래스 (name) 벡터
        A_hat: [C, C]       taxonomy
        """
        # 1) Document embedding
        doc_emb = self.doc_proj(doc_feats)     # [N, d]

        # 2) Label embedding via GCN
        label_emb = self.label_gcn(A_hat, label_feats)  # [C, d]

        # 3) Matching score 
        logits = doc_emb @ label_emb.T         # [N, C]

        return logits, doc_emb, label_emb


print("Model definitions loaded.")

In [None]:
# ================================================
# 8. ROUND 1 TRAINING WITH SILVER LABELS v1
# ================================================
from torch.utils.data import Dataset, DataLoader

# ------------------------------------------------
# (1) Dataset 정의 (sparse TF-IDF → batch마다 dense 변환)
# ------------------------------------------------
class SilverDataset(Dataset):
    def __init__(self, X_csr, y_np, pid_list=None):
        """
        X_csr : scipy.sparse CSR matrix (N, V)
        y_np  : numpy array (N, C)  -> silver_labels_v1
        pid_list : (옵션) pid 리스트, 나중에 쓸 수도 있음
        """
        self.X = X_csr
        self.y = y_np
        self.pid_list = pid_list

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # 1) sparse → dense (1, V) → (V,)
        x_dense = self.X[idx].toarray().astype("float32").squeeze(0)
        y = self.y[idx].astype("float32")
        return x_dense, y


# ------------------------------------------------
# (2) Dataset / DataLoader 생성
# ------------------------------------------------
train_dataset = SilverDataset(X_train_docs, silver_labels_v1, pid_list_train)

batch_size = 64
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,      
    pin_memory=True
)

print("Train dataset size:", len(train_dataset))
print("Batch size:", batch_size)


# ------------------------------------------------
# (3) 모델 / optimizer / loss 정의
# ------------------------------------------------
vocab_dim = X_train_docs.shape[1]
hidden_dim = 256

model = TaxonomyClassifier(vocab_dim=vocab_dim, hidden_dim=hidden_dim).to(device)

# Label initial features (GCN 입력용) : class name TF-IDF만 사용
label_feats = torch.from_numpy(
    X_class_name.toarray().astype("float32")
).to(device)   # [C, V]

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# ------------------------------------------------
# (4) Training loop (Round 1)
# ------------------------------------------------
num_epochs = 2  # 일단 가볍게 1~2 epoch 정도부터 시도

for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0.0

    for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch}"):
        batch_x = batch_x.to(device)        # [B, V]
        batch_y = batch_y.to(device)        # [B, C]

        optimizer.zero_grad()

        logits, doc_emb, label_emb = model(batch_x, label_feats, A_hat)  # [B, C]
        loss = criterion(logits, batch_y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_x.size(0)

    avg_loss = total_loss / len(train_dataset)
    print(f"[Epoch {epoch}] avg_loss = {avg_loss:.4f}")

print("Round 1 training finished.")

In [None]:
# ================================================
# 9. SELF-TRAINING: BUILD SILVER LABELS v2 FROM ROUND 1 MODEL
# ================================================

# ------------------------------------------------
# (1) Round 1 prediction 함수 (train 전체 predict)
# ------------------------------------------------
def predict_train_probs(model, X_csr, label_feats, A_hat, batch_size=64):
    """
    Train 전체 문서에 대해 확률 예측을 반환한다.
    return: probs (N_train, NUM_CLASSES)
    """
    model.eval()
    N = X_csr.shape[0]
    all_probs = []

    with torch.no_grad():
        for i in tqdm(range(0, N, batch_size), desc="Predicting Round 1"):
            X_batch = X_csr[i : i+batch_size].toarray().astype("float32")
            X_batch = torch.from_numpy(X_batch).to(device)

            logits, _, _ = model(X_batch, label_feats, A_hat)
            probs = torch.sigmoid(logits).cpu().numpy()

            all_probs.append(probs)

    return np.vstack(all_probs)


# ------------------------------------------------
# (2) Round 1 전체 예측 수행
# ------------------------------------------------
print("Predicting train probabilities (Round 1)...")

probs_train = predict_train_probs(
    model,
    X_train_docs,
    label_feats,
    A_hat,
    batch_size=64
)

print("probs_train shape:", probs_train.shape)  # (N_train, 531)
print("probs range:", probs_train.min(), probs_train.max())


# ------------------------------------------------
# (3) Silver labels v2 생성
#     adaptive threshold:
#       HIGH = 0.7  → confident positive
#       LOW  = 0.3  → confident negative
# ------------------------------------------------
def build_silver_labels_v2(probs, silver_v1, high=0.7, low=0.3):
    """
    probs : round 1 predicted probabilities (N, C)
    silver_v1 : previous silver labels (N, C)
    return: updated silver_v2 (N, C)
    """
    silver_v2 = silver_v1.copy()

    pos_mask = probs >= high
    neg_mask = probs <= low

    silver_v2[pos_mask] = 1.0
    silver_v2[neg_mask] = 0.0

    return silver_v2


silver_labels_v2 = build_silver_labels_v2(
    probs_train,
    silver_labels_v1,
    high=0.7,
    low=0.3
)

print("silver_labels_v2 shape:", silver_labels_v2.shape)
print("avg positives per doc:", silver_labels_v2.sum(axis=1).mean())
print("Self-training Round 2 labels ready.")

In [None]:
# ================================================
# 10. ROUND 2 TRAINING WITH SILVER LABELS v2
# ================================================
from torch.utils.data import Dataset, DataLoader

# SilverDataset 클래스는 이미 위에서 정의해 둔 것을 재사용:
# class SilverDataset(Dataset):
#     def __init__(self, X_csr, y_np, pid_list=None):
#         ...
#     def __len__(self):
#         ...
#     def __getitem__(self, idx):
#         ...

# ------------------------------------------------
# (1) Round 2용 Dataset / DataLoader 생성
# ------------------------------------------------
train_dataset_v2 = SilverDataset(X_train_docs, silver_labels_v2, pid_list_train)

batch_size_round2 = 64
train_loader_v2 = DataLoader(
    train_dataset_v2,
    batch_size=batch_size_round2,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

print("Round 2 train dataset size:", len(train_dataset_v2))
print("Round 2 batch size:", batch_size_round2)


# ------------------------------------------------
# (2) Optimizer / Loss 재설정 (모델은 Round 1에서 이어서 사용)
# ------------------------------------------------
criterion_round2 = nn.BCEWithLogitsLoss()
optimizer_round2 = torch.optim.Adam(model.parameters(), lr=1e-3)


# ------------------------------------------------
# (3) Round 2 Training Loop
# ------------------------------------------------
num_epochs_round2 = 2  # Round 2에서도 1~2 epoch 정도 돌려보는 것을 추천

for epoch in range(1, num_epochs_round2 + 1):
    model.train()
    total_loss = 0.0

    for batch_x, batch_y in tqdm(train_loader_v2, desc=f"[Round 2] Epoch {epoch}"):
        batch_x = batch_x.to(device)   # [B, V]
        batch_y = batch_y.to(device)   # [B, C]

        optimizer_round2.zero_grad()

        # label_feats, A_hat, model 은 Round 1에서 이미 정의된 것을 사용
        logits, doc_emb, label_emb = model(batch_x, label_feats, A_hat)  # [B, C]
        loss = criterion_round2(logits, batch_y)

        loss.backward()
        optimizer_round2.step()

        total_loss += loss.item() * batch_x.size(0)

    avg_loss = total_loss / len(train_dataset_v2)
    print(f"[Round 2 - Epoch {epoch}] avg_loss = {avg_loss:.4f}")

print("Round 2 training finished")

In [None]:
# ================================================
# 11. TEST PREDICTION + SUBMISSION FILE
# ================================================

def predict_test_probs(model, X_csr, label_feats, A_hat, batch_size=64):
    """
    Test 전체 문서에 대해 확률 예측을 반환한다.
    return: probs (N_test, NUM_CLASSES)
    """
    model.eval()
    N = X_csr.shape[0]
    all_probs = []

    with torch.no_grad():
        for i in tqdm(range(0, N, batch_size), desc="Predicting Test Set"):
            X_batch = X_csr[i : i+batch_size].toarray().astype("float32")
            X_batch = torch.from_numpy(X_batch).to(device)

            logits, _, _ = model(X_batch, label_feats, A_hat)
            probs = torch.sigmoid(logits).cpu().numpy()

            all_probs.append(probs)

    return np.vstack(all_probs)


print("Predicting on test corpus...")
probs_test = predict_test_probs(
    model,
    X_test_docs,
    label_feats,
    A_hat,
    batch_size=64
)

print("probs_test shape:", probs_test.shape)  # (N_test, 531)


# ------------------------------------------------
# (2) 문서당 Top-K label 선택
# ------------------------------------------------

MIN_LABELS = 2
MAX_LABELS = 3

def pick_labels(prob_row, min_k=1, max_k=3):
    """
    한 문서의 확률벡터에서 top-K 라벨 선택.
    """
    # 확률이 높은 class 순으로 정렬
    sorted_idx = np.argsort(prob_row)[::-1]  # 내림차순
    topk = sorted_idx[:max_k]
    return list(topk)


# ------------------------------------------------
# (3) submission.csv 파일 생성
# ------------------------------------------------
print("Generating submission.csv...")

with open(SUBMISSION_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["pid", "labels"])  # header

    for i, pid in enumerate(pid_list_test):
        labels = pick_labels(probs_test[i], MIN_LABELS, MAX_LABELS)
        label_str = ",".join(map(str, labels))
        writer.writerow([pid, label_str])

print("Submission file saved to:", SUBMISSION_PATH)
print("Total test samples:", len(pid_list_test))
print("Done!")

In [None]:
# ------------------------
# Dummy baseline for Kaggle submission
# Generates random multi-label predictions
# ------------------------
import os
import csv
import random
from tqdm import tqdm

# --- Paths ---
TEST_DIR = "Amazon_products/test"  # modify if needed
TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
SUBMISSION_PATH = "submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0–530)
MIN_LABELS = 1     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# --- Load test corpus ---
def load_corpus(path):
    """Load test corpus into {pid: text} dictionary."""
    pid2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

pid2text_test = load_corpus(TEST_CORPUS_PATH)
pid_list_test = list(pid2text_test.keys())

# --- Generate random predictions ---
all_pids, all_labels = [], []
for pid in tqdm(pid_list_test, desc="Generating dummy predictions"):
    n_labels = random.randint(MIN_LABELS, MAX_LABELS)
    labels = random.sample(range(NUM_CLASSES), n_labels)
    labels = sorted(labels)
    all_pids.append(pid)
    all_labels.append(labels)

# --- Save submission file ---
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["pid", "labels"])
    for pid, labels in zip(all_pids, all_labels):
        writer.writerow([pid, ",".join(map(str, labels))])

print(f"Dummy submission file saved to: {SUBMISSION_PATH}")
print(f"Total samples: {len(all_pids)}, Classes per sample: {MIN_LABELS}-{MAX_LABELS}")

Generating dummy predictions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19658/19658 [00:00<00:00, 190266.11it/s]

Dummy submission file saved to: submission.csv
Total samples: 19658, Classes per sample: 1-3



