In [1]:
# ================================================
# 1. REPRODUCIBILITY SETTINGS 
# ================================================
from pathlib import Path

import random
import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
import copy

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# ----- Reproducibility -----
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# ----- Device -----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ================================================
# 2. DEFAULT PATHS FOR DATASET
# ================================================
ROOT = Path("Amazon_products")   # dataset root directory

# Main corpus
TRAIN_CORPUS_PATH = ROOT / "train" /  "train_corpus.txt"       # pid \t text
TEST_CORPUS_PATH  = ROOT / "test" / "test_corpus.txt"        # pid \t text

# Taxonomy & class meta
CLASSES_PATH      = ROOT / "classes.txt"            # class_id \t class_name
HIERARCHY_PATH    = ROOT / "class_hierarchy.txt"    # parent_id \t child_id
KEYWORDS_PATH     = ROOT / "class_related_keywords.txt"

# Constants
NUM_CLASSES = 531
MIN_LABELS = 1     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# Check paths
print("\n== Data path check ==")
for p in [TRAIN_CORPUS_PATH, TEST_CORPUS_PATH,
          CLASSES_PATH, HIERARCHY_PATH, KEYWORDS_PATH]:
    print(f"{p} -> {p.exists()}")

Device: cuda

== Data path check ==
Amazon_products/train/train_corpus.txt -> True
Amazon_products/test/test_corpus.txt -> True
Amazon_products/classes.txt -> True
Amazon_products/class_hierarchy.txt -> True
Amazon_products/class_related_keywords.txt -> True


In [2]:
# ================================================
# 2. DATA LOADING
# ================================================

def load_corpus(path):
    """
    Load corpus file (pid \\t text) as {pid: text} dictionary.
    """
    pid2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

# Load corpus
print("Loading train/test corpus...")

pid2text_train = load_corpus(TRAIN_CORPUS_PATH)
pid2text_test  = load_corpus(TEST_CORPUS_PATH)

pid_list_train = list(pid2text_train.keys())
pid_list_test  = list(pid2text_test.keys())

print("Train samples:", len(pid2text_train))
print("Test samples :", len(pid2text_test))

# Quick sample check
for i, (pid, text) in enumerate(pid2text_train.items()):
    print(f"Example train sample #{i}: pid={pid}, text={text[:80]}...")
    break

Loading train/test corpus...
Train samples: 29487
Test samples : 19658
Example train sample #0: pid=0, text=omron hem 790it automatic blood pressure monitor with advanced omron health mana...


In [3]:
# ================================================
# 3. CLASS METADATA LOADING
# ================================================

def load_classes(path):
    """
    classes.txt : class_id \\t class_name
    returns: id2label, label2id
    """
    id2label = {}
    label2id = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            cid, name = parts
            cid = int(cid)
            id2label[cid] = name
            label2id[name] = cid
    return id2label, label2id


def load_hierarchy(path):
    """
    class_hierarchy.txt : parent_id \\t child_id
    returns: edges (list of tuples)
    """
    edges = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            p, c = map(int, parts)
            edges.append((p, c))
    return edges


def load_keywords(path, label2id):
    """
    class_related_keywords.txt : CLASS_NAME: kw1, kw2,...
    returns: {class_id: [kws]}
    """
    d = {cid: [] for cid in label2id.values()}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue
            name, kws = line.strip().split(":", 1)
            kws = [k.strip() for k in kws.split(",") if k.strip()]
            if name in label2id:
                cid = label2id[name]
                d[cid] = kws
    return d


# ----------------- Load all class meta -----------------
print("Loading class metadata...")

id2label, label2id = load_classes(CLASSES_PATH)
edges = load_hierarchy(HIERARCHY_PATH)
label_keywords = load_keywords(KEYWORDS_PATH, label2id)

print("Num classes:", len(id2label))
print("Num edges in taxonomy:", len(edges))
print()

# Small check
example_id = 0
print("Example class id:", example_id)
print("Name:", id2label[example_id])
print("Keywords:", label_keywords[example_id])

Loading class metadata...
Num classes: 531
Num edges in taxonomy: 568

Example class id: 0
Name: grocery_gourmet_food
Keywords: ['snacks', 'condiments', 'beverages', 'specialty_foods', 'spices', 'cooking_oils', 'baking_ingredients', 'gourmet_chocolates', 'artisanal_cheeses', 'organic_foods']


In [4]:
# ================================================
# 4. TF-IDF EMBEDDING + DOC–CLASS SIMILARITY
#    (class name vs keyword 분리 활용)
# ================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# ------------------------------------------------
# (1) 클래스 이름 텍스트 / 키워드 텍스트 분리 생성
# ------------------------------------------------
def build_class_name_texts(id2label):
    """
    각 class_id에 대해 클래스 이름만 사용한 텍스트 리스트 생성
    """
    texts = []
    for cid in range(NUM_CLASSES):
        name = id2label[cid].replace("_", " ")
        texts.append(name)
    return texts

def build_class_keyword_texts(label_keywords, repeat=3):
    texts = []
    for cid in range(NUM_CLASSES):
        kws_list = label_keywords.get(cid, [])
        if not kws_list:
            texts.append("")
            continue
        kws = " ".join(kws_list)
        # 키워드 텍스트를 반복해서 TF-IDF 상에서 weight 증가
        text = (" " + kws) * repeat
        texts.append(text.strip())
    return texts


class_name_texts = build_class_name_texts(id2label)        
class_kw_texts = build_class_keyword_texts(label_keywords, repeat=3)    

print("Example class name text:", class_name_texts[0])
print("Example class keyword text:", class_kw_texts[0])


# ------------------------------------------------
# (2) TF-IDF vocabulary 학습
#     - train 문서 + test 문서 + class name + class keyword 모두 포함
# ------------------------------------------------
all_texts_for_vocab = (
    list(pid2text_train.values())
    + list(pid2text_test.values())
    + class_name_texts
    + class_kw_texts
)

vectorizer = TfidfVectorizer(
    max_features=100_000,
    ngram_range=(1, 2),
    min_df=1,
    sublinear_tf=True
)

vectorizer.fit(all_texts_for_vocab)

print("Vocabulary size:", len(vectorizer.vocabulary_))


# ------------------------------------------------
# (3) 실제 TF-IDF 행렬 변환
# ------------------------------------------------
N_train = len(pid2text_train)
N_test  = len(pid2text_test)
C       = NUM_CLASSES

# 문서 TF-IDF
X_train_docs = vectorizer.transform(pid2text_train.values())   # [N_train, V]
X_test_docs  = vectorizer.transform(pid2text_test.values())    # [N_test, V]

# 클래스 이름 TF-IDF (GNN용 initial feature)
X_class_name = vectorizer.transform(class_name_texts)          # [C, V]

# 클래스 키워드 TF-IDF (silver label용 보조 sim)
X_class_kw   = vectorizer.transform(class_kw_texts)            # [C, V]

print("X_train_docs:", X_train_docs.shape)
print("X_test_docs :", X_test_docs.shape)
print("X_class_name:", X_class_name.shape)
print("X_class_kw  :", X_class_kw.shape)


# ------------------------------------------------
# (4) L2 정규화 (코사인 유사도용)
# ------------------------------------------------
print("\nNormalizing TF-IDF vectors...")
X_train_norm      = normalize(X_train_docs, axis=1)
X_class_name_norm = normalize(X_class_name, axis=1)
X_class_kw_norm   = normalize(X_class_kw, axis=1)


# ------------------------------------------------
# (5) 이름 기반 / 키워드 기반 doc–class 유사도 계산
# ------------------------------------------------
print("\nComputing doc–class cosine similarities...")

# 이름 기반 similarity
sims_name = (X_train_norm @ X_class_name_norm.T).toarray().astype("float32")  # [N_train, C]

# 키워드 기반 similarity
sims_kw   = (X_train_norm @ X_class_kw_norm.T).toarray().astype("float32")    # [N_train, C]

print("sims_name shape:", sims_name.shape, "| min/max:", sims_name.min(), sims_name.max())
print("sims_kw   shape:", sims_kw.shape,   "| min/max:", sims_kw.min(),   sims_kw.max())


# ------------------------------------------------
# (6) 두 채널을 가중합하여 최종 similarity 생성
#     - alpha: class name 비중
#     - beta : keyword 비중
# ------------------------------------------------
alpha = 0.3  # class name 중요도
beta  = 0.7  # keyword 중요도

sims = alpha * sims_name + beta * sims_kw   # [N_train, C]

print("\nFinal sims shape:", sims.shape)
print("Final sims min/max:", sims.min(), sims.max())
print("Done.")

Example class name text: grocery gourmet food
Example class keyword text: snacks condiments beverages specialty_foods spices cooking_oils baking_ingredients gourmet_chocolates artisanal_cheeses organic_foods snacks condiments beverages specialty_foods spices cooking_oils baking_ingredients gourmet_chocolates artisanal_cheeses organic_foods snacks condiments beverages specialty_foods spices cooking_oils baking_ingredients gourmet_chocolates artisanal_cheeses organic_foods
Vocabulary size: 100000
X_train_docs: (29487, 100000)
X_test_docs : (19658, 100000)
X_class_name: (531, 100000)
X_class_kw  : (531, 100000)

Normalizing TF-IDF vectors...

Computing doc–class cosine similarities...
sims_name shape: (29487, 531) | min/max: 0.0 0.5831636
sims_kw   shape: (29487, 531) | min/max: 0.0 0.40836385

Final sims shape: (29487, 531)
Final sims min/max: 0.0 0.2858547
Done.


In [5]:
# ================================================
# 5. TOP-DOWN CLASS EXPLORATION (TaxoClass style)
# ================================================

from collections import defaultdict

# ------------------------------------------------
# (1) adjacency (parent2child, child2parent) 만들기
# ------------------------------------------------
parent2child = defaultdict(list)
child2parent = defaultdict(list)

for p, c in edges:   # edges = [(parent, child), ...]
    parent2child[p].append(c)
    child2parent[c].append(p)


# ------------------------------------------------
# (2) Top-Down 탐색 함수
# ------------------------------------------------
def topdown_candidates(doc_idx, sims_row, root=0, max_depth=5, base_k=2):
    """
    문서-클래스 유사도 sims_row 를 이용하여
    taxonomy 기반 top-down class 후보를 탐색.

    Return:
        candidates (set): 후보 class IDs
        path_score (dict): 각 class의 path score
    """
    path_score = {root: 1.0}   # Root의 path score = 1
    level_nodes = [root]
    visited = set([root])

    for depth in range(max_depth):
        # 현재 level의 child 후보 모으기
        cand = []
        for node in level_nodes:
            for ch in parent2child.get(node, []):
                if ch not in visited:
                    cand.append(ch)

        # 더 이상 확장할 child가 없으면 종료
        if not cand:
            break

        # child path score 계산
        for ch in cand:
            parents = child2parent.get(ch, [])
            if not parents:
                continue
            # parent 중 path_score*sim 이 가장 높은 parent 선택
            ps = max(
                path_score[p] * sims_row[ch]
                for p in parents
                if p in path_score
            )
            path_score[ch] = ps

        # path_score 기준 상위 k개 선택
        k = base_k + depth    # depth=0→2, depth=1→3, ...
        cand_sorted = sorted(
            cand,
            key=lambda x: path_score.get(x, 0.0),
            reverse=True
        )
        level_nodes = cand_sorted[:k]

        visited.update(level_nodes)

    # root 제외
    candidates = visited - {root}
    return candidates, path_score


print("Top-Down exploration module loaded.")

Top-Down exploration module loaded.


In [6]:
# ================================================
# 6. CORE CLASS MINING + SILVER LABEL v1 (WITH MASK)
# ================================================
import numpy as np
from tqdm import tqdm

# ---------------------------
# (1) core class 함수 (margin 완화 + fallback 포함)
# ---------------------------
def get_core_classes_for_doc(sims_row, candidates, top_m=3, margin=0.0):
    """
    sims_row : shape (NUM_CLASSES,)
    candidates : topdown_candidates()로 얻은 후보 class 집합
    top_m : core class 최대 개수
    margin : parent/sibling보다 얼마나 더 커야 core로 인정할지
    """
    confs = []  # (cid, conf)

    for c in candidates:
        parents = child2parent.get(c, [])
        sibs = set()
        for p in parents:
            sibs.update(parent2child.get(p, []))
        sibs.discard(c)

        base_sim = 0.0
        idxs = list(parents) + list(sibs)
        if idxs:
            base_sim = sims_row[idxs].max()

        conf = float(sims_row[c] - base_sim)
        confs.append((c, conf))

    # margin 이상인 것만
    good = [(c, conf) for c, conf in confs if conf > margin]
    good.sort(key=lambda x: x[1], reverse=True)
    core = [c for c, _ in good[:top_m]]

    # fallback: core가 비면 sims_row 기준 top-1 후보를 core로
    if not core and len(candidates) > 0:
        best_c = max(candidates, key=lambda cid: sims_row[cid])
        core = [best_c]

    return core


# ---------------------------
# (2) silver v1 + mask 생성
# ---------------------------
def build_silver_labels_v1_with_mask(
    sims,
    max_depth=5,
    base_k=2,
    top_m=3,
    margin=0.0
):
    """
    sims: (N_train, NUM_CLASSES)
    return:
        silver_y : (N, C) float32  (0/1)
        silver_m : (N, C) float32  (0/1, mask)
    - 문서별 후보 집합 Q_i 안에서만 0/1 라벨을 정의
    - Q_i 밖은 mask=0 → loss에서 제외
    """
    N_train = sims.shape[0]
    silver_y = np.zeros((N_train, NUM_CLASSES), dtype=np.float32)
    silver_m = np.zeros((N_train, NUM_CLASSES), dtype=np.float32)

    for i in tqdm(range(N_train), desc="Building silver labels v1 (with mask)"):
        sims_row = sims[i]

        # 1) Top-Down candidates
        candidates, _ = topdown_candidates(
            doc_idx=i,
            sims_row=sims_row,
            root=0,
            max_depth=max_depth,
            base_k=base_k,
        )

        # 후보가 없다면 sims top_m을 후보로 사용
        if not candidates:
            topk = np.argsort(sims_row)[::-1][:top_m]
            candidates = set(topk)

        # 2) core class 선택
        cores = get_core_classes_for_doc(
            sims_row,
            candidates,
            top_m=top_m,
            margin=margin,
        )

        # 그래도 core가 없다면 sims top-1을 core로
        if not cores:
            best_c = int(np.argmax(sims_row))
            cores = [best_c]

        # 3) positive set = core + parents(core)
        pos = set()
        for c in cores:
            pos.add(c)
            for p in child2parent.get(c, []):
                pos.add(p)

        # 4) candidate set Q_i = candidates + parents(candidates)
        Q = set(candidates)
        for c in list(candidates):
            for p in child2parent.get(c, []):
                Q.add(p)

        # pos는 반드시 Q 안에 있어야 함
        Q.update(pos)

        # 5) Q_i 안에서만 0/1 라벨 정의
        for c in Q:
            silver_m[i, c] = 1.0              # 이 클래스는 supervision 대상
            silver_y[i, c] = 1.0 if c in pos else 0.0

    return silver_y, silver_m


print("Generating silver_labels_v1 (with mask) using sims...")

silver_labels_v1, silver_mask_v1 = build_silver_labels_v1_with_mask(
    sims,
    max_depth=5,
    base_k=2,
    top_m=3,
    margin=0.0,   # 일단 margin=0으로 두고, 나중에 조정 가능
)

print("silver_labels_v1 shape:", silver_labels_v1.shape)
print("silver_mask_v1   shape:", silver_mask_v1.shape)

pos_per_doc = silver_labels_v1.sum(axis=1)
sup_per_doc = silver_mask_v1.sum(axis=1)
print("avg positives per doc:", pos_per_doc.mean())
print("avg supervised classes per doc (|Q_i|):", sup_per_doc.mean())
print("docs with 0 supervised classes:", (sup_per_doc == 0).sum())
print("Done.")

Generating silver_labels_v1 (with mask) using sims...


Building silver labels v1 (with mask): 100%|██████████| 29487/29487 [00:02<00:00, 12504.69it/s]

silver_labels_v1 shape: (29487, 531)
silver_mask_v1   shape: (29487, 531)
avg positives per doc: 2.1752975
avg supervised classes per doc (|Q_i|): 6.76939
docs with 0 supervised classes: 0
Done.





In [7]:
# ================================================
# 7. LABEL-GCN + DOCUMENT-CLASS CLASSIFIER
# ================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


# ------------------------------------------------
# (1) Build normalized adjacency A_hat for GCN
# ------------------------------------------------
def build_normalized_adj(num_classes, edges):
    """
    edges: [(parent, child), ...]
    출력: A_hat (torch.FloatTensor, [C,C])
    """
    import numpy as np

    A = np.zeros((num_classes, num_classes), dtype=np.float32)

    # parent-child 연결을 양방향으로 넣기
    for p, c in edges:
        A[p, c] = 1.0
        A[c, p] = 1.0

    # self-loop
    np.fill_diagonal(A, 1.0)

    # D^{-1/2} * A * D^{-1/2}
    deg = A.sum(axis=1)
    deg_inv_sqrt = np.power(deg, -0.5)
    deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.0
    D_inv_sqrt = np.diag(deg_inv_sqrt)

    A_hat = D_inv_sqrt @ A @ D_inv_sqrt
    return torch.from_numpy(A_hat).float()


A_hat = build_normalized_adj(NUM_CLASSES, edges).to(device)
print("A_hat built:", A_hat.shape)


# ------------------------------------------------
# (2) Label Encoder: GCN
# ------------------------------------------------
class LabelGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim=256, num_layers=2, dropout=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        
        dims = [in_dim] + [hidden_dim] * num_layers
        self.linears = nn.ModuleList()
        for i in range(num_layers):
            self.linears.append(nn.Linear(dims[i], dims[i+1]))

    def forward(self, A_hat, H):
        x = H  # [C, in_dim]
        for i, lin in enumerate(self.linears):
            x = A_hat @ x          # GCN aggregation
            x = lin(x)
            if i < self.num_layers - 1:
                x = F.relu(x)
                x = self.dropout(x)
        return x  # [C, hidden_dim]


# ------------------------------------------------
# (3) 전체 classifier: TF-IDF doc → projection → dot with label GNN
# ------------------------------------------------
class TaxonomyClassifier(nn.Module):
    def __init__(self, vocab_dim, hidden_dim=256):
        super().__init__()
        # 문서 임베딩 projection matrix: V → d
        self.doc_proj = nn.Linear(vocab_dim, hidden_dim, bias=False)
        
        # 라벨 GCN
        self.label_gcn = LabelGCN(
            in_dim=vocab_dim,     # label initial features = TF-IDF class-name vector
            hidden_dim=hidden_dim,
            num_layers=2,
            dropout=0.5
        )

    def forward(self, doc_feats, label_feats, A_hat):
        """
        doc_feats: [N, V]   TF-IDF 문서 벡터
        label_feats: [C, V] TF-IDF 클래스 (name) 벡터
        A_hat: [C, C]       taxonomy
        """
        # 1) Document embedding
        doc_emb = self.doc_proj(doc_feats)     # [N, d]

        # 2) Label embedding via GCN
        label_emb = self.label_gcn(A_hat, label_feats)  # [C, d]

        # 3) Matching score (bilinear의 단순 버전)
        logits = doc_emb @ label_emb.T         # [N, C]

        return logits, doc_emb, label_emb


print("Model definitions loaded.")

A_hat built: torch.Size([531, 531])
Model definitions loaded.


In [None]:
# ================================================
# 8. ROUND 1 TRAINING (MASKED BCE WITH SILVER v1)
# ================================================
from torch.utils.data import Dataset, DataLoader

class SilverDatasetWithMask(Dataset):
    def __init__(self, X_csr, y_np, m_np, pid_list=None):
        self.X = X_csr
        self.y = y_np
        self.m = m_np
        self.pid_list = pid_list

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x_dense = self.X[idx].toarray().astype("float32").squeeze(0)
        y = self.y[idx].astype("float32")
        m = self.m[idx].astype("float32")
        return x_dense, y, m


train_dataset = SilverDatasetWithMask(X_train_docs, silver_labels_v1, silver_mask_v1, pid_list_train)

batch_size = 64
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

print("Train dataset size:", len(train_dataset))
print("Batch size:", batch_size)

vocab_dim = X_train_docs.shape[1]
hidden_dim = 256

model = TaxonomyClassifier(vocab_dim=vocab_dim, hidden_dim=hidden_dim).to(device)

label_feats = torch.from_numpy(
    X_class_name.toarray().astype("float32")
).to(device)

criterion = nn.BCEWithLogitsLoss(reduction='none')  # <-- 중요: reduction='none'
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 2

for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0.0
    total_count = 0.0   # 실제 supervision에 사용된 (doc, class) 개수

    for batch_x, batch_y, batch_m in tqdm(train_loader, desc=f"[Round 1] Epoch {epoch}"):
        batch_x = batch_x.to(device)    # [B, V]
        batch_y = batch_y.to(device)    # [B, C]
        batch_m = batch_m.to(device)    # [B, C]

        optimizer.zero_grad()

        logits, doc_emb, label_emb = model(batch_x, label_feats, A_hat)  # [B, C]
        loss_raw = criterion(logits, batch_y)  # [B, C]

        # mask 적용: Q_i 안에서만 loss 계산
        loss_masked = loss_raw * batch_m
        loss = loss_masked.sum() / (batch_m.sum() + 1e-8)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_m.sum().item()
        total_count += batch_m.sum().item()

    avg_loss = total_loss / (total_count + 1e-8)
    print(f"[Round 1] Epoch {epoch} avg_loss = {avg_loss:.6f}")

print("Round 1 training (masked BCE) finished.")

Train dataset size: 29487
Batch size: 64


[Round 1] Epoch 1: 100%|█████████▉| 459/461 [00:17<00:00, 26.43it/s]

In [None]:
# ================================================
# 9. SELF-TRAINING: SILVER v2 + MASK v2 (STILL MASKED)
# ================================================

def predict_train_probs(model, X_csr, label_feats, A_hat, batch_size=64):
    model.eval()
    N = X_csr.shape[0]
    all_probs = []
    with torch.no_grad():
        for i in tqdm(range(0, N, batch_size), desc="Predicting Train (Round 1)"):
            X_batch = X_csr[i : i+batch_size].toarray().astype("float32")
            X_batch = torch.from_numpy(X_batch).to(device)
            logits, _, _ = model(X_batch, label_feats, A_hat)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)
    return np.vstack(all_probs)

print("Predicting train probabilities (Round 1)...")
probs_train = predict_train_probs(
    model,
    X_train_docs,
    label_feats,
    A_hat,
    batch_size=64
)
print("probs_train shape:", probs_train.shape)

def build_silver_labels_v2_with_mask(
    probs,
    silver_v1,
    mask_v1,
    high=0.75,
    low=0.25
):
    """
    probs  : (N, C)
    silver_v1 : (N, C)
    mask_v1   : (N, C)
    return: silver_v2, mask_v2
    - high 이상: 1, mask=1
    - low  이하: 0, mask=1
    - 그 사이: 이전 silver_v1 / mask_v1 유지
    """
    silver_v2 = silver_v1.copy()
    mask_v2   = mask_v1.copy()

    high_mask = (probs >= high)
    low_mask  = (probs <= low)

    silver_v2[high_mask] = 1.0
    mask_v2[high_mask]   = 1.0

    silver_v2[low_mask] = 0.0
    mask_v2[low_mask]   = 1.0

    return silver_v2, mask_v2

silver_labels_v2, silver_mask_v2 = build_silver_labels_v2_with_mask(
    probs_train,
    silver_labels_v1,
    silver_mask_v1,
    high=0.6,   # 살짝 완화
    low=0.2
)

pos_per_doc_v2 = silver_labels_v2.sum(axis=1)
sup_per_doc_v2 = silver_mask_v2.sum(axis=1)
print("[v2] avg positives per doc:", pos_per_doc_v2.mean())
print("[v2] avg supervised classes per doc:", sup_per_doc_v2.mean())
print("[v2] docs with 0 supervised classes:", (sup_per_doc_v2 == 0).sum())
print("Self-training Round 2 labels ready.")

In [None]:
# ================================================
# 10. ROUND 2 TRAINING (MASKED BCE WITH SILVER v2)
# ================================================
train_dataset_v2 = SilverDatasetWithMask(X_train_docs, silver_labels_v2, silver_mask_v2, pid_list_train)

batch_size_round2 = 64
train_loader_v2 = DataLoader(
    train_dataset_v2,
    batch_size=batch_size_round2,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

criterion_round2 = nn.BCEWithLogitsLoss(reduction='none')
optimizer_round2 = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs_round2 = 2

for epoch in range(1, num_epochs_round2 + 1):
    model.train()
    total_loss = 0.0
    total_count = 0.0

    for batch_x, batch_y, batch_m in tqdm(train_loader_v2, desc=f"[Round 2] Epoch {epoch}"):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        batch_m = batch_m.to(device)

        optimizer_round2.zero_grad()
        logits, doc_emb, label_emb = model(batch_x, label_feats, A_hat)
        loss_raw = criterion_round2(logits, batch_y)

        loss_masked = loss_raw * batch_m
        loss = loss_masked.sum() / (batch_m.sum() + 1e-8)

        loss.backward()
        optimizer_round2.step()

        total_loss += loss.item() * batch_m.sum().item()
        total_count += batch_m.sum().item()

    avg_loss = total_loss / (total_count + 1e-8)
    print(f"[Round 2] Epoch {epoch}] avg_loss = {avg_loss:.6f}")

print("Round 2 training (masked BCE) finished.")

In [None]:
# ================================================
# 11. FINAL TEST PREDICTION + SUBMISSION
#     (model + TF-IDF sims blending, 1~3 labels)
# ================================================
import csv
from tqdm import tqdm
from sklearn.preprocessing import normalize
import numpy as np

SUBMISSION_PATH = "submission.csv"  # output file

# ------------------------------------------------
# 1) Round 2 모델 기반 test 확률이 없다면 먼저 예측
#    (이미 probs_test가 있다면 이 블록은 건너뛰어도 됨)
# ------------------------------------------------
def predict_test_probs(model, X_csr, label_feats, A_hat, batch_size=64):
    model.eval()
    N = X_csr.shape[0]
    all_probs = []
    with torch.no_grad():
        for i in tqdm(range(0, N, batch_size), desc="Predicting test probs"):
            X_batch = X_csr[i : i+batch_size].toarray().astype("float32")
            X_batch = torch.from_numpy(X_batch).to(device)
            logits, _, _ = model(X_batch, label_feats, A_hat)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)
    return np.vstack(all_probs)

if "probs_test" not in globals():
    print("probs_test not found in globals, computing...")
    probs_test = predict_test_probs(
        model,
        X_test_docs,
        label_feats,
        A_hat,
        batch_size=64
    )

print("probs_test shape:", probs_test.shape)


# ------------------------------------------------
# 2) TF-IDF 기반 sims_test 계산 (doc–class similarity)
# ------------------------------------------------
print("Computing TF-IDF-based sims for test set...")

# test 문서 정규화
X_test_norm = normalize(X_test_docs, axis=1)

# 클래스 이름 / 키워드 정규화 행렬은 4번 셀에서 이미 있음:
# X_class_name_norm, X_class_kw_norm
sims_name_test = (X_test_norm @ X_class_name_norm.T).toarray().astype("float32")
sims_kw_test   = (X_test_norm @ X_class_kw_norm.T).toarray().astype("float32")

# 이름 vs 키워드 비중
alpha = 0.3  # class name weight
beta  = 0.7  # keyword weight

sims_test = alpha * sims_name_test + beta * sims_kw_test
print("sims_test shape:", sims_test.shape)


# ------------------------------------------------
# 3) MODEL + SIMS BLENDING
# ------------------------------------------------
lambda_model = 0.6
lambda_sims  = 0.4

final_scores = lambda_model * probs_test + lambda_sims * sims_test
print("final_scores shape:", final_scores.shape)


# ------------------------------------------------
# 4) 동적 라벨 선택 함수
#    - threshold 이상인 클래스만 후보
#    - 없으면 top-1
#    - 너무 많으면 상위 MAX_LABELS개
#    - MIN_LABELS, MAX_LABELS는 baseline에서 정의한 값 사용
# ------------------------------------------------
def pick_labels(score_row, min_k=1, max_k=3, threshold=0.6):
    """
    score_row: shape (C,) - 한 문서에 대한 클래스 점수 벡터
    min_k    : 최소 라벨 개수 (보통 1)
    max_k    : 최대 라벨 개수 (보통 3)
    threshold: 이 값 이상인 클래스만 '유망하다'고 보고 우선 선택
    """
    # 1) threshold 이상인 클래스만 후보로 뽑기
    cand = np.where(score_row >= threshold)[0]

    # 2) threshold 넘는 게 하나도 없으면 → top-1만 선택
    if len(cand) == 0:
        top1 = int(np.argmax(score_row))
        return [top1]

    # 3) 후보가 너무 많으면 → 점수 순으로 max_k개만 남기기
    if len(cand) > max_k:
        sorted_idx = cand[np.argsort(score_row[cand])[::-1]]
        cand = sorted_idx[:max_k]

    # 4) 후보가 min_k보다 적으면 (예: cand 1개인데 min_k=2) → top-score에서 채워서 min_k까지 맞추기
    if len(cand) < min_k:
        sorted_idx = np.argsort(score_row)[::-1]
        for c in sorted_idx:
            if c not in cand:
                cand = np.append(cand, c)
                if len(cand) == min_k:
                    break

    return list(map(int, cand))


# ------------------------------------------------
# 5) baseline 스타일로 submission.csv 생성
# ------------------------------------------------
all_pids, all_labels = [], []

print("Generating final predictions for submission...")
for i, pid in enumerate(tqdm(pid_list_test, desc="Scoring test instances")):
    scores = final_scores[i]
    labels = pick_labels(
        scores,
        min_k=MIN_LABELS,
        max_k=MAX_LABELS,
        threshold=0.6,  # 필요하면 조정 가능
    )
    labels = sorted(labels)
    all_pids.append(pid)
    all_labels.append(labels)

# --- Save submission file (baseline 포맷 그대로) ---
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "labels"])
    for pid, labels in zip(all_pids, all_labels):
        writer.writerow([pid, ",".join(map(str, labels))])

print(f"Submission file saved to: {SUBMISSION_PATH}")
print(f"Total samples: {len(all_pids)}, Classes per sample: {MIN_LABELS}-{MAX_LABELS} (dynamic)")

In [None]:
lengths = [len(lbls) for lbls in all_labels]
print("min labels per doc:", min(lengths))
print("max labels per doc:", max(lengths))

unique, counts = np.unique(lengths, return_counts=True)
print("label count distribution:")
for u, c in zip(unique, counts):
    print(f"{u} labels: {c} docs")

In [None]:
# 서로 다른 두 개 train 샘플에 대해 probs 비교
print(probs_test[0][:10])
print(probs_test[1][:10])

# 혹은 표준편차
print("std over classes (sample 0):", probs_test[0].std())
print("std over classes (sample 1):", probs_test[1].std())

In [None]:
# silver v1 통계
pos_per_doc_v1 = silver_labels_v1.sum(axis=1)
print("[v1] avg positives per doc:", pos_per_doc_v1.mean())
print("[v1] min positives per doc:", pos_per_doc_v1.min())
print("[v1] docs with 0 positives:", (pos_per_doc_v1 == 0).sum())

# silver v2 통계
pos_per_doc_v2 = silver_labels_v2.sum(axis=1)
print("[v2] avg positives per doc:", pos_per_doc_v2.mean())
print("[v2] min positives per doc:", pos_per_doc_v2.min())
print("[v2] docs with 0 positives:", (pos_per_doc_v2 == 0).sum())

In [18]:
# ------------------------
# Dummy baseline for Kaggle submission
# Generates random multi-label predictions
# ------------------------
import os
import csv
import random
from tqdm import tqdm

# --- Paths ---
TEST_DIR = "Amazon_products/test"  # modify if needed
TEST_CORPUS_PATH = os.path.join(TEST_DIR, "test_corpus.txt")  # product_id \t text
SUBMISSION_PATH = "submission.csv"  # output file

# --- Constants ---
NUM_CLASSES = 531  # total number of classes (0–530)
MIN_LABELS = 1     # minimum number of labels per sample
MAX_LABELS = 3     # maximum number of labels per sample

# --- Load test corpus ---
def load_corpus(path):
    """Load test corpus into {pid: text} dictionary."""
    pid2text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t", 1)
            if len(parts) == 2:
                pid, text = parts
                pid2text[pid] = text
    return pid2text

pid2text_test = load_corpus(TEST_CORPUS_PATH)
pid_list_test = list(pid2text_test.keys())

# --- Generate random predictions ---
all_pids, all_labels = [], []
for pid in tqdm(pid_list_test, desc="Generating dummy predictions"):
    n_labels = random.randint(MIN_LABELS, MAX_LABELS)
    labels = random.sample(range(NUM_CLASSES), n_labels)
    labels = sorted(labels)
    all_pids.append(pid)
    all_labels.append(labels)

# --- Save submission file ---
with open(SUBMISSION_PATH, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["pid", "labels"])
    for pid, labels in zip(all_pids, all_labels):
        writer.writerow([pid, ",".join(map(str, labels))])

print(f"Dummy submission file saved to: {SUBMISSION_PATH}")
print(f"Total samples: {len(all_pids)}, Classes per sample: {MIN_LABELS}-{MAX_LABELS}")

Generating dummy predictions: 100%|██████████| 19658/19658 [00:00<00:00, 312260.00it/s]

Dummy submission file saved to: submission.csv
Total samples: 19658, Classes per sample: 1-3



