In [10]:
from pathlib import Path
import os, re, json
import pandas as pd

# Resolve the dataset location relative to the notebook's working directory.
CWD = Path.cwd()
DATA_ROOT = CWD / "project_release" / "Amazon_products"

# Logical name -> file path for every input file this notebook reads.
paths = {
    "classes": DATA_ROOT / "classes.txt",
    "hier":    DATA_ROOT / "class_hierarchy.txt",
    "kw":      DATA_ROOT / "class_related_keywords.txt",
    "train":   DATA_ROOT / "train" / "train_corpus.txt",
    "test":    DATA_ROOT / "test"  / "test_corpus.txt",
}

# Print what was resolved so a wrong working directory is obvious at a glance.
print("CWD:", CWD)
print("DATA_ROOT:", DATA_ROOT, "exists:", DATA_ROOT.exists())
for k,v in paths.items():
    # The size is only queried for files that exist; missing files print size None.
    print(k, "->", v, "exists:", v.exists(), "size:", (v.stat().st_size if v.exists() else None))

# Hard check: stop here if even one required input file is missing.
missing = [k for k,v in paths.items() if not v.exists()]
if missing:
    raise FileNotFoundError(f"Missing files: {missing}")


CWD: /home/sagemaker-user
DATA_ROOT: /home/sagemaker-user/project_release/Amazon_products exists: True
classes -> /home/sagemaker-user/project_release/Amazon_products/classes.txt exists: True size: 9646
hier -> /home/sagemaker-user/project_release/Amazon_products/class_hierarchy.txt exists: True size: 4086
kw -> /home/sagemaker-user/project_release/Amazon_products/class_related_keywords.txt exists: True size: 87468
train -> /home/sagemaker-user/project_release/Amazon_products/train/train_corpus.txt exists: True size: 14125831
test -> /home/sagemaker-user/project_release/Amazon_products/test/test_corpus.txt exists: True size: 9428649


In [11]:
def head_lines(path, n=8):
    """Return up to the first ``n`` lines of a text file, without trailing newlines.

    Args:
        path: Path of the file to preview. It is opened as UTF-8 and undecodable
            bytes are replaced rather than raising.
        n: Maximum number of lines to return.

    Returns:
        A list of at most ``n`` strings. Files shorter than ``n`` lines yield
        every line they contain (an empty file yields ``[]``).
    """
    out = []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        # zip() stops as soon as either side is exhausted, so a short file simply
        # ends the loop instead of leaking StopIteration out of a bare next(f).
        for _, line in zip(range(n), f):
            out.append(line.rstrip("\n"))
    return out

# Print the first 8 lines of every input file to eyeball its format.
for k in ["classes", "hier", "kw", "train", "test"]:
    print("\n====", k, "====")
    lines = head_lines(paths[k], n=8)
    for i,l in enumerate(lines, 1):
        # Each line is cut to 250 characters so long corpus rows stay readable.
        print(f"{i:02d}: {l[:250]}")



==== classes ====
01: 0	grocery_gourmet_food
02: 1	meat_poultry
03: 2	jerky
04: 3	toys_games
05: 4	games
06: 5	puzzles
07: 6	jigsaw_puzzles
08: 7	board_games

==== hier ====
01: 0	1
02: 0	8
03: 0	208
04: 0	211
05: 0	213
06: 0	216
07: 0	229
08: 0	255

==== kw ====
01: grocery_gourmet_food:snacks,condiments,beverages,specialty_foods,spices,cooking_oils,baking_ingredients,gourmet_chocolates,artisanal_cheeses,organic_foods
02: meat_poultry:butcher,cuts,marination,grilling,roasting,seasoning,halal,organic,deli,marbling
03: jerky:beef,turkey,chicken,venison,buffalo,kangaroo,elk,ostrich,bison,spicy
04: toys_games:board_games,puzzles,action_figures,building_blocks,dolls,outdoor_toys,educational_toys,card_games,remote_control_toys,plush_toys
05: games:board_games,card_games,tabletop_games,party_games,roleplaying_games,video_games,strategy_games,family_games,word_games,dice_games
06: puzzles:jigsaw_puzzles,brain_teasers,puzzle_accessories,puzzle_storage,puzzle_mats,puzzle_glue,puzzle_organizers

In [12]:
def parse_classes(lines):
    """Parse class-definition lines into a DataFrame of contiguous class indices.

    Each non-blank line is either ``"<int id><tab or comma><name>"`` or a bare
    name. If at least 80% of the parsed rows carry an integer id, the file is
    treated as id-keyed: every distinct raw id gets the next free ``class_idx``
    in order of first appearance, and a repeated raw id reuses its index.
    Otherwise rows are simply numbered by position.

    Rows that have no id inside an id-keyed file each receive their own fresh
    index. (Previously they all shared one ``None`` slot, which silently merged
    unrelated classes under a duplicate ``class_idx``.)

    Args:
        lines: Iterable of raw text lines.

    Returns:
        pandas.DataFrame with columns ``class_idx``, ``raw_id`` (``None`` where
        the line had no id) and ``class_name`` (whitespace-stripped).
    """
    rows = []
    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        m = re.match(r"^\s*(\d+)\s*[\t,]\s*(.+?)\s*$", s)
        if m:
            rows.append((int(m.group(1)), m.group(2).strip()))
        else:
            # No leading integer id: keep the whole line as the class name.
            rows.append((None, s))

    has_ids = sum(raw_id is not None for raw_id, _ in rows) >= (len(rows) * 0.8)

    if has_ids:
        seen = {}      # raw_id -> assigned class_idx
        mapped = []
        next_idx = 0   # next unused class_idx
        for raw_id, name in rows:
            if raw_id is None:
                # Id-less rows are distinct classes; never let them share an index.
                idx = next_idx
                next_idx += 1
            elif raw_id in seen:
                idx = seen[raw_id]
            else:
                idx = next_idx
                seen[raw_id] = idx
                next_idx += 1
            mapped.append((idx, raw_id, name))
    else:
        mapped = [(i, raw_id, name) for i, (raw_id, name) in enumerate(rows)]

    df = pd.DataFrame(mapped, columns=["class_idx", "raw_id", "class_name"])
    df["class_name"] = df["class_name"].astype(str).str.strip()
    return df

# Read the class list; undecodable bytes are replaced instead of raising.
with open(paths["classes"], "r", encoding="utf-8", errors="replace") as f:
    class_lines = [l.rstrip("\n") for l in f]

classes_df = parse_classes(class_lines)

# Sanity checks: every row should have an id, and indices/names should be unique.
print("classes_df shape:", classes_df.shape)
print("raw_id null count:", classes_df["raw_id"].isna().sum())
print("duplicate class_idx:", classes_df["class_idx"].duplicated().sum())
print("duplicate class_name:", classes_df["class_name"].duplicated().sum())
display(classes_df.head(10))
display(classes_df.tail(10))

# Expectation: confirm there are 531 classes (per the assignment spec).
print("num_classes:", len(classes_df))


classes_df shape: (531, 3)
raw_id null count: 0
duplicate class_idx: 0
duplicate class_name: 0


Unnamed: 0,class_idx,raw_id,class_name
0,0,0,grocery_gourmet_food
1,1,1,meat_poultry
2,2,2,jerky
3,3,3,toys_games
4,4,4,games
5,5,5,puzzles
6,6,6,jigsaw_puzzles
7,7,7,board_games
8,8,8,beverages
9,9,9,juices


Unnamed: 0,class_idx,raw_id,class_name
521,521,521,spices_gifts
522,522,522,dried_fruit
523,523,523,flying_toys
524,524,524,shampoo
525,525,525,coatings_batters
526,526,526,hydrometers
527,527,527,lamb
528,528,528,exercise_wheels
529,529,529,chocolate_covered_nuts
530,530,530,breeding_tanks


num_classes: 531


In [None]:
# raw_id -> class_idx mapping (built only when every class row has a raw id).
# It stays an empty dict otherwise; the edge-mapping code below checks its
# truthiness to decide whether hierarchy ids need translating.
rawid_to_idx = {}
if classes_df["raw_id"].notna().all():
    rawid_to_idx = {int(r.raw_id): int(r.class_idx) for r in classes_df.itertuples()}

def parse_edges(lines):
    """Extract the first two fields of every hierarchy line as a string pair.

    Lines are stripped, then split on runs of tabs, commas and/or spaces.
    Blank lines and lines with fewer than two fields are ignored; any fields
    after the second are dropped. No numeric conversion happens here.

    Args:
        lines: Iterable of raw text lines.

    Returns:
        List of ``(first_field, second_field)`` string tuples, in input order.
    """
    pairs = []
    for raw in lines:
        stripped = raw.strip()
        fields = re.split(r"[\t, ]+", stripped) if stripped else []
        if len(fields) >= 2:
            pairs.append((fields[0], fields[1]))
    return pairs

# Read the hierarchy file; the first field of each line is treated as the parent
# and the second as the child (see the adjacency construction below).
with open(paths["hier"], "r", encoding="utf-8", errors="replace") as f:
    hier_lines = [l.rstrip("\n") for l in f]

edges_raw = parse_edges(hier_lines)
print("raw edges:", len(edges_raw), "sample:", edges_raw[:5])

# Convert the string pairs into integer (parent_idx, child_idx) edges.
# `unmapped` counts pairs that are non-numeric or reference an unknown raw id.
edges = []
unmapped = 0
for a,b in edges_raw:
    try:
        pa = int(a); ch = int(b)
    except ValueError:
        # Only a failed int() parse is expected here. A bare `except:` would
        # also hide KeyboardInterrupt and genuine programming errors.
        unmapped += 1
        continue
    if rawid_to_idx:
        # Translate raw file ids into contiguous class indices.
        if pa in rawid_to_idx and ch in rawid_to_idx:
            edges.append((rawid_to_idx[pa], rawid_to_idx[ch]))
        else:
            unmapped += 1
    else:
        # No raw-id mapping available: use the ids from the file as-is.
        edges.append((pa, ch))

print("mapped edges:", len(edges), "unmapped:", unmapped)
print("mapped sample:", edges[:5])

from collections import defaultdict
# Adjacency lists in both directions, keyed by class index.
children = defaultdict(list)
parents  = defaultdict(list)
for p,c in edges:
    children[p].append(c)
    parents[c].append(p)

# Root candidates are nodes that never appear as a child.
all_nodes = set(classes_df["class_idx"].tolist())
has_parent = set(parents.keys())
roots = sorted(list(all_nodes - has_parent))

print("num nodes:", len(all_nodes))
print("num roots candidate:", len(roots))
print("roots sample:", roots[:15])


In [17]:
import re
from collections import defaultdict

# class_name -> class_idx lookup. If a class name repeated, the last row would
# win; the duplicate-name check in the class-loading cell reported 0 duplicates.
name_to_idx = {r.class_name: int(r.class_idx) for r in classes_df.itertuples()}

def parse_kw_by_name(lines, name_index=None):
    """Parse ``class_name:kw1,kw2,...`` lines into a class_idx -> keywords map.

    Args:
        lines: Iterable of raw text lines. Blank lines are skipped.
        name_index: Optional mapping of class name -> class index. When omitted
            (the original call style), the notebook-level ``name_to_idx``
            mapping is used, so existing callers are unaffected. Passing it
            explicitly removes the dependency on hidden notebook state.

    Returns:
        ``(kw_map, bad)`` where ``kw_map`` maps class index to the list of
        non-empty, stripped keywords in file order, and ``bad`` lists the
        stripped lines that had no ``:`` or named an unknown class. If a class
        appears on several lines, the last line wins.
    """
    if name_index is None:
        name_index = name_to_idx

    kw_map = {}
    bad = []
    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        if ":" not in s:
            bad.append(s)
            continue
        # Split on the first colon only, so keywords may themselves contain ':'.
        left, right = s.split(":", 1)
        cls_name = left.strip()
        if cls_name not in name_index:
            bad.append(s)
            continue
        # Keyword tokenization: comma-separated, empty tokens dropped.
        kws = [t.strip() for t in right.split(",") if t.strip()]
        kw_map[name_index[cls_name]] = kws
    return kw_map, bad

# Read the per-class keyword file; undecodable bytes are replaced instead of raising.
with open(paths["kw"], "r", encoding="utf-8", errors="replace") as f:
    kw_lines = [l.rstrip("\n") for l in f]

kw_map, bad_kw_lines = parse_kw_by_name(kw_lines)

print("KW mapped classes:", len(kw_map))
print("Bad kw lines:", len(bad_kw_lines))
print("Classes without keywords:", len(classes_df) - len(kw_map))

# Spot-check a few parsed entries.
for ci in list(kw_map.keys())[:8]:
    name = classes_df.loc[classes_df.class_idx==ci, "class_name"].iloc[0]
    print(f"[{ci}] {name} -> {kw_map[ci][:12]}")

# List some classes that have no keywords (for reporting/debugging).
no_kw = sorted(list(set(classes_df.class_idx) - set(kw_map.keys())))
print("No-keyword class_idx sample:", no_kw[:20])


KW mapped classes: 531
Bad kw lines: 0
Classes without keywords: 0
[0] grocery_gourmet_food -> ['snacks', 'condiments', 'beverages', 'specialty_foods', 'spices', 'cooking_oils', 'baking_ingredients', 'gourmet_chocolates', 'artisanal_cheeses', 'organic_foods']
[1] meat_poultry -> ['butcher', 'cuts', 'marination', 'grilling', 'roasting', 'seasoning', 'halal', 'organic', 'deli', 'marbling']
[2] jerky -> ['beef', 'turkey', 'chicken', 'venison', 'buffalo', 'kangaroo', 'elk', 'ostrich', 'bison', 'spicy']
[3] toys_games -> ['board_games', 'puzzles', 'action_figures', 'building_blocks', 'dolls', 'outdoor_toys', 'educational_toys', 'card_games', 'remote_control_toys', 'plush_toys']
[4] games -> ['board_games', 'card_games', 'tabletop_games', 'party_games', 'roleplaying_games', 'video_games', 'strategy_games', 'family_games', 'word_games', 'dice_games']
[5] puzzles -> ['jigsaw_puzzles', 'brain_teasers', 'puzzle_accessories', 'puzzle_storage', 'puzzle_mats', 'puzzle_glue', 'puzzle_organizers', 'p

In [15]:
def load_corpus(path, n_probe=200):
    """Load a one-document-per-line corpus into a DataFrame, sniffing the delimiter.

    The first ``n_probe`` lines are inspected to decide whether documents look
    like ``<id><delim><text>``. Tab is chosen when the share of probe lines
    containing a tab is at least 0.2 and not below the comma share; otherwise
    comma is chosen when its share is at least 0.2; otherwise no delimiter.

    Every non-blank line then becomes one row. If the line contains the chosen
    delimiter, the part before its first occurrence is purely
    ``[A-Za-z0-9_-]`` characters, and the remainder is non-empty, the two
    parts become ``doc_id`` and ``text``. Otherwise the whole line is the text
    and the id is ``auto_<zero-padded 0-based line number>``.

    Args:
        path: Corpus file path (read as UTF-8, undecodable bytes replaced).
        n_probe: Number of leading lines used for delimiter detection.

    Returns:
        ``(df, probe_info)``: ``df`` has columns ``doc_id``, ``text`` and
        ``text_len``; ``probe_info`` holds ``tab_ratio``, ``comma_ratio`` and
        ``chosen_delim``.
    """
    # Pass 1: sample the head of the file for delimiter statistics.
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        probe = [line.rstrip("\n") for _, line in zip(range(n_probe), fh)]

    if probe:
        tab_ratio = sum("\t" in line for line in probe) / len(probe)
        comma_ratio = sum("," in line for line in probe) / len(probe)
    else:
        tab_ratio = 0.0
        comma_ratio = 0.0

    if tab_ratio >= max(comma_ratio, 0.2):
        delim = "\t"
    elif comma_ratio >= 0.2:
        delim = ","
    else:
        delim = None

    # Pass 2: turn every non-blank line into a (doc_id, text) row.
    id_pattern = re.compile(r"^[A-Za-z0-9_\-]+$")
    rows = []
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for line_no, raw in enumerate(fh):
            line = raw.rstrip("\n")
            if not line.strip():
                continue
            # Default: synthetic id, whole line as text.
            doc_id, text = f"auto_{line_no:08d}", line
            if delim and delim in line:
                head, tail = (part.strip() for part in line.split(delim, 1))
                if id_pattern.match(head) and len(tail) > 0:
                    doc_id, text = head, tail
            rows.append((doc_id, text))

    df = pd.DataFrame(rows, columns=["doc_id", "text"])
    df["text_len"] = df["text"].astype(str).str.len()
    probe_info = {"tab_ratio": tab_ratio, "comma_ratio": comma_ratio, "chosen_delim": delim}
    return df, probe_info

# Load both corpora; the probe dicts record which delimiter was detected.
train_df, train_probe = load_corpus(paths["train"])
test_df,  test_probe  = load_corpus(paths["test"])

# Shape, delimiter statistics and text-length summary for each split.
print("train:", train_df.shape, train_probe)
print("  len(min/mean/max):", int(train_df.text_len.min()), float(train_df.text_len.mean()), int(train_df.text_len.max()))
print("test :", test_df.shape, test_probe)
print("  len(min/mean/max):", int(test_df.text_len.min()), float(test_df.text_len.mean()), int(test_df.text_len.max()))

# doc_id is used as a join key later, so duplicates here would be a problem.
print("train duplicate ids:", int(train_df.doc_id.duplicated().sum()))
print("test duplicate ids :", int(test_df.doc_id.duplicated().sum()))

display(train_df.head(5))
display(test_df.head(5))


train: (29487, 3) {'tab_ratio': 1.0, 'comma_ratio': 0.82, 'chosen_delim': '\t'}
  len(min/mean/max): 94 472.42961304981856 8147
test : (19658, 3) {'tab_ratio': 1.0, 'comma_ratio': 0.76, 'chosen_delim': '\t'}
  len(min/mean/max): 81 473.1993590395768 6797
train duplicate ids: 0
test duplicate ids : 0


Unnamed: 0,doc_id,text,text_len
0,0,omron hem 790it automatic blood pressure monit...,305
1,1,natural factors whey factors chocolate works w...,192
2,2,"clif bar builder 's bar , 2 . 4 ounce bars i l...",561
3,3,andis 1875 watt professional ceramic ionic hai...,367
4,4,clif bar energy bars these were cheaper than w...,638


Unnamed: 0,doc_id,text,text_len
0,0,conair cs15tcs professional straight styles st...,501
1,1,barbie ballet shoes icon doll i was looking ro...,479
2,2,cloud b twilight constellation night light i b...,730
3,3,alessi zuppa toscana tuscan white bean soup ( ...,345
4,4,swedish beauty amaretto tanning lotion advance...,540


In [18]:
# Headline counts from Step 1 (classes, hierarchy edges, classes with keywords).
NUM_CLASSES = len(classes_df)
NUM_EDGES = len(edges)
KW_COVERAGE = len(kw_map)

print("NUM_CLASSES:", NUM_CLASSES)
print("NUM_EDGES:", NUM_EDGES)
print("KW_COVERAGE(classes with keywords):", KW_COVERAGE)

# Single summary dict of Step 1; paths are stringified so the dict prints cleanly.
step1 = {
    "paths": {k: str(v) for k,v in paths.items()},
    "num_classes": NUM_CLASSES,
    "num_edges": NUM_EDGES,
    "kw_coverage": KW_COVERAGE,
    "num_train": len(train_df),
    "num_test": len(test_df),
}
print(step1)


NUM_CLASSES: 531
NUM_EDGES: 568
KW_COVERAGE(classes with keywords): 531
{'paths': {'classes': '/home/sagemaker-user/project_release/Amazon_products/classes.txt', 'hier': '/home/sagemaker-user/project_release/Amazon_products/class_hierarchy.txt', 'kw': '/home/sagemaker-user/project_release/Amazon_products/class_related_keywords.txt', 'train': '/home/sagemaker-user/project_release/Amazon_products/train/train_corpus.txt', 'test': '/home/sagemaker-user/project_release/Amazon_products/test/test_corpus.txt'}, 'num_classes': 531, 'num_edges': 568, 'kw_coverage': 531, 'num_train': 29487, 'num_test': 19658}


### Step 2: 클래스 라벨 문서 만들기 (추후 silver label 생성에 사용할 라벨 설명 텍스트를 만드는 것이 목표)

In [20]:
# Step 2 config
TOP_KW = 12              # number of keywords to use per class (too many adds noise)
USE_PRETTY_NAME = True   # convert underscore -> space (helps embedding matching)
INCLUDE_PARENT_PATH = True  # weakly inject the taxonomy into the label text (optional)

# Check that the Step 1 outputs exist in the notebook namespace.
assert "classes_df" in globals()
assert "kw_map" in globals()
assert "edges" in globals()
assert "train_df" in globals()
assert "test_df" in globals()

print("TOP_KW:", TOP_KW, "| USE_PRETTY_NAME:", USE_PRETTY_NAME, "| INCLUDE_PARENT_PATH:", INCLUDE_PARENT_PATH)

TOP_KW: 12 | USE_PRETTY_NAME: True | INCLUDE_PARENT_PATH: True


In [21]:
from collections import defaultdict

# Rebuild the adjacency lists from `edges` (pairs of parent idx, child idx).
parents = defaultdict(list)
children = defaultdict(list)
for p, c in edges:
    children[p].append(c)
    parents[c].append(p)

# Root candidates: nodes that have no parent.
all_nodes = set(classes_df["class_idx"].tolist())
roots = sorted(list(all_nodes - set(parents.keys())))
print("roots:", roots)

# Parent chain: a class may have several parents, so a simple rule picks one
# representative path. Rule: with multiple parents, follow the smallest index
# upward (a deterministic baseline).
def get_parent_chain(ci, max_hops=50):
    """Walk upward from ``ci`` through the module-level ``parents`` mapping.

    At each step the smallest parent index is followed. The walk ends when the
    current node has no recorded parents or after ``max_hops`` steps, whichever
    comes first; the hop limit is what bounds the walk if the graph has a cycle.

    Returns:
        Ancestors ordered from nearest to farthest: ``[parent, grandparent, ...]``.
    """
    chain = []
    node = ci
    for _ in range(max_hops):
        if node not in parents or len(parents[node]) == 0:
            break
        node = min(parents[node])
        chain.append(node)
    return chain

# Spot-check a few classes: direct parents and the chosen ancestor chain.
for ci in [0, 1, 8, 23]:
    print(ci, "parents:", parents.get(ci, []), "chain:", get_parent_chain(ci))


roots: [0, 3, 10, 23, 40, 169]
0 parents: [] chain: []
1 parents: [0] chain: [0]
8 parents: [0] chain: [0]
23 parents: [] chain: []


In [22]:
def pretty(name: str) -> str:
    """Return ``name`` with underscores turned into spaces when USE_PRETTY_NAME is set.

    With the module-level ``USE_PRETTY_NAME`` flag falsy, the name is returned
    unchanged.
    """
    if USE_PRETTY_NAME:
        return name.replace("_", " ")
    return name

# class_idx -> class_name
idx_to_name = {int(r.class_idx): r.class_name for r in classes_df.itertuples()}

label_text = {}  # class_idx -> descriptive text for that label
label_meta = []  # one record per class, turned into a DataFrame below

# Iterating range(len(classes_df)) relies on class_idx being exactly 0..N-1.
for ci in range(len(classes_df)):
    cname = idx_to_name[ci]
    kws = kw_map.get(ci, [])
    kws = kws[:TOP_KW]

    # Parent-path text (optional).
    path_txt = ""
    if INCLUDE_PARENT_PATH:
        chain = get_parent_chain(ci)
        # chain runs from the parent upward; reverse it into the
        # human-readable root > ... > parent order.
        chain_names = [pretty(idx_to_name[x]) for x in reversed(chain)]
        if chain_names:
            path_txt = " | path: " + " > ".join(chain_names)

    # NOTE(review): only the class name and path go through pretty(); keywords are
    # joined as-is, so they keep their underscores even when USE_PRETTY_NAME is
    # True. Confirm whether that is intended.
    text = f"{pretty(cname)}. keywords: {', '.join(kws)}{path_txt}"
    label_text[ci] = text

    label_meta.append({
        "class_idx": ci,
        "class_name": cname,
        "num_keywords_used": len(kws),
        "label_text": text,
        "has_path": bool(path_txt),
    })

label_df = pd.DataFrame(label_meta)

print("label_df shape:", label_df.shape)
print("empty label_text:", int((label_df["label_text"].str.len() == 0).sum()))
display(label_df.head(8))
display(label_df.sample(8, random_state=0))


label_df shape: (531, 5)
empty label_text: 0


Unnamed: 0,class_idx,class_name,num_keywords_used,label_text,has_path
0,0,grocery_gourmet_food,10,"grocery gourmet food. keywords: snacks, condim...",False
1,1,meat_poultry,10,"meat poultry. keywords: butcher, cuts, marinat...",True
2,2,jerky,10,"jerky. keywords: beef, turkey, chicken, veniso...",True
3,3,toys_games,10,"toys games. keywords: board_games, puzzles, ac...",False
4,4,games,10,"games. keywords: board_games, card_games, tabl...",True
5,5,puzzles,10,"puzzles. keywords: jigsaw_puzzles, brain_tease...",True
6,6,jigsaw_puzzles,10,"jigsaw puzzles. keywords: interlocking_pieces,...",True
7,7,board_games,10,"board games. keywords: board_game_accessories,...",True


Unnamed: 0,class_idx,class_name,num_keywords_used,label_text,has_path
409,409,bars,10,"bars. keywords: protein_bars, energy_bars, gra...",True
438,438,halva,10,"halva. keywords: sesame, tahini, middle_easter...",True
268,268,slot_cars,10,"slot cars. keywords: racing_tracks, digital_sl...",True
196,196,joggers,10,"joggers. keywords: running, fitness, outdoors,...",True
450,450,memorials,10,"memorials. keywords: grave_markers, urns, memo...",True
283,283,bathroom_aids_safety,10,"bathroom aids safety. keywords: shower_chairs,...",True
281,281,training_behavior_aids,10,"training behavior aids. keywords: clickers, le...",True
15,15,action_toy_figures,10,action toy figures. keywords: poseable_figures...,True


In [23]:
# Adds a column to label_df in place; the CSV written by the save cell below
# therefore includes text_len.
label_df["text_len"] = label_df["label_text"].str.len()

print("label_text len(min/mean/max):",
      int(label_df.text_len.min()),
      float(label_df.text_len.mean()),
      int(label_df.text_len.max()))

print("keywords used(min/mean/max):",
      int(label_df.num_keywords_used.min()),
      float(label_df.num_keywords_used.mean()),
      int(label_df.num_keywords_used.max()))

# Eyeball a few label texts (each cut to 400 characters).
for ci in [0, 1, 2, 3, 8, 169, 530]:
    print("\n--- class_idx", ci, "|", idx_to_name[ci], "---")
    print(label_text[ci][:400])


label_text len(min/mean/max): 131 220.58757062146893 350
keywords used(min/mean/max): 1 9.932203389830509 10

--- class_idx 0 | grocery_gourmet_food ---
grocery gourmet food. keywords: snacks, condiments, beverages, specialty_foods, spices, cooking_oils, baking_ingredients, gourmet_chocolates, artisanal_cheeses, organic_foods

--- class_idx 1 | meat_poultry ---
meat poultry. keywords: butcher, cuts, marination, grilling, roasting, seasoning, halal, organic, deli, marbling | path: grocery gourmet food

--- class_idx 2 | jerky ---
jerky. keywords: beef, turkey, chicken, venison, buffalo, kangaroo, elk, ostrich, bison, spicy | path: grocery gourmet food > meat poultry

--- class_idx 3 | toys_games ---
toys games. keywords: board_games, puzzles, action_figures, building_blocks, dolls, outdoor_toys, educational_toys, card_games, remote_control_toys, plush_toys

--- class_idx 8 | beverages ---
beverages. keywords: coffee, tea, energy_drinks, soft_drinks, bottled_water, juices, sports_drinks,

In [24]:
from pathlib import Path
import json

# Output directory for artifacts, relative to the current working directory.
ART = Path("artifacts")
ART.mkdir(exist_ok=True)

# label_text has integer keys; the json module writes dict keys as strings, so a
# consumer that reloads this file must convert the keys back to int.
with open(ART / "label_text.json", "w", encoding="utf-8") as f:
    json.dump(label_text, f, ensure_ascii=False, indent=2)

label_df.to_csv(ART / "label_text_table.csv", index=False, encoding="utf-8")

print("Saved:")
print(" -", ART / "label_text.json")
print(" -", ART / "label_text_table.csv")


Saved:
 - artifacts/label_text.json
 - artifacts/label_text_table.csv


### Step 3: Pseudo label (silver label) 생성
##### unlabeled data인 train_df에 대해 TF-IDF 기반으로 각 문서에 2~3개의 클래스 라벨을 자동 부여하고, 신뢰도를 계산해 저장

In [26]:
import numpy as np
import pandas as pd
from pathlib import Path

# Check that the Step 1 / Step 2 outputs exist in the notebook namespace.
assert "label_text" in globals(), "Step2 label_text가 없습니다."
assert "label_df" in globals(), "Step2 label_df가 없습니다."
assert "train_df" in globals(), "Step1 train_df가 없습니다."
assert "classes_df" in globals(), "Step1 classes_df가 없습니다."
assert "edges" in globals(), "Step1 edges가 없습니다."

# Settings
TOPK_CAND = 30          # number of candidate labels kept per document (30 is plenty to start)
FORCE_MIN_LABELS = 2
FORCE_MAX_LABELS = 3

# Rule parameters deciding whether a third label is assigned.
# NOTE: THRESH_THIRD and MARGIN_23 are reassigned to tuned values in the
# label-selection cell further down; the values here are initial placeholders.
THRESH_THIRD = 0.55     # allow 3 labels if the 3rd score is at least this (TF-IDF cosine; tune per dataset)
MARGIN_23 = 0.05        # allow 3 labels if the 2nd and 3rd scores differ by no more than this

# Sampling (for quick checks): run on everything first, lower this to debug.
DOC_LIMIT = None        # e.g. 5000 for a trial run; None means all documents

print("Step3 Config:")
print(" TOPK_CAND:", TOPK_CAND)
print(" labels per doc:", FORCE_MIN_LABELS, "~", FORCE_MAX_LABELS)
print(" THRESH_THIRD:", THRESH_THIRD, "| MARGIN_23:", MARGIN_23)
print(" DOC_LIMIT:", DOC_LIMIT)


Step3 Config:
 TOPK_CAND: 30
 labels per doc: 2 ~ 3
 THRESH_THIRD: 0.55 | MARGIN_23: 0.05
 DOC_LIMIT: None


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Prepare the documents.
docs = train_df["text"].astype(str).tolist()
doc_ids = train_df["doc_id"].astype(str).tolist()

if DOC_LIMIT is not None:
    docs = docs[:DOC_LIMIT]
    doc_ids = doc_ids[:DOC_LIMIT]

# Prepare the label texts, ordered by class index (relies on keys 0..N-1).
label_texts = [label_text[i] for i in range(len(classes_df))]

# TF-IDF settings (conservative).
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.9,
    max_features=200000,
)

# The vocabulary and IDF weights are learned from the documents only; the label
# texts are projected into that same space, so label terms that never made it
# into the document vocabulary contribute nothing.
X_doc = vectorizer.fit_transform(docs)          # (N_docs, V)
X_label = vectorizer.transform(label_texts)     # (531, V)

# L2-normalize so that a dot product equals cosine similarity.
# NOTE(review): TfidfVectorizer's documented default is norm="l2", so this is
# presumably a harmless safeguard rather than a required step; confirm.
X_doc = normalize(X_doc)
X_label = normalize(X_label)

print("TFIDF shapes:")
print(" X_doc:", X_doc.shape)
print(" X_label:", X_label.shape)
print(" vocab size:", len(vectorizer.vocabulary_))


TFIDF shapes:
 X_doc: (29487, 200000)
 X_label: (531, 200000)
 vocab size: 200000


In [41]:
import numpy as np

N = X_doc.shape[0]
C = X_label.shape[0]

# Batched computation: the full N x 531 similarity matrix would fit in memory,
# but to be safe we process chunks and keep only the top-k per document.
CHUNK = 2000

# Per document: the TOPK_CAND best class indices and their scores, best first.
topk_idx = np.zeros((N, TOPK_CAND), dtype=np.int32)
topk_score = np.zeros((N, TOPK_CAND), dtype=np.float32)

for start in range(0, N, CHUNK):
    end = min(N, start + CHUNK)
    # (chunk, V) dot (V, C) -> (chunk, C)
    sim = X_doc[start:end].dot(X_label.T).toarray()  # cosine similarity (rows were L2-normalized above)
    # Top-k: partitioning the negated scores puts the TOPK_CAND largest
    # similarities in the first TOPK_CAND columns, in no particular order.
    idx_part = np.argpartition(-sim, TOPK_CAND-1, axis=1)[:, :TOPK_CAND]
    score_part = np.take_along_axis(sim, idx_part, axis=1)

    # Sort within the top-k, descending by score.
    order = np.argsort(-score_part, axis=1)
    idx_sorted = np.take_along_axis(idx_part, order, axis=1)
    score_sorted = np.take_along_axis(score_part, order, axis=1)

    topk_idx[start:end] = idx_sorted.astype(np.int32)
    topk_score[start:end] = score_sorted.astype(np.float32)

    # One-off diagnostics for the first chunk only.
    if start == 0:
        print("First chunk debug:")
        print(" sim min/mean/max:", float(sim.min()), float(sim.mean()), float(sim.max()))
        print(" top1 score sample:", topk_score[0,0], "top3:", topk_score[0,2], "top10:", topk_score[0,9])

print("Done. topk_idx/topk_score shapes:", topk_idx.shape, topk_score.shape)


First chunk debug:
 sim min/mean/max: 0.0 0.001785475945790756 0.26322913554963845
 top1 score sample: 0.08823033 top3: 0.059450895 top10: 0.04230768
Done. topk_idx/topk_score shapes: (29487, 30) (29487, 30)


In [45]:
import numpy as np
import pandas as pd
from collections import Counter
from IPython.display import display

# Final tuned values. These overwrite the initial THRESH_THIRD / MARGIN_23 set
# in the Step 3 config cell.
THRESH_THIRD = 0.06866627186536789
MARGIN_23    = 0.003

def select_labels(scores, idxs, min_k=2, max_k=3,
                  thresh_third=0.0687, margin_23=0.003):
    """Choose which of a document's top-ranked classes become its labels.

    ``scores`` and ``idxs`` are parallel sequences sorted best-first and must
    hold at least three entries. The top two classes are always taken. The
    third is added (when ``max_k >= 3``) if its score is at least
    ``thresh_third`` or it trails the second score by no more than
    ``margin_23``. If fewer than ``min_k`` labels result, the first ``min_k``
    indices are used instead. The result is finally cut to ``max_k`` entries.

    Returns:
        List of Python ints (class indices), best first.
    """
    second = float(scores[1])
    third = float(scores[2])

    picked = [int(idxs[pos]) for pos in (0, 1)]

    third_qualifies = third >= thresh_third or second - third <= margin_23
    if max_k >= 3 and third_qualifies:
        picked.append(int(idxs[2]))

    if len(picked) < min_k:
        picked = [int(v) for v in idxs[:min_k]]
    return picked[:max_k]

N = topk_idx.shape[0]
silver_labels = []
conf_list = []

# Best four similarity scores per document, as float64 columns.
top1 = topk_score[:,0].astype(float)
top2 = topk_score[:,1].astype(float)
top3 = topk_score[:,2].astype(float)
top4 = topk_score[:,3].astype(float)

for i in range(N):
    # min_k / max_k are passed as literals here rather than taken from
    # FORCE_MIN_LABELS / FORCE_MAX_LABELS.
    labs = select_labels(topk_score[i], topk_idx[i],
                         min_k=2, max_k=3,
                         thresh_third=THRESH_THIRD,
                         margin_23=MARGIN_23)
    silver_labels.append(labs)

    # Confidence (simple): mean of the top-2 scores + 0.25 * positive part of (top2 - top4).
    conf = (top1[i] + top2[i]) / 2.0 + 0.25 * max(0.0, (top2[i] - top4[i]))
    conf_list.append(float(conf))

silver_df = pd.DataFrame({
    "doc_id": train_df["doc_id"].astype(str).tolist()[:N],
    "labels": silver_labels,
    "confidence": conf_list,
    "top1": top1,
    "top2": top2,
    "top3": top3,
})

# (1) Distribution of 2-label vs 3-label documents.
print("labels per doc distribution:")
print(silver_df["labels"].apply(len).value_counts().sort_index())

# (2) Confidence statistics.
print("\nconfidence describe:")
print(silver_df["confidence"].describe())

# (3) Most frequent labels (checks for skew toward a few classes).
cnt = Counter()
for labs in silver_labels:
    cnt.update(labs)

top15 = cnt.most_common(15)
print("\nTop 15 frequent labels:")
for ci, ncnt in top15:
    cname = classes_df.loc[classes_df.class_idx==ci, "class_name"].iloc[0]
    print(f"{ci:3d} {cname:30s} -> {ncnt}")

display(silver_df.head(8))

# The high-confidence threshold is chosen as a percentile of the confidence scores.
# np.percentile(x, 60) is the value below which about 60% of documents fall, so
# `confidence >= conf_thr` flags roughly the TOP 40% (the recorded run reports
# 11795 of 29487 documents).
# NOTE(review): the original note here recommended "use only the top 60% for
# training (less noise); up to 70% if needed". With CONF_PCTL = 60 the code does
# not do that; CONF_PCTL = 40 would keep the top 60%. Confirm the intent.
CONF_PCTL = 60
conf_thr = float(np.percentile(silver_df["confidence"].values, CONF_PCTL))

silver_df["is_high_conf"] = (silver_df["confidence"] >= conf_thr)

print(f"\n[High-Conf] percentile={CONF_PCTL} -> threshold={conf_thr:.6f}")
print("high-conf count:", int(silver_df["is_high_conf"].sum()), "/", len(silver_df))

# Also check the 2/3-label distribution within the high-confidence subset.
print("\nlabels per doc distribution (high-conf subset):")
print(silver_df.loc[silver_df.is_high_conf, "labels"].apply(len).value_counts().sort_index())

# Check label skew within the high-confidence subset.
from collections import Counter
cnt_h = Counter()
for labs in silver_df.loc[silver_df.is_high_conf, "labels"]:
    cnt_h.update(labs)

print("\nTop 10 frequent labels (high-conf subset):")
for ci, ncnt in cnt_h.most_common(10):
    cname = classes_df.loc[classes_df.class_idx==ci, "class_name"].iloc[0]
    print(f"{ci:3d} {cname:30s} -> {ncnt}")



labels per doc distribution:
2    12764
3    16723
Name: labels, dtype: int64

confidence describe:
count    29487.000000
mean         0.052508
std          0.034802
min          0.000000
25%          0.029144
50%          0.045683
75%          0.068086
max          0.389068
Name: confidence, dtype: float64

Top 15 frequent labels:
 90 hair_color                     -> 1543
 30 electronics_for_kids           -> 1506
104 electronic_pets                -> 1451
350 sports_supplements             -> 1001
 15 action_toy_figures             -> 974
461 hair_relaxers                  -> 971
346 milk                           -> 961
473 hair_perms_texturizers         -> 914
220 fragrance                      -> 863
344 chocolate_truffles             -> 698
 40 baby_products                  -> 698
379 toy_gift_sets                  -> 651
435 game_room_games                -> 650
166 accessories                    -> 613
177 cameras_camcorders             -> 612


Unnamed: 0,doc_id,labels,confidence,top1,top2,top3
0,0,"[137, 87]",0.084811,0.08823,0.072477,0.059451
1,1,"[266, 276, 490]",0.054418,0.053876,0.053649,0.051482
2,2,"[409, 381]",0.05709,0.05545,0.053065,0.045784
3,3,"[90, 461, 473]",0.065674,0.066375,0.063531,0.063531
4,4,"[308, 376, 449]",0.025899,0.028305,0.022843,0.022631
5,5,"[43, 173, 145]",0.020517,0.031117,0.009747,0.009734
6,6,"[158, 210]",0.056686,0.066117,0.041233,0.029989
7,7,"[350, 346, 344]",0.0,0.0,0.0,0.0



[High-Conf] percentile=60 -> threshold=0.053064
high-conf count: 11795 / 29487

labels per doc distribution (high-conf subset):
2    6208
3    5587
Name: labels, dtype: int64

Top 10 frequent labels (high-conf subset):
 90 hair_color                     -> 666
461 hair_relaxers                  -> 473
473 hair_perms_texturizers         -> 459
220 fragrance                      -> 452
 40 baby_products                  -> 373
435 game_room_games                -> 355
  8 beverages                      -> 327
127 car_seat_stroller_toys         -> 292
294 game_collections               -> 286
142 cloth_diapers                  -> 280


In [46]:
from pathlib import Path
import json

ART = Path("artifacts")
ART.mkdir(exist_ok=True)

# 1) Save every document's silver labels (one JSON object per line).
out_all = ART / "silver_train_tfidf_all.jsonl"
silver_df.to_json(out_all, orient="records", lines=True, force_ascii=False)

# 2) Save the high-confidence subset.
out_hc = ART / "silver_train_tfidf_highconf.jsonl"
silver_high = silver_df[silver_df["is_high_conf"]].copy()
silver_high.to_json(out_hc, orient="records", lines=True, force_ascii=False)

# 3) Save run metadata (for reproducibility).
# The "vectorizer" entry repeats the TfidfVectorizer arguments by hand, so it
# must be kept in sync with the cell that constructs the Step 3 vectorizer.
meta = {
    "method": "tfidf_label_text_cosine",
    "TOPK_CAND": TOPK_CAND,
    "labels_per_doc": [FORCE_MIN_LABELS, FORCE_MAX_LABELS],
    "THRESH_THIRD": THRESH_THIRD,
    "MARGIN_23": MARGIN_23,
    "DOC_LIMIT": DOC_LIMIT,
    "vectorizer": {
        "ngram_range": (1,2),
        "min_df": 2,
        "max_df": 0.9,
        "max_features": 200000,
    },
    "high_conf": {
        "percentile": int(CONF_PCTL),
        "threshold": float(conf_thr),
        "count": int(len(silver_high)),
        "ratio": float(len(silver_high) / len(silver_df)),
    },
    "num_docs": int(len(silver_df)),
    "num_classes": int(len(classes_df)),
}
with open(ART / "silver_train_tfidf_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved:")
print(" -", out_all)
print(" -", out_hc)
print(" -", ART / "silver_train_tfidf_meta.json")

print("\nHigh-conf preview:")
display(silver_high.head(8))


Saved:
 - artifacts/silver_train_tfidf_all.jsonl
 - artifacts/silver_train_tfidf_highconf.jsonl
 - artifacts/silver_train_tfidf_meta.json

High-conf preview:


Unnamed: 0,doc_id,labels,confidence,top1,top2,top3,is_high_conf
0,0,"[137, 87]",0.084811,0.08823,0.072477,0.059451,True
1,1,"[266, 276, 490]",0.054418,0.053876,0.053649,0.051482,True
2,2,"[409, 381]",0.05709,0.05545,0.053065,0.045784,True
3,3,"[90, 461, 473]",0.065674,0.066375,0.063531,0.063531,True
6,6,"[158, 210]",0.056686,0.066117,0.041233,0.029989,True
8,8,"[129, 195, 130]",0.194676,0.208124,0.15471,0.102777,True
17,17,"[524, 448, 64]",0.100061,0.103397,0.089236,0.078065,True
23,23,"[435, 294]",0.074757,0.071439,0.065224,0.061182,True


### Step 4: Text classification model training

In [47]:
import pandas as pd
import numpy as np
from pathlib import Path

ART = Path("artifacts")
silver_path = ART / "silver_train_tfidf_highconf.jsonl"
assert silver_path.exists(), f"Missing: {silver_path}"

# NOTE(review): read_json may infer doc_id as an integer column; the astype(str)
# below turns it back into text. Presumably that only round-trips for ids
# without leading zeros; the missing_text check below would catch a mismatch.
silver = pd.read_json(silver_path, lines=True)
print("silver(highconf):", silver.shape)
display(silver.head(3))

# Join the text from train_df by doc_id.
# (train_df must already be loaded from Step 1.)
assert "train_df" in globals(), "train_df not found"
train_map = train_df[["doc_id","text"]].copy()
train_map["doc_id"] = train_map["doc_id"].astype(str)

silver["doc_id"] = silver["doc_id"].astype(str)
data = silver.merge(train_map, on="doc_id", how="left")

# A left join leaves text as NaN for any silver row whose doc_id has no match.
missing_text = data["text"].isna().sum()
print("merged data:", data.shape, "| missing_text:", int(missing_text))
if missing_text > 0:
    display(data[data["text"].isna()].head(5))
    raise ValueError("Some doc_id not found in train_df")

# Columns used for training.
X_text = data["text"].astype(str).tolist()
Y_labels = data["labels"].tolist()   # list of list[int]
# Rebinds `conf` (a per-document scalar in the Step 3 loop) to the confidence array.
conf = data["confidence"].astype(float).values

# Check the distribution of labels per document.
# Rebinds `cnt` (a Counter in Step 3) to a pandas Series.
cnt = pd.Series([len(x) for x in Y_labels]).value_counts().sort_index()
print("labels per doc (highconf):")
print(cnt)


silver(highconf): (11795, 7)


Unnamed: 0,doc_id,labels,confidence,top1,top2,top3,is_high_conf
0,0,"[137, 87]",0.084811,0.08823,0.072477,0.059451,True
1,1,"[266, 276, 490]",0.054418,0.053876,0.053649,0.051482,True
2,2,"[409, 381]",0.05709,0.05545,0.053065,0.045784,True


merged data: (11795, 8) | missing_text: 0
labels per doc (highconf):
2    6208
3    5587
dtype: int64


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Fixed seed and hold-out fraction for the Step4 train/validation split.
SEED = 42
VAL_RATIO = 0.1

# Split by position so texts and label lists stay aligned.
idx = np.arange(len(X_text))
tr_idx, va_idx = train_test_split(idx, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

X_tr = [X_text[i] for i in tr_idx]
X_va = [X_text[i] for i in va_idx]
Y_tr = [Y_labels[i] for i in tr_idx]
Y_va = [Y_labels[i] for i in va_idx]

print("split:", len(X_tr), len(X_va))

# TF-IDF (slightly more conservative than Step3 to damp the influence of rare tokens)
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    max_features=200000,
)

# The vocabulary is fit on the training split only; validation is transformed with it.
Xtr_vec = vectorizer.fit_transform(X_tr)
Xva_vec = vectorizer.transform(X_va)

print("TFIDF shapes:", Xtr_vec.shape, Xva_vec.shape, "| vocab:", len(vectorizer.vocabulary_))

# Multi-label lists -> multi-hot matrix with one column per class id.
# Falls back to 531 classes when classes_df is not in the kernel namespace.
NUM_CLASSES = len(classes_df) if "classes_df" in globals() else 531
mlb = MultiLabelBinarizer(classes=list(range(NUM_CLASSES)))
Ytr_bin = mlb.fit_transform(Y_tr)
Yva_bin = mlb.transform(Y_va)

print("Y bin shapes:", Ytr_bin.shape, Yva_bin.shape, "| classes:", len(mlb.classes_))
print("train label density (avg #labels):", Ytr_bin.sum(axis=1).mean())


split: 10615 1180
TFIDF shapes: (10615, 55968) (1180, 55968) | vocab: 55968
Y bin shapes: (10615, 531) (1180, 531) | classes: 531
train label density (avg #labels): 2.4740461610927933


In [49]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# LinearSVC is a strong baseline on large sparse text features.
# random_state pins the data shuffling used by the dual coordinate-descent
# solver, so re-running the notebook reproduces the same fitted model
# (SEED is set in the split cell above).
base_clf = LinearSVC(C=1.0, random_state=SEED)
# One binary classifier per class column of Ytr_bin, trained in parallel.
clf = OneVsRestClassifier(base_clf, n_jobs=-1)

print("Training...")
clf.fit(Xtr_vec, Ytr_bin)
print("Done.")


Training...




Done.


In [54]:
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Per-class decision scores for the validation documents.
scores = clf.decision_function(Xva_vec)
scores = np.asarray(scores)

# Indices and values of the three highest-scoring classes per document
idx_sorted = np.argsort(-scores, axis=1)
top1_idx = idx_sorted[:, 0]
top2_idx = idx_sorted[:, 1]
top3_idx = idx_sorted[:, 2]

top1 = scores[np.arange(scores.shape[0]), top1_idx]
top2 = scores[np.arange(scores.shape[0]), top2_idx]
top3 = scores[np.arange(scores.shape[0]), top3_idx]

# Gap between the 2nd and 3rd best score of each document.
gap23 = top2 - top3

# More stable (conservative) thresholds, taken from the validation score distribution
THRESH_THIRD = float(np.percentile(top3, 85))   # 85th percentile: only the top 15% of third-place scores reach it
MARGIN_23 = float(np.percentile(gap23, 75))     # 75th percentile: keeps the margin away from 0 and is more conservative

# Safety net: enforce a minimum margin when the percentile comes out too small
MARGIN_23 = max(MARGIN_23, 0.05)

print("thresholds (revised):")
print(" THRESH_THIRD:", THRESH_THIRD)
print(" MARGIN_23   :", MARGIN_23)
print("gap23 percentiles:", np.percentile(gap23, [50, 75, 90, 95]).tolist())

def predict_2or3(score_row, thresh_third, margin_23):
    """Pick the two best classes of one document, plus the third when it qualifies.

    Args:
        score_row: 1-D array of per-class scores for a single document.
        thresh_third: minimum score the third-ranked class must reach.
        margin_23: largest allowed gap between the 2nd and 3rd best score.

    Returns:
        List of two or three class indices (Python ints), best first.
    """
    ranked = np.argsort(-score_row)[:3]
    first, second, third = int(ranked[0]), int(ranked[1]), int(ranked[2])

    runner_up_score = float(score_row[second])
    third_score = float(score_row[third])

    # The third label is admitted only when it is strong in absolute terms
    # AND nearly tied with the runner-up.
    third_is_strong = third_score >= thresh_third
    third_is_close = (runner_up_score - third_score) <= margin_23
    if third_is_strong and third_is_close:
        return [first, second, third]
    return [first, second]

# Predict 2 or 3 labels per validation document. THRESH_THIRD and MARGIN_23
# were derived from these same validation scores in the statements above.
pred_labels = [predict_2or3(scores[i], THRESH_THIRD, MARGIN_23) for i in range(scores.shape[0])]

# Multi-hot encode the predictions in the same layout as Yva_bin.
Ypred_bin = np.zeros_like(Yva_bin)
for i, labs in enumerate(pred_labels):
    Ypred_bin[i, labs] = 1

# Yva_bin comes from the silver label file, so these F1 values measure
# agreement with the silver labels.
micro = f1_score(Yva_bin, Ypred_bin, average="micro", zero_division=0)
samples = f1_score(Yva_bin, Ypred_bin, average="samples", zero_division=0)
print("Val micro-F1  :", micro)
print("Val sample-F1 :", samples)

# How many documents received 2 vs 3 predicted labels.
dist = pd.Series([len(x) for x in pred_labels]).value_counts().sort_index()
print("\nPred labels per doc distribution:")
print(dist)

# Five examples (ground-truth labels vs prediction, with the first 200 characters of text)
for j in range(5):
    gt = Y_va[j]
    pr = pred_labels[j]
    print(f"\n[ex {j}] GT={gt} | PR={pr}")
    print(X_va[j][:200])


thresholds (revised):
 THRESH_THIRD: 0.0009821156265188567
 MARGIN_23   : 0.13019305185127453
gap23 percentiles: [0.0, 0.13019305185127453, 0.41966545089410423, 0.6925465777002563]
Val micro-F1  : 0.43772175536881425
Val sample-F1 : 0.42819209039548023

Pred labels per doc distribution:
2    1100
3      80
dtype: int64

[ex 0] GT=[137, 241] | PR=[241, 137]
master massage monroe salon size portable massage table , 30 inch i purchased this massage table primarily for home use ( we have a wonderful massage therapist who does out calls ) and this makes it e

[ex 1] GT=[271, 366] | PR=[383, 505]
popcorn nut salt 24 oz i have n't ordered this but i was reading some bad reviews saying it was just salt , not flavored , which is true . i also read that it is finer than regular store bought salt a

[ex 2] GT=[127, 152] | PR=[127, 152]
amazon . com somehow i lost my first go go babyz and had to order this one because i will not travel without it . it takes a little patience to fasten the carseat 

In [55]:
import joblib, json
from pathlib import Path

ART = Path("artifacts")
ART.mkdir(exist_ok=True)

# Bundle everything needed to score new text: vectorizer, classifier,
# label binarizer and the 2-vs-3 label thresholds.
model_path = ART / "step4_tfidf_linearSVC_ovr.joblib"
step4_thresholds = {"THRESH_THIRD": THRESH_THIRD, "MARGIN_23": MARGIN_23}
joblib.dump(
    {"vectorizer": vectorizer, "model": clf, "mlb": mlb, "thresholds": step4_thresholds},
    model_path,
)

# Count of validation documents per number of predicted labels.
labels_per_doc = pd.Series([len(labs) for labs in pred_labels]).value_counts().sort_index()

# Human-readable run summary stored next to the model.
meta = {
    "model": "OneVsRest(LinearSVC)",
    "tfidf": {"ngram_range": (1,2), "min_df": 3, "max_df": 0.9, "max_features": 200000},
    "val_ratio": VAL_RATIO,
    "seed": SEED,
    "thresholds": {key: float(value) for key, value in step4_thresholds.items()},
    "val_micro_f1": float(micro),
    "val_sample_f1": float(samples),
    "pred_label_dist": {str(k): int(v) for k, v in labels_per_doc.items()},
    "train_docs": int(len(X_tr)),
    "val_docs": int(len(X_va)),
}
meta_path = ART / "step4_model_meta.json"
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Saved:")
print(" -", model_path)
print(" -", meta_path)


Saved:
 - artifacts/step4_tfidf_linearSVC_ovr.joblib
 - artifacts/step4_model_meta.json


### Step5

In [56]:
import random, numpy as np
import torch
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score

# Reproducibility: seed Python, NumPy and torch (CPU and all CUDA devices).
SEED = 42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

# Load the Step3 high-confidence silver labels (more reliable training targets)
ART = Path("artifacts")
silver_path = ART / "silver_train_tfidf_highconf.jsonl"
silver = pd.read_json(silver_path, lines=True)

# Join the text from train_df
# NOTE(review): the merge does not validate that doc_id is unique in train_df;
# a duplicate would silently duplicate rows here - confirm.
train_map = train_df[["doc_id","text"]].copy()
train_map["doc_id"] = train_map["doc_id"].astype(str)
silver["doc_id"] = silver["doc_id"].astype(str)
data = silver.merge(train_map, on="doc_id", how="left")
assert data["text"].isna().sum() == 0

X_text = data["text"].astype(str).tolist()
Y_labels = data["labels"].tolist()

# Split (10% validation, same ratio and seed as Step4)
VAL_RATIO = 0.1
idx = np.arange(len(X_text))
tr_idx, va_idx = train_test_split(idx, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

X_tr = [X_text[i] for i in tr_idx]
X_va = [X_text[i] for i in va_idx]
Y_tr = [Y_labels[i] for i in tr_idx]
Y_va = [Y_labels[i] for i in va_idx]

NUM_CLASSES = len(classes_df)

# TF-IDF (same settings as Step4). This rebinds the name `vectorizer`
# that the Step4 cells also assign.
vectorizer = TfidfVectorizer(
    lowercase=True, ngram_range=(1,2),
    min_df=3, max_df=0.9, max_features=200000
)
Xtr_tfidf = vectorizer.fit_transform(X_tr)
Xva_tfidf = vectorizer.transform(X_va)

print("TFIDF:", Xtr_tfidf.shape, Xva_tfidf.shape, "| vocab:", len(vectorizer.vocabulary_))

# Dense document embeddings via SVD (so they can be dot-multiplied with GCN outputs)
EMB_DIM = 256
svd = TruncatedSVD(n_components=EMB_DIM, random_state=SEED)
Z_tr = svd.fit_transform(Xtr_tfidf)   # (Ntr, EMB_DIM)
Z_va = svd.transform(Xva_tfidf)       # (Nva, EMB_DIM)

print("Doc embedding:", Z_tr.shape, Z_va.shape)

# Initial class features: label_text through the same vectorizer, then into the same SVD space
# (label_text is defined outside this cell; it is indexed by class id here).
label_texts = [label_text[i] for i in range(NUM_CLASSES)]
Xc_tfidf = vectorizer.transform(label_texts)
Xc0 = svd.transform(Xc_tfidf)         # (NUM_CLASSES, EMB_DIM)
print("Class init feat:", Xc0.shape)

# Multi-label binarization. Rebinds `mlb`, `Ytr_bin` and `Yva_bin` from Step4;
# the matrices are float32 from here on.
mlb = MultiLabelBinarizer(classes=list(range(NUM_CLASSES)))
Ytr_bin = mlb.fit_transform(Y_tr).astype(np.float32)
Yva_bin = mlb.transform(Y_va).astype(np.float32)

print("Y bin:", Ytr_bin.shape, Yva_bin.shape, "| avg labels(train):", Ytr_bin.sum(axis=1).mean())

# ====== Taxonomy graph -> symmetrically normalized adjacency (A_hat) ======
# edges: list[(parent, child)] built in Step1
N = NUM_CLASSES
A = np.zeros((N, N), dtype=np.float32)

# Use the graph as undirected (parent and child linked both ways) to weaken
# directionality and keep message passing stable.
for p, c in edges:
    A[p, c] = 1.0
    A[c, p] = 1.0

# Add self-loops
A[np.arange(N), np.arange(N)] = 1.0

# A_hat = D^{-1/2} A D^{-1/2}; the floor guards against division by zero.
deg = A.sum(axis=1)
D_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(deg, 1e-8)))
A_hat = D_inv_sqrt @ A @ D_inv_sqrt   # (N,N)

# Every undirected edge fills two symmetric off-diagonal cells, so the number
# of undirected edges is half the non-zero off-diagonal count. The diagonal
# holds exactly N ones after the self-loop step, hence the "- N".
nonzero_offdiag = int((A > 0).sum() - N)
print("Graph:", "nodes", N, "| undirected edges:", nonzero_offdiag // 2,
      "(nonzero offdiag:", str(nonzero_offdiag) + ")")
print("A_hat stats:", float(A_hat.min()), float(A_hat.mean()), float(A_hat.max()))


TFIDF: (10615, 55968) (1180, 55968) | vocab: 55968
Doc embedding: (10615, 256) (1180, 256)
Class init feat: (531, 256)
Y bin: (10615, 531) (1180, 531) | avg labels(train): 2.4740462
Graph: nodes 531 | undirected edges (nonzero offdiag): 1136
A_hat stats: 0.0 0.001591013977304101 0.4999999701976776


In [63]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import f1_score

ART = Path("artifacts")

# Reload the Step4 artefacts from disk rather than relying on kernel variables.
bundle = joblib.load(ART / "step4_tfidf_linearSVC_ovr.joblib")
vec4, clf4, mlb4 = bundle["vectorizer"], bundle["model"], bundle["mlb"]

# Reuse the Step5 train/val split as-is: X_va, Y_va and Yva_bin from the first
# Step5 cell are assumed to exist already.
Xva_vec4 = vec4.transform(X_va)
scores_va = np.asarray(clf4.decision_function(Xva_vec4))  # (Nva, num_classes)

print(
    "scores_va:", scores_va.shape,
    "min/mean/max:", float(scores_va.min()), float(scores_va.mean()), float(scores_va.max()),
)


scores_va: (1180, 531) min/mean/max: -2.535507356382762 -1.0667796914498313 2.752199370737678


In [69]:
import numpy as np

# edges: (parent, child)
N = NUM_CLASSES

A_pc = np.zeros((N, N), dtype=np.float32)  # parent->child
for p, c in edges:
    A_pc[c, p] = 1.0  # NOTE: row = child, column = parent, so A_pc @ S carries the parent's score into the child

# Self-loop (each class keeps its own score)
A_pc[np.arange(N), np.arange(N)] = 1.0

# Row-normalize (each row becomes the mean over the node itself and its parents)
row_sum = A_pc.sum(axis=1, keepdims=True)
A_pc_hat = A_pc / np.maximum(row_sum, 1e-8)

print("A_pc_hat stats:", float(A_pc_hat.min()), float(A_pc_hat.mean()), float(A_pc_hat.max()))

# Propagation (very weak)
beta = 0.10   # strength of the change (author's recommended range 0.05~0.15)
S0 = scores_va.T.astype(np.float32)     # (C, Nva)
S  = S0 + beta * ((A_pc_hat @ S0) - S0) # residual 1-step

# Back to (Nva, C) layout.
scores_va_smooth2 = S.T

# Fraction of validation documents whose top-1 class changed after smoothing.
top1_raw = np.argmax(scores_va, axis=1)
top1_smo = np.argmax(scores_va_smooth2, axis=1)
print("top1 changed ratio:", float(np.mean(top1_raw != top1_smo)))


A_pc_hat stats: 0.0 0.0018832391360774636 1.0
top1 changed ratio: 0.023728813559322035


In [70]:
def predict_2or3(score_mat, thresh_third_pctl=85, margin_pctl=75, margin_floor=0.05):
    """Predict 2 or 3 labels per row, with thresholds taken from the batch itself.

    Args:
        score_mat: 2-D array (documents x classes) of decision scores.
        thresh_third_pctl: percentile of the third-best scores used as the
            minimum score for admitting a third label.
        margin_pctl: percentile of the (2nd - 3rd) score gaps used as the
            maximum allowed gap.
        margin_floor: lower bound applied to the margin.

    Returns:
        (pred, TH, MG): list of label lists (2 or 3 ints each, best first),
        the third-score threshold and the margin actually used.
    """
    row_ids = np.arange(score_mat.shape[0])
    ranked = np.argsort(-score_mat, axis=1)[:, :3]
    second_scores = score_mat[row_ids, ranked[:, 1]]
    third_scores = score_mat[row_ids, ranked[:, 2]]

    TH = float(np.percentile(third_scores, thresh_third_pctl))
    MG = max(float(np.percentile(second_scores - third_scores, margin_pctl)), margin_floor)

    pred = []
    # .tolist() yields plain Python ints/floats, matching int()/float() per element.
    for (first, second, third), s2, s3 in zip(
        ranked.tolist(), second_scores.tolist(), third_scores.tolist()
    ):
        keep_third = (s3 >= TH) and ((s2 - s3) <= MG)
        pred.append([first, second, third] if keep_third else [first, second])
    return pred, TH, MG

def eval_scores(score_mat, name):
    """Score one (documents x classes) matrix against the notebook-level Yva_bin.

    Labels are chosen with predict_2or3 (85th / 75th percentile, floor 0.05),
    micro and sample F1 are computed against Yva_bin, and a one-line summary
    tagged with `name` is printed.

    Returns:
        dict with keys "micro", "samples", "TH", "MG" and "dist"
        (number of predicted labels -> document count).
    """
    pred_labels, TH, MG = predict_2or3(score_mat, thresh_third_pctl=85, margin_pctl=75, margin_floor=0.05)

    # Multi-hot encode the predictions in the same layout as Yva_bin.
    Ypred = np.zeros_like(Yva_bin, dtype=np.int32)
    for row, labs in enumerate(pred_labels):
        Ypred[row, labs] = 1

    f1 = {
        avg: f1_score(Yva_bin, Ypred, average=avg, zero_division=0)
        for avg in ("micro", "samples")
    }
    label_counts = pd.Series([len(labs) for labs in pred_labels])
    dist = label_counts.value_counts().sort_index().to_dict()

    print(f"[{name}] microF1={f1['micro']:.4f} sampleF1={f1['samples']:.4f} | TH={TH:.4f} MG={MG:.4f} | dist={dist}")
    return {"micro": f1["micro"], "samples": f1["samples"], "TH": TH, "MG": MG, "dist": dist}

# Compare the raw SVM scores with the two smoothed variants on the validation split.
# NOTE(review): `scores_va_smooth`, `alpha` and `T` are not assigned in any cell
# visible in this part of the notebook (execution counts jump from In[63] to
# In[69]). Confirm a defining cell still exists; otherwise Restart & Run All
# fails on the second call with NameError.
res_raw = eval_scores(scores_va, "SVM raw")
res_smo = eval_scores(scores_va_smooth, f"SVM+GNN smooth (alpha={alpha},T={T})")
res_smo2 = eval_scores(scores_va_smooth2, "SVM + Directed residual smooth (beta=0.10)")


[SVM raw] microF1=0.4377 sampleF1=0.4282 | TH=0.0010 MG=0.1302 | dist={2: 1100, 3: 80}
[SVM+GNN smooth (alpha=0.2,T=1)] microF1=0.4156 sampleF1=0.4077 | TH=-0.0331 MG=0.1095 | dist={2: 1113, 3: 67}
[SVM + Directed residual smooth (beta=0.10)] microF1=0.4442 sampleF1=0.4349 | TH=-0.0262 MG=0.1338 | dist={2: 1102, 3: 78}


In [71]:
import json
from pathlib import Path

ART = Path("artifacts")
ART.mkdir(exist_ok=True)


def _result_summary(res):
    """Turn one eval_scores() result dict into the JSON fields stored in the meta file."""
    return {
        "micro_f1": float(res["micro"]),
        "sample_f1": float(res["samples"]),
        "TH": float(res["TH"]),
        "MG": float(res["MG"]),
        "dist": res["dist"],
    }


# Description of the Step5 post-processing plus the three validation results.
step5_meta = {
    "method": "GNN-based directed residual smoothing on taxonomy (post-process Step4 scores)",
    "graph": {
        "type": "parent_to_child_row_normalized_with_self_loop",
        "num_nodes": int(NUM_CLASSES),
        "num_edges": int(len(edges)),
    },
    "smoothing": {
        "beta": 0.10,
        "formula": "S = S0 + beta * (A_pc_hat @ S0 - S0)",
        "note": "Directed message passing (parent->child) + residual to preserve original scores"
    },
    "results": {
        "svm_raw": _result_summary(res_raw),
        "smooth_bidirectional_alpha02_T1": _result_summary(res_smo),
        "smooth_directed_residual_beta010": _result_summary(res_smo2),
    }
}

out = ART / "step5_gnn_directed_residual_meta.json"
with open(out, "w", encoding="utf-8") as f:
    json.dump(step5_meta, f, ensure_ascii=False, indent=2)

print("Saved:", out)


Saved: artifacts/step5_gnn_directed_residual_meta.json


### Step 6: Self-training

In [73]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

ART = Path("artifacts")

# Step4 model bundle; rebinds `clf` and `mlb` to the objects stored on disk.
bundle = joblib.load(ART / "step4_tfidf_linearSVC_ovr.joblib")
vec = bundle["vectorizer"]
clf = bundle["model"]
mlb = bundle["mlb"]

print("Loaded Step4 model.")

# Full train/test sets (uses train_df/test_df created in Step1)
assert "train_df" in globals() and "test_df" in globals()
train_all = train_df.copy()
test_all  = test_df.copy()
train_all["doc_id"] = train_all["doc_id"].astype(str)
test_all["doc_id"]  = test_all["doc_id"].astype(str)

print("train_all:", train_all.shape, "test_all:", test_all.shape)

# Hold out 10% with the same seed and ratio as the earlier steps (reproducibility).
# This splits ALL of train_all, whereas Step4/Step5 split only the
# high-confidence silver subset, so the two validation sets are not the same
# documents.
# NOTE(review): val_hold is not referenced again in the cells visible here - confirm it is needed.
SEED = 42
VAL_RATIO = 0.1
idx_all = np.arange(len(train_all))
tr_idx, va_idx = train_test_split(idx_all, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

train_base = train_all.iloc[tr_idx].reset_index(drop=True)
val_hold   = train_all.iloc[va_idx].reset_index(drop=True)

print("base train:", train_base.shape, "val_hold:", val_hold.shape)


Loaded Step4 model.
train_all: (29487, 3) test_all: (19658, 3)
base train: (26538, 3) val_hold: (2949, 3)


In [79]:
# Silver labels for every training document (not only the high-confidence ones).
silver_all = pd.read_json(ART / "silver_train_tfidf_all.jsonl", lines=True)
silver_all["doc_id"] = silver_all["doc_id"].astype(str)

# Attach silver labels and confidence to the base training split
# NOTE(review): the merge does not validate that doc_id is unique in silver_all - confirm.
seed = train_base.merge(silver_all[["doc_id","labels","confidence"]], on="doc_id", how="left")

print("seed merged:", seed.shape)
print("seed label missing:", int(seed["labels"].isna().sum()))

# Keep only rows that have a silver label as candidate training data
# (only the high-confidence part is used for the initial fit, see below)
seed_labeled = seed.dropna(subset=["labels"]).copy()
# Normalise list/tuple labels to list; any other value is left unchanged.
seed_labeled["labels"] = seed_labeled["labels"].apply(lambda x: list(x) if isinstance(x, (list,tuple)) else x)

print("seed_labeled:", seed_labeled.shape)

# Initial training set = confidence at or above the 60th percentile (noise suppression)
CONF_THR = float(np.percentile(seed_labeled["confidence"].values, 60))  # same idea as Step3
seed_init = seed_labeled[seed_labeled["confidence"] >= CONF_THR].copy()

print("CONF_THR:", CONF_THR)
print("seed_init:", seed_init.shape)
print("labels per doc:", seed_init["labels"].apply(len).value_counts().sort_index().to_dict())


# Low-confidence remainder: the pool that self-training draws pseudo-labels from
pool_low = seed_labeled[seed_labeled["confidence"] < CONF_THR].copy()
print("pool_low (lowconf):", pool_low.shape)
print("pool_low labels per doc:", pool_low["labels"].apply(len).value_counts().sort_index().to_dict())
print("pool_low confidence describe:")
print(pool_low["confidence"].describe())

# ===== Eval holdout (pseudo-GT) split to avoid leakage =====
EVAL_PCT = 10  # set aside 10% of seed_init for evaluation only
rng = np.random.RandomState(SEED)

# Sample by index label; seed_init still carries the row labels of `seed`.
eval_idx = rng.choice(seed_init.index.values, size=int(len(seed_init) * EVAL_PCT/100), replace=False)
eval_hold = seed_init.loc[eval_idx].copy()
# seed_init is rebound here: from this line on it excludes the eval_hold rows.
seed_init = seed_init.drop(eval_idx).copy()

print("eval_hold:", eval_hold.shape, "| seed_init(after drop):", seed_init.shape)



seed merged: (26538, 5)
seed label missing: 0
seed_labeled: (26538, 5)
CONF_THR: 0.05300162082
seed_init: (10615, 5)
labels per doc: {2: 5602, 3: 5013}
pool_low (lowconf): (15923, 5)
pool_low labels per doc: {2: 5928, 3: 9995}
pool_low confidence describe:
count    15923.000000
mean         0.031113
std          0.013627
min          0.000000
25%          0.022034
50%          0.032297
75%          0.042052
max          0.053001
Name: confidence, dtype: float64
eval_hold: (1061, 5) | seed_init(after drop): (9554, 5)


In [75]:
# Directed residual smoothing strength settled on in Step5 (beta=0.10)
beta = 0.10

# Taxonomy edges -> A_pc_hat (parent -> child)
N = len(classes_df)
A_pc = np.zeros((N, N), dtype=np.float32)
for parent, child in edges:
    # Row = child, column = parent, so A_pc_hat @ S pulls the parent's score into the child.
    A_pc[child, parent] = 1.0

# Self-loops keep each class's own score in its row.
np.fill_diagonal(A_pc, 1.0)

# Row-normalise; the floor guards against division by zero.
row_sum = A_pc.sum(axis=1, keepdims=True)
A_pc_hat = A_pc / np.maximum(row_sum, 1e-8)

def smooth_scores_parent_to_child(scores, beta=0.10, adj=None):
    """Apply one residual smoothing step along the taxonomy.

    Computes ``S = S0 + beta * (adj @ S0 - S0)`` with ``S0 = scores.T`` and
    returns ``S.T``, so the result has the same (documents x classes) layout
    as the input. Scores are cast to float32 before the update.

    Args:
        scores: array-like of shape (num_docs, num_classes).
        beta: strength of the update; 0 returns the float32 scores unchanged.
        adj: (num_classes, num_classes) propagation matrix. When omitted, the
            notebook-level ``A_pc_hat`` is used, which is what existing
            callers rely on. Passing it explicitly removes the dependency on
            whichever cell last rebuilt ``A_pc_hat``.

    Returns:
        Array of shape (num_docs, num_classes).

    Raises:
        ValueError: if ``adj`` is not square with one row per class.
    """
    if adj is None:
        adj = A_pc_hat  # notebook-level matrix built just above this function

    S0 = np.asarray(scores).T.astype(np.float32)   # (C, N)
    num_classes = S0.shape[0]
    if adj.shape != (num_classes, num_classes):
        raise ValueError(
            f"adjacency shape {adj.shape} does not match {num_classes} classes"
        )

    S = S0 + beta * ((adj @ S0) - S0)              # residual 1-step
    return S.T

def pick_2or3_from_scores(score_mat, thresh_third_pctl=85, margin_pctl=75, margin_floor=0.05):
    """Choose 2 or 3 labels per row and attach a confidence value to each row.

    Thresholds are percentiles of the batch passed in: TH is the
    `thresh_third_pctl` percentile of the third-best scores, MG the
    `margin_pctl` percentile of the (2nd - 3rd) gaps, floored at `margin_floor`.
    The third label is kept when its score is >= TH and the gap is <= MG.

    Args:
        score_mat: 2-D array (documents x classes) of decision scores.
        thresh_third_pctl: percentile used for TH.
        margin_pctl: percentile used for MG.
        margin_floor: lower bound applied to MG.

    Returns:
        (pred, conf, TH, MG): label lists (best first), a float32 array with
        ``(s1 + s2) / 2 + 0.25 * max(0, s2 - s3)`` per row, and the two
        thresholds used.
    """
    row_ids = np.arange(score_mat.shape[0])
    ranked = np.argsort(-score_mat, axis=1)[:, :3]

    best = score_mat[row_ids, ranked[:, 0]]
    runner_up = score_mat[row_ids, ranked[:, 1]]
    third = score_mat[row_ids, ranked[:, 2]]

    # Percentiles are taken in the matrix's own dtype.
    TH = float(np.percentile(third, thresh_third_pctl))
    MG = max(float(np.percentile(runner_up - third, margin_pctl)), margin_floor)

    # Per-row decisions and confidences are evaluated in float64.
    s1 = best.astype(np.float64)
    s2 = runner_up.astype(np.float64)
    s3 = third.astype(np.float64)
    gap = s2 - s3

    keep_third = (s3 >= TH) & (gap <= MG)
    pred = [
        [a, b, c] if keep else [a, b]
        for (a, b, c), keep in zip(ranked.tolist(), keep_third.tolist())
    ]

    # Confidence: mean of the top-2 scores plus a positive bonus for the 2nd-3rd gap.
    conf = (s1 + s2) / 2.0 + 0.25 * np.where(gap > 0.0, gap, 0.0)
    return pred, conf.astype(np.float32), TH, MG


In [80]:
# pool = low-confidence portion
pool = pool_low.copy()
print("pool_low:", pool.shape)

# Teacher scores: the Step4 model scores every pool document.
X_pool_vec = vec.transform(pool["text"].astype(str).tolist())
scores_pool = np.asarray(clf.decision_function(X_pool_vec))

# Step5 directed residual smoothing
scores_pool_s = smooth_scores_parent_to_child(scores_pool, beta=beta)

# Use top-k information to build a more stable confidence value
idx_sorted = np.argsort(-scores_pool_s, axis=1)[:, :4]  # up to top-4
top1 = scores_pool_s[np.arange(scores_pool_s.shape[0]), idx_sorted[:,0]]
top2 = scores_pool_s[np.arange(scores_pool_s.shape[0]), idx_sorted[:,1]]
top3 = scores_pool_s[np.arange(scores_pool_s.shape[0]), idx_sorted[:,2]]
top4 = scores_pool_s[np.arange(scores_pool_s.shape[0]), idx_sorted[:,3]]

# Predicted labels (2 or 3 per document); the confidence array returned by
# pick_2or3_from_scores is discarded in favour of pseudo_conf2 below.
pred_labels, _, TH, MG = pick_2or3_from_scores(scores_pool_s, thresh_third_pctl=85, margin_pctl=75)

# New confidence: absolute score plus margin (intended to be less sensitive to the LinearSVC score scale)
pseudo_conf2 = top1 + 0.5*(top2) + 0.5*(top2 - top4)  # size of top1 + separation of the top ranks

# Assigned positionally: both have one entry per pool row, in pool order.
pool["pseudo_labels"] = pred_labels
pool["pseudo_conf2"] = pseudo_conf2

print("TH/MG:", TH, MG)
print("pseudo_conf2 describe:")
print(pd.Series(pool["pseudo_conf2"]).describe())

# ===== Consistency filter: the original labels and the pseudo-labels must share at least one class =====
def overlap_count(a, b):
    """Return how many distinct items the two label collections have in common."""
    return len(set(a) & set(b))

# Number of classes shared by the silver labels and the teacher's pseudo-labels, per document.
pool["overlap"] = [overlap_count(o, p) for o, p in zip(pool["labels"], pool["pseudo_labels"])]

print("overlap distribution:", pool["overlap"].value_counts().sort_index().to_dict())

# ===== Very strict selection: (1) top1 > 0, (2) pseudo_conf2 in the top 5%, (3) overlap >= 1 =====
ADD_TOP_PCT = 5
thr_conf = float(np.percentile(pool["pseudo_conf2"].values, 100-ADD_TOP_PCT))

# `top1` is a plain NumPy array in pool row order, so it combines positionally
# with the two Series masks. All three conditions must hold.
to_add = pool[
    (top1 > 0) &
    (pool["pseudo_conf2"] >= thr_conf) &
    (pool["overlap"] >= 1)
].copy()

print("ADD_TOP_PCT:", ADD_TOP_PCT, "| thr_conf:", thr_conf)
print("to_add:", to_add.shape)
print("labels per doc (to_add):", to_add["pseudo_labels"].apply(len).value_counts().sort_index().to_dict())

# Show up to eight selected rows; pool.index.get_loc maps the row label back
# to its position in the `top1` array.
print("\n[Sanity check: original vs pseudo]")
for i in range(min(8, len(to_add))):
    r = to_add.iloc[i]
    print("doc_id:", r["doc_id"], "orig:", r["labels"], "pseudo:", r["pseudo_labels"],
          "overlap:", int(r["overlap"]), "top1:", float(top1[pool.index.get_loc(r.name)]), "conf2:", float(r["pseudo_conf2"]))


pool_low: (15923, 5)
TH/MG: -0.05106486417353153 0.05
pseudo_conf2 describe:
count    15923.000000
mean         0.169723
std          0.150168
min         -0.086586
25%          0.073364
50%          0.173585
75%          0.251312
max          2.031942
Name: pseudo_conf2, dtype: float64
overlap distribution: {0: 14073, 1: 1284, 2: 515, 3: 51}
ADD_TOP_PCT: 5 | thr_conf: 0.32395256757736207
to_add: (544, 8)
labels per doc (to_add): {2: 484, 3: 60}

[Sanity check: original vs pseudo]
doc_id: 23457 orig: [220, 129] pseudo: [220, 439] overlap: 1 top1: 0.6037007570266724 conf2: 0.5777121186256409
doc_id: 14658 orig: [220, 221] pseudo: [220, 221] overlap: 2 top1: 1.0881788730621338 conf2: 1.7485592365264893
doc_id: 14614 orig: [294, 433, 435] pseudo: [294, 433] overlap: 2 top1: 0.8134709000587463 conf2: 1.3495101928710938
doc_id: 12290 orig: [220, 221] pseudo: [220, 221] overlap: 2 top1: 0.6159065961837769 conf2: 0.6466577053070068
doc_id: 7136 orig: [15, 166] pseudo: [166, 15] overlap: 2 top

In [82]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Training set for self-training: silver-labelled seed documents plus the
# selected pseudo-labelled documents, with a common label column "y".
train_mix = pd.concat([
    seed_init[["text","labels"]].rename(columns={"labels":"y"}),
    to_add[["text","pseudo_labels"]].rename(columns={"pseudo_labels":"y"})
], axis=0).reset_index(drop=True)

print("train_mix:", train_mix.shape, "| pseudo added:", len(to_add))

# Reuse the Step4 vectorizer and binarizer so features and class columns line up.
X_mix = vec.transform(train_mix["text"].astype(str).tolist())
Y_mix = mlb.transform(train_mix["y"].tolist())

# random_state pins the solver's data shuffling so the self-trained model is
# reproducible and comparable with the baseline across re-runs.
clf_st = OneVsRestClassifier(LinearSVC(C=1.0, random_state=SEED), n_jobs=-1)
clf_st.fit(X_mix, Y_mix)

print("trained self-training model.")


train_mix: (10098, 2) | pseudo added: 544
trained self-training model.


In [84]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import numpy as np

# seed_init already has the eval_hold rows removed at this point.
X_base = vec.transform(seed_init["text"].astype(str).tolist())
Y_base = mlb.transform(seed_init["labels"].tolist())

# Baseline trained on exactly the seed data the self-trained model also sees,
# without the pseudo-labelled additions. random_state pins the solver's data
# shuffling so the comparison is reproducible across re-runs.
clf_base_fair = OneVsRestClassifier(LinearSVC(C=1.0, random_state=SEED), n_jobs=-1)
clf_base_fair.fit(X_base, Y_base)

print("trained fair baseline on seed_init (eval_hold excluded):", X_base.shape, Y_base.shape)




trained fair baseline on seed_init (eval_hold excluded): (9554, 55968) (9554, 531)


In [85]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import numpy as np

# NOTE(review): this cell re-trains the same baseline as the preceding cell
# (same inputs, same target names); one of the two can probably be removed.
# seed_init already has the eval_hold rows removed at this point.
X_base = vec.transform(seed_init["text"].astype(str).tolist())
Y_base = mlb.transform(seed_init["labels"].tolist())

# random_state pins the solver's data shuffling so the baseline is
# reproducible across re-runs.
clf_base_fair = OneVsRestClassifier(LinearSVC(C=1.0, random_state=SEED), n_jobs=-1)
clf_base_fair.fit(X_base, Y_base)

print("trained fair baseline on seed_init (eval_hold excluded):", X_base.shape, Y_base.shape)




trained fair baseline on seed_init (eval_hold excluded): (9554, 55968) (9554, 531)


In [86]:
from sklearn.metrics import f1_score
import numpy as np

# Held-out seed documents; their silver labels serve as the reference.
# NOTE(review): the teacher that produced the pseudo-labels (the Step4 model)
# was fit on high-confidence silver documents drawn from all of train_df, which
# may include eval_hold documents. Confirm the comparison is leak-free in that
# respect too.
Xv = vec.transform(eval_hold["text"].astype(str).tolist())
Yv = mlb.transform(eval_hold["labels"].tolist())


def _smoothed_multihot(model):
    """Score Xv with `model`, smooth along the taxonomy and pick 2-3 labels.

    Returns (smoothed scores, label lists, multi-hot int32 matrix shaped like Yv).
    """
    smoothed = smooth_scores_parent_to_child(np.asarray(model.decision_function(Xv)), beta=beta)
    picked = pick_2or3_from_scores(smoothed)[0]
    multihot = np.zeros_like(Yv, dtype=np.int32)
    for row, labs in enumerate(picked):
        multihot[row, labs] = 1
    return smoothed, picked, multihot


# fair baseline
sc_base, pred_b, Ypb = _smoothed_multihot(clf_base_fair)

# self-trained
sc_st, pred_s, Yps = _smoothed_multihot(clf_st)

micro_base, samp_base = (
    f1_score(Yv, Ypb, average=avg, zero_division=0) for avg in ("micro", "samples")
)
micro_st, samp_st = (
    f1_score(Yv, Yps, average=avg, zero_division=0) for avg in ("micro", "samples")
)

print("Eval on eval_hold (no-leak fair comparison)")
print(f" fair baseline(seed_init only) micro={micro_base:.4f} sample={samp_base:.4f}")
print(f" self-train(+to_add)        micro={micro_st:.4f} sample={samp_st:.4f}")


Eval on eval_hold (no-leak fair comparison)
 fair baseline(seed_init only) micro=0.4484 sample=0.4382
 self-train(+to_add)        micro=0.7209 sample=0.7183


### Step7

In [87]:
import numpy as np
import pandas as pd
from pathlib import Path

# test_all was already loaded in the first Step6 cell.
X_test = vec.transform(test_all["text"].astype(str).tolist())

# Score with the self-trained model, then apply the same taxonomy smoothing as in Step6.
scores_test = np.asarray(clf_st.decision_function(X_test))   # (Ntest, num_classes)
scores_test_s = smooth_scores_parent_to_child(scores_test, beta=beta)

print("scores_test:", scores_test.shape, "scores_test_s:", scores_test_s.shape)
print("scores_test_s min/mean/max:", float(scores_test_s.min()), float(scores_test_s.mean()), float(scores_test_s.max()))


scores_test: (19658, 531) scores_test_s: (19658, 531)
scores_test_s min/mean/max: -2.487013101577759 -1.0674573183059692 2.880851984024048


In [88]:
# THt/MGt are percentiles of the test scores themselves: pick_2or3_from_scores
# derives its thresholds from whatever matrix it is given, so they differ from
# the validation thresholds.
pred_test, _, THt, MGt = pick_2or3_from_scores(scores_test_s, thresh_third_pctl=85, margin_pctl=75)

lens = pd.Series([len(x) for x in pred_test]).value_counts().sort_index()
print("TH/MG (test):", THt, MGt)
print("pred labels per doc distribution:", lens.to_dict())

# Preview of the first ten predictions
preview = pd.DataFrame({
    "doc_id": test_all["doc_id"].astype(int),
    "labels": pred_test
}).head(10)
preview


TH/MG (test): -0.34720872044563283 0.24121679738163948
pred labels per doc distribution: {2: 17842, 3: 1816}


Unnamed: 0,doc_id,labels
0,0,"[90, 473]"
1,1,"[168, 18]"
2,2,"[153, 140]"
3,3,"[314, 373]"
4,4,"[67, 510]"
5,5,"[206, 8]"
6,6,"[397, 313]"
7,7,"[161, 115]"
8,8,"[49, 264]"
9,9,"[81, 48]"


In [91]:
# Submission file path; the parent directory is created if it does not exist yet.
OUT = Path("artifacts") / "2021350218_final.csv"
OUT.parent.mkdir(exist_ok=True)

# One row per test document: integer id plus the predicted class ids joined with ", ".
sub = pd.DataFrame({
    "id": test_all["doc_id"].astype(int),
    "labels": [", ".join(map(str, labs)) for labs in pred_test]
})

sub.to_csv(OUT, index=False)
print("Saved:", OUT)
sub.head()


Saved: artifacts/2021350218_final.csv


Unnamed: 0,id,labels
0,0,"90, 473"
1,1,"168, 18"
2,2,"153, 140"
3,3,"314, 373"
4,4,"67, 510"


In [90]:
import json
from pathlib import Path

ART = Path("artifacts")
ART.mkdir(exist_ok=True)

# End-to-end run summary: data sizes, model descriptions, selection settings,
# hold-out metrics, test-time thresholds and the artefact locations.
final_meta = {
    "data": {
        "num_classes": int(NUM_CLASSES),
        "num_edges": int(len(edges)),
        "num_train": int(len(train_all)),
        "num_test": int(len(test_all)),
        "seed_init_size": int(len(seed_init)),
        "eval_hold_size": int(len(eval_hold)),
        "pool_low_size": int(len(pool_low)),
        "to_add_size": int(len(to_add)),
    },
    "models": {
        "baseline_step4": "LinearSVC OneVsRest + TFIDF",
        "taxonomy_step5": "Directed residual smoothing (parent->child) beta=0.10",
        "self_training_step6": "Re-train on seed_init + filtered pseudo from lowconf pool",
    },
    "step5": {
        "beta": float(beta),
        "formula": "S = S0 + beta * (A_pc_hat @ S0 - S0)",
    },
    "step6_selection": {
        "CONF_THR_seed": float(CONF_THR),
        "ADD_TOP_PCT": int(ADD_TOP_PCT),
        "filters": ["top1>0", "pseudo_conf2>=thr_conf(top5%)", "overlap>=1"],
    },
    "eval_no_leak": {
        "fair_baseline_micro": float(micro_base),
        "fair_baseline_sample": float(samp_base),
        "self_train_micro": float(micro_st),
        "self_train_sample": float(samp_st),
    },
    "test_prediction": {
        "TH_test": float(THt),
        "MG_test": float(MGt),
        # json.dump writes the integer keys as strings ("2", "3").
        "pred_dist": {int(k): int(v) for k,v in pd.Series([len(x) for x in pred_test]).value_counts().sort_index().items()},
    },
    "artifacts": {
        "step4_model": "artifacts/step4_tfidf_linearSVC_ovr.joblib",
        "step5_meta": "artifacts/step5_gnn_directed_residual_meta.json",
        # Must name the file the submission cell actually writes
        # (it previously pointed at a non-existent submission.csv).
        "submission": "artifacts/2021350218_final.csv",
    }
}

OUT_META = ART / "final_pipeline_meta.json"
with open(OUT_META, "w", encoding="utf-8") as f:
    json.dump(final_meta, f, ensure_ascii=False, indent=2)

print("Saved:", OUT_META)


Saved: artifacts/final_pipeline_meta.json
