In [1]:
# ================================================
# 1. REPRODUCIBILITY SETTINGS 
# ================================================
from pathlib import Path

import random
import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
import copy

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# ----- Reproducibility -----
# Seed every RNG used by the notebook (Python, NumPy, PyTorch CPU and CUDA)
# with the same fixed value.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(42)

# cuDNN flags: deterministic mode on, benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# ----- Device -----
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# ================================================
# 2. DEFAULT PATHS FOR DATASET
# ================================================
ROOT = Path("Amazon_products")   # dataset root directory

# Main corpus (one document per line: pid \t text)
TRAIN_CORPUS_PATH = ROOT.joinpath("train", "train_corpus.txt")
TEST_CORPUS_PATH = ROOT.joinpath("test", "test_corpus.txt")

# Taxonomy & class meta
CLASSES_PATH = ROOT.joinpath("classes.txt")                  # class_id \t class_name
HIERARCHY_PATH = ROOT.joinpath("class_hierarchy.txt")        # parent_id \t child_id
KEYWORDS_PATH = ROOT.joinpath("class_related_keywords.txt")

# Constants
NUM_CLASSES = 531
MIN_LABELS, MAX_LABELS = 2, 3   # minimum / maximum number of labels per sample

# Check paths: print each expected file together with whether it exists.
print("\n== Data path check ==")
data_paths = (
    TRAIN_CORPUS_PATH,
    TEST_CORPUS_PATH,
    CLASSES_PATH,
    HIERARCHY_PATH,
    KEYWORDS_PATH,
)
for p in data_paths:
    print(f"{p} -> {p.exists()}")

Device: cuda

== Data path check ==
Amazon_products/train/train_corpus.txt -> True
Amazon_products/test/test_corpus.txt -> True
Amazon_products/classes.txt -> True
Amazon_products/class_hierarchy.txt -> True
Amazon_products/class_related_keywords.txt -> True


In [2]:
# ================================================
# 2. DATA LOADING
# ================================================

def load_corpus(path):
    """
    Read a corpus file (pid \\t text) into a {pid: text} dictionary.

    Every line is stripped of surrounding whitespace and split on the first
    tab only, so any further tabs stay inside the text. Lines that do not
    produce both a pid and a text are skipped. If a pid occurs more than
    once, the text of its last occurrence is kept.
    """
    with open(path, "r", encoding="utf-8") as handle:
        fields_per_line = (raw.strip().split("\t", 1) for raw in handle)
        return {
            fields[0]: fields[1]
            for fields in fields_per_line
            if len(fields) == 2
        }

# Load corpus
print("Loading train/test corpus...")

pid2text_train = load_corpus(TRAIN_CORPUS_PATH)
pid2text_test = load_corpus(TEST_CORPUS_PATH)

# pid lists in the dictionaries' insertion (file) order
pid_list_train = list(pid2text_train)
pid_list_test = list(pid2text_test)

print("Train samples:", len(pid2text_train))
print("Test samples :", len(pid2text_test))

# Quick sample check: show the first train document, truncated to 80 characters.
if pid2text_train:
    i = 0
    pid, text = next(iter(pid2text_train.items()))
    print(f"Example train sample #{i}: pid={pid}, text={text[:80]}...")

Loading train/test corpus...
Train samples: 29487
Test samples : 19658
Example train sample #0: pid=0, text=omron hem 790it automatic blood pressure monitor with advanced omron health mana...


In [3]:
# ================================================
# 3. CLASS METADATA LOADING
# ================================================

def load_classes(path):
    """
    Parse classes.txt, where each line is: class_id \\t class_name

    Lines that do not split into exactly two tab-separated fields are
    skipped. The class id is converted to int.

    returns: id2label (int id -> name), label2id (name -> int id)
    """
    id2label, label2id = {}, {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) == 2:
                class_id, class_name = int(fields[0]), fields[1]
                id2label[class_id] = class_name
                label2id[class_name] = class_id
    return id2label, label2id


def load_hierarchy(path):
    """
    Parse class_hierarchy.txt, where each line is: parent_id \\t child_id

    Lines that do not split into exactly two tab-separated fields are
    skipped. Both ids are converted to int.

    returns: edges, a list of (parent_id, child_id) tuples in file order
    """
    with open(path, "r", encoding="utf-8") as handle:
        rows = (raw.strip().split("\t") for raw in handle)
        return [(int(row[0]), int(row[1])) for row in rows if len(row) == 2]


def load_keywords(path, label2id):
    """
    Parse class_related_keywords.txt, where each line is: CLASS_NAME: kw1, kw2,...

    Every class id in ``label2id`` starts with an empty keyword list. Lines
    without a ":" and lines whose class name is not in ``label2id`` are
    ignored. The name is looked up exactly as written first; if that fails,
    it is retried with surrounding whitespace removed, so a line such as
    "name : kw1, kw2" still matches "name". Keywords are split on commas,
    stripped, and empty entries are dropped.

    returns: {class_id: [kws]}
    """
    keywords_by_id = {cid: [] for cid in label2id.values()}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            name, sep, kw_field = line.strip().partition(":")
            if not sep:
                continue
            # Exact match wins so existing lookups behave as before; the
            # stripped form is only a fallback for padded names.
            if name not in label2id:
                name = name.strip()
            if name not in label2id:
                continue
            keywords_by_id[label2id[name]] = [
                kw.strip() for kw in kw_field.split(",") if kw.strip()
            ]
    return keywords_by_id


# ----------------- Load all class meta -----------------
print("Loading class metadata...")

id2label, label2id = load_classes(CLASSES_PATH)
edges = load_hierarchy(HIERARCHY_PATH)
label_keywords = load_keywords(KEYWORDS_PATH, label2id)

# Report how many classes and taxonomy edges were loaded.
for caption, size in (
    ("Num classes:", len(id2label)),
    ("Num edges in taxonomy:", len(edges)),
):
    print(caption, size)
print()

# Small check: show the name and keyword list of one class.
example_id = 0
print("Example class id:", example_id)
example_name = id2label[example_id]
print("Name:", example_name)
example_keywords = label_keywords[example_id]
print("Keywords:", example_keywords)

Loading class metadata...
Num classes: 531
Num edges in taxonomy: 568

Example class id: 0
Name: grocery_gourmet_food
Keywords: ['snacks', 'condiments', 'beverages', 'specialty_foods', 'spices', 'cooking_oils', 'baking_ingredients', 'gourmet_chocolates', 'artisanal_cheeses', 'organic_foods']


In [5]:
# ================================================
# 4. TEXT EMBEDDING WITH GTE (Alibaba-NLP/gte-base-en-v1.5)
# ================================================
from sentence_transformers import SentenceTransformer

# ----- Embedding model -----
# Model id of the sentence encoder. It is defined in this cell so the cell
# runs on a fresh kernel, and it is used for both the log line and the load.
EMB_MODEL_NAME = "Alibaba-NLP/gte-base-en-v1.5"

print(f"\n== Loading embedding model: {EMB_MODEL_NAME} ==")
emb_model = SentenceTransformer(
    EMB_MODEL_NAME,
    device=device,
    # NOTE(review): this executes model code downloaded from the Hub (see the
    # warning in this cell's output); consider pinning a revision.
    trust_remote_code=True,
)


def encode_texts(texts, batch_size: int = 64, normalize: bool = True):
    """
    Embed a list of texts with the module-level ``emb_model``
    (Alibaba-NLP/gte-base-en-v1.5).

    - The model is used only as a fixed feature extractor here.
    - normalize=True requests normalized embeddings, which makes cosine
      similarity easy to compute.

    Returns the result of ``emb_model.encode`` called with
    convert_to_numpy=True and a progress bar shown.
    """
    encode_options = dict(
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=normalize,
    )
    return emb_model.encode(texts, **encode_options)


# ----- Save directory for embeddings -----
EMB_DIR = ROOT / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)
print("Embedding dir:", EMB_DIR)


# ================================================
# 4-1. DOCUMENT EMBEDDINGS (TRAIN / TEST)
# ================================================
print("\n== Building document embeddings (train/test) ==")

# Collect the texts in pid-list order, so row i of each embedding matrix
# belongs to the i-th pid of the matching pid list.
train_texts = [pid2text_train[key] for key in pid_list_train]
test_texts = [pid2text_test[key] for key in pid_list_test]

print("Num train texts:", len(train_texts))
print("Num test texts :", len(test_texts))

# Compute the embeddings
train_doc_emb = encode_texts(train_texts)
test_doc_emb = encode_texts(test_texts)

print("train_doc_emb shape:", train_doc_emb.shape)
print("test_doc_emb shape :", test_doc_emb.shape)

# Save the embedding matrices
for filename, matrix in (
    ("train_doc_gte.npy", train_doc_emb),
    ("test_doc_gte.npy", test_doc_emb),
):
    np.save(EMB_DIR / filename, matrix)

# Save the pid lists as well (needed when building the submission)
for filename, pids in (
    ("pid_list_train.npy", pid_list_train),
    ("pid_list_test.npy", pid_list_test),
):
    np.save(EMB_DIR / filename, np.array(pids))


# ================================================
# 4-2. CLASS EMBEDDINGS (NAME / NAME+KEYWORDS)
# ================================================
print("\n== Building class embeddings ==")

# Use the class ids in ascending order (assumed to be 0 .. NUM_CLASSES-1),
# so row i of each class embedding matrix belongs to class_ids[i].
class_ids = sorted(id2label)
print("Num class ids (from id2label):", len(class_ids))

# Only warn when the loaded class count differs from NUM_CLASSES;
# nothing is corrected here.
if NUM_CLASSES != len(class_ids):
    print(f"[WARN] NUM_CLASSES({NUM_CLASSES}) != loaded classes({len(class_ids)})")

# (1) Embeddings of the bare class names
class_names = [id2label[class_id] for class_id in class_ids]
class_name_emb = encode_texts(class_names)
print("class_name_emb shape:", class_name_emb.shape)

np.save(EMB_DIR / "class_name_gte.npy", class_name_emb)

# (2) Embeddings of "class_name : kw1, kw2, ..." (optional; available for
#     later experiments). Classes without keywords fall back to the bare name.
merged_class_texts = []
for class_id in class_ids:
    keyword_list = label_keywords.get(class_id, [])
    suffix = " : " + ", ".join(keyword_list) if keyword_list else ""
    merged_class_texts.append(id2label[class_id] + suffix)

class_kw_emb = encode_texts(merged_class_texts)
print("class_kw_emb shape:", class_kw_emb.shape)

np.save(EMB_DIR / "class_kw_gte.npy", class_kw_emb)

print("\n== GTE embedding step completed successfully. ==")


== Loading embedding model: Alibaba-NLP/gte-base-en-v1.5 ==


A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Embedding dir: Amazon_products/embeddings

== Building document embeddings (train/test) ==
Num train texts: 29487
Num test texts : 19658


Batches: 100%|██████████| 461/461 [02:10<00:00,  3.52it/s]
Batches: 100%|██████████| 308/308 [01:30<00:00,  3.41it/s]


train_doc_emb shape: (29487, 768)
test_doc_emb shape : (19658, 768)

== Building class embeddings ==
Num class ids (from id2label): 531


Batches: 100%|██████████| 9/9 [00:00<00:00, 56.15it/s]


class_name_emb shape: (531, 768)


Batches: 100%|██████████| 9/9 [00:01<00:00,  7.37it/s]

class_kw_emb shape: (531, 768)

== GTE embedding step completed successfully. ==





In [None]:
# ================================================
# 5. LOAD PRECOMPUTED EMBEDDINGS
# ================================================
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# ROOT, NUM_CLASSES and device are assumed to be defined by earlier cells.
EMB_DIR = ROOT / "embeddings"
print("Embedding dir:", EMB_DIR)


# ----- 5-1. Load the base embeddings -----
print("\n== Loading precomputed embeddings ==")

# (1) Document embeddings
train_doc_emb = np.load(EMB_DIR / "train_doc_gte.npy")   # shape: (N_train, dim)
test_doc_emb  = np.load(EMB_DIR / "test_doc_gte.npy")    # shape: (N_test, dim)

# (2) pid lists
# These files are written with np.array(<list of pid strings>), i.e. as plain
# string-dtype arrays, so they load without pickle. Leaving allow_pickle at its
# default (False) avoids unpickling arbitrary objects from a file on disk.
pid_list_train = np.load(EMB_DIR / "pid_list_train.npy")
pid_list_test  = np.load(EMB_DIR / "pid_list_test.npy")

# (3) Class embeddings (two variants: name only / name + keywords)
class_name_emb = np.load(EMB_DIR / "class_name_gte.npy") # shape: (NUM_CLASSES, dim)
class_kw_emb   = np.load(EMB_DIR / "class_kw_gte.npy")   # shape: (NUM_CLASSES, dim)

print("train_doc_emb:", train_doc_emb.shape)
print("test_doc_emb :", test_doc_emb.shape)
print("class_name_emb:", class_name_emb.shape)
print("class_kw_emb  :", class_kw_emb.shape)
print("len(pid_list_train):", len(pid_list_train))
print("len(pid_list_test) :", len(pid_list_test))


# ================================================
# 5-2. Convert to torch tensors (ready to use for training)
# ================================================
train_doc_tensor = torch.from_numpy(train_doc_emb).float().to(device)
test_doc_tensor  = torch.from_numpy(test_doc_emb).float().to(device)

class_name_tensor = torch.from_numpy(class_name_emb).float().to(device)
class_kw_tensor   = torch.from_numpy(class_kw_emb).float().to(device)

print("\nTensors moved to device:", device)
print("train_doc_tensor:", train_doc_tensor.shape)
print("class_name_tensor:", class_name_tensor.shape)


# ================================================
# 5-3. 학습용 Dataset/Dataloader 예시 (라벨 생기면 여기에 붙이면 됨)
# ================================================

class EmbeddingDataset(Dataset):
    """
    Dataset pairing document embeddings with multi-hot label rows.

    X: (N, dim) document embeddings
    y: (N, C) multi-hot labels (slot for TaxoClass-style silver labels)

    Both arrays are converted to float32 tensors; indexing returns the
    (embedding, label) pair for that row.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # Embeddings and labels must have one row per sample.
        assert X.shape[0] == y.shape[0]
        self.X = torch.from_numpy(X).to(torch.float32)
        self.y = torch.from_numpy(y).to(torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features, targets = self.X[idx], self.y[idx]
        return features, targets

In [9]:
# ================================================
# 5. COSINE SIMILARITY → TOP-3 LABEL → CSV SUBMISSION
# ================================================
import numpy as np
import pandas as pd

# ----- 5-1. Load embeddings -----
EMB_DIR = ROOT / "embeddings"

# (1) Text embeddings: test documents
test_doc_emb = np.load(EMB_DIR / "test_doc_gte.npy")      # shape: (N_test, dim)
# The pid list is saved as a plain string-dtype array, so it loads without
# pickle. Leaving allow_pickle at its default (False) avoids unpickling
# arbitrary objects from a file on disk.
pid_list_test = np.load(EMB_DIR / "pid_list_test.npy")    # shape: (N_test,)

# (2) Class embeddings: the class_name variant is used here
#     (class_kw_gte.npy can be swapped in instead)
class_emb = np.load(EMB_DIR / "class_name_gte.npy")       # shape: (NUM_CLASSES, dim)

print("test_doc_emb:", test_doc_emb.shape)
print("class_emb   :", class_emb.shape)


# ----- 5-2. Cosine similarity (dot product; embeddings were saved with normalize_embeddings=True) -----
# sim[i, j] = cos(test_doc_i, class_j)
sim_matrix = np.matmul(test_doc_emb, class_emb.T)   # shape: (N_test, NUM_CLASSES)
print("sim_matrix:", sim_matrix.shape)


# ----- 5-3. Pick the top-3 classes for every document -----
TOP_K = 3

# sim_matrix: (N_test, NUM_CLASSES)
# Sort each row by descending similarity and keep the first TOP_K column
# indices. np.argpartition would also work, but a full argsort is fast
# enough with NUM_CLASSES=531.
topk_indices = np.argsort(-sim_matrix, axis=1)[:, :TOP_K]   # shape: (N_test, TOP_K)

# One label string per document: the TOP_K column indices, written as plain
# ints and joined with commas ("cid1,cid2,cid3").
labels_str_list = [
    ",".join(str(int(cid)) for cid in row)
    for row in topk_indices
]

print("예시 labels 5개:", labels_str_list[:5])
print("예시 pid 5개   :", pid_list_test[:5])


# ----- 5-4. Build the Kaggle submission CSV -----
# Assumed Kaggle format: columns named 'id' and 'labels'.
# (Per the original author, an earlier warning message mentioned an
#  'id column', so 'id' is the safe choice.)
submission_columns = {
    "id": pid_list_test,
    "labels": labels_str_list,
}
submission = pd.DataFrame(submission_columns)

# Create the output folder if needed, then write the CSV without the index.
SUBMISSION_PATH = ROOT.joinpath("submissions", "submission_top3_cosine.csv")
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)

submission.to_csv(SUBMISSION_PATH, index=False)
print("Saved submission to:", SUBMISSION_PATH)

test_doc_emb: (19658, 768)
class_emb   : (531, 768)
sim_matrix: (19658, 531)
예시 labels 5개: ['473,197,461', '168,17,18', '300,73,112', '304,354,0', '220,508,109']
예시 pid 5개   : ['0' '1' '2' '3' '4']
Saved submission to: Amazon_products/submissions/submission_top3_cosine.csv
