In [2]:
import os
import csv
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util

# --- 1. Environment setup ---
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Fix the random seeds (reproducibility)
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random

    random.seed(seed)
    # Exported for child processes; the running interpreter read its own
    # hash seed at startup, so this does not change hashing in this process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU rather than only the current one.
    # Safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernel choice, which may vary between runs.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

# --- Paths ---
# Dataset root; every input file below is resolved relative to it.
BASE_DIR = "../Amazon_products"
(
    TRAIN_CORPUS,
    CLASSES_PATH,
    KEYWORDS_PATH,
    HIERARCHY_PATH,
) = (
    os.path.join(BASE_DIR, relative_name)
    for relative_name in (
        "train/train_corpus.txt",
        "classes.txt",
        "class_related_keywords.txt",
        "class_hierarchy.txt",
    )
)
# Destination of the generated labels, written to the working directory.
OUTPUT_CSV = "silver_labels_train.csv"

# --- 2. Data loaders ---
def load_classes(path):
    """Read the class list and build lookups in both directions.

    A usable line is an integer class ID, a tab, then the class name.
    Lines that contain no tab after stripping are skipped.

    Args:
        path: Location of the class list file (read as UTF-8).

    Returns:
        A ``(id2name, name2id)`` pair of dicts.
    """
    id2name, name2id = {}, {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            id_field, tab, label = raw.strip().partition("\t")
            if not tab:
                continue
            class_id = int(id_field)
            label = label.strip()
            id2name[class_id] = label
            name2id[label] = class_id
    return id2name, name2id

def load_keywords(path, name2id):
    """Build one description string per class from the keyword file.

    A usable line is ``class_name:kw1,kw2,...``. For every class name found
    in ``name2id`` the result holds ``"<name>: <keywords>"`` with the commas
    in the keyword part replaced by spaces. Lines without a colon, and lines
    whose class name is unknown, are skipped.

    Args:
        path: Location of the keyword file (read as UTF-8).
        name2id: Mapping from class name to class ID.

    Returns:
        A ``defaultdict(str)`` mapping class ID to its description text.
    """
    cid2desc = defaultdict(str)
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            name, colon, kws = line.strip().partition(":")
            if not colon:
                continue
            # load_classes stores stripped names, so whitespace around the
            # colon would otherwise make a valid class silently miss.
            name = name.strip()
            if name not in name2id:
                continue
            # Description text of the form "class_name: keyword keyword ..."
            cid2desc[name2id[name]] = f"{name}: {kws.replace(',', ' ')}"
    return cid2desc

def load_hierarchy(path):
    """Read parent/child edges and map each child ID to its parent IDs.

    Every non-blank line holds two whitespace-separated integers: the parent
    ID followed by the child ID. Blank lines are ignored.

    Args:
        path: Location of the hierarchy file (read as UTF-8).

    Returns:
        A ``defaultdict(list)`` mapping child ID to its parent IDs, in file
        order.

    Raises:
        ValueError: A non-blank line does not consist of exactly two integers.
    """
    parents = defaultdict(list)
    # Explicit encoding, matching the other loaders in this file.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank or trailing empty line would otherwise break the unpacking.
                continue
            p, c = map(int, fields)
            parents[c].append(p)
    return parents

def load_corpus(path):
    """Read the corpus file into parallel lists of IDs and texts.

    A usable line is an ID, a tab, then the text; only the first tab
    separates the two. Lines that contain no tab after stripping are skipped.

    Args:
        path: Location of the corpus file (read as UTF-8).

    Returns:
        A ``(pids, texts)`` pair of equally long lists of strings.
    """
    pids = []
    texts = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            pid, tab, body = raw.strip().partition("\t")
            if tab:
                pids.append(pid)
                texts.append(body)
    return pids, texts

# --- 3. Execution logic ---
print("Loading data...")
id2name, name2id = load_classes(CLASSES_PATH)
cid2desc = load_keywords(KEYWORDS_PATH, name2id)
parents_map = load_hierarchy(HIERARCHY_PATH)
train_pids, train_texts = load_corpus(TRAIN_CORPUS)

# 3-1. Load the embedding model (Sentence-BERT)
print("Loading Sentence-BERT model...")
# 'all-MiniLM-L6-v2' is light and fast; 'all-mpnet-base-v2' is an option
# when higher quality is wanted.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# 3-2. Pre-compute the class description embeddings
print("Encoding class descriptions...")
# One text per class in ascending class-ID order (the original note mentions
# IDs 0 through 530). Classes without a keyword description use their name.
sorted_cids = sorted(id2name)
class_texts = [
    cid2desc[cid] if cid in cid2desc else id2name[cid]
    for cid in sorted_cids
]
class_embeddings = model.encode(class_texts, convert_to_tensor=True)

# 3-3. Embed the texts and score them against the classes (batch processing)
batch_size = 64
all_silver_labels = []


def _with_ancestors(seed_cids, parents):
    """Return ``seed_cids`` plus every ancestor reachable through ``parents``.

    Args:
        seed_cids: Iterable of class IDs to start from.
        parents: Mapping from a class ID to a list of its parent IDs.

    Returns:
        A set holding the seeds and all of their ancestors.
    """
    labels = set(seed_cids)
    pending = list(labels)
    while pending:
        node = pending.pop()
        # .get never inserts, so a defaultdict is not polluted with empty lists.
        for parent in parents.get(node, ()):
            # The membership test also stops a cyclic hierarchy from looping forever.
            if parent not in labels:
                labels.add(parent)
                pending.append(parent)
    return labels


print(f"Generating labels for {len(train_texts)} samples...")

# torch.topk raises when k exceeds the number of classes, so clamp it.
top_k = min(3, len(sorted_cids))

for i in tqdm(range(0, len(train_texts), batch_size)):
    batch_texts = train_texts[i : i + batch_size]
    batch_pids = train_pids[i : i + batch_size]

    # Embed the review texts
    text_embeddings = model.encode(batch_texts, convert_to_tensor=True)

    # Cosine similarity (review vs. all classes)
    # shape: (batch_size, num_classes)
    cos_scores = util.cos_sim(text_embeddings, class_embeddings)

    # Best-scoring classes for every review (up to three)
    top_results = torch.topk(cos_scores, k=top_k, dim=1)

    for pid, positions in zip(batch_pids, top_results.indices.tolist()):
        # topk yields column positions into class_embeddings, which was built
        # in sorted_cids order; translate them back to real class IDs.
        top_cids = [sorted_cids[pos] for pos in positions]

        # Apply the hierarchy: selecting a class also selects all its ancestors.
        final_labels = _with_ancestors(top_cids, parents_map)

        # Sorted so the emitted label string is stable from run to run.
        all_silver_labels.append([pid, ",".join(map(str, sorted(final_labels)))])

# --- 4. Save to CSV ---
# One row per sample: its ID and the comma-joined label string.
df = pd.DataFrame(data=all_silver_labels, columns=["pid", "labels"])
df.to_csv(OUTPUT_CSV, index=False)  # leave the DataFrame index out of the file
print(f"Saved silver labels to {OUTPUT_CSV}")
print("Sample output:")
preview = df.head()
print(preview)

Using device: cpu
Loading data...
Loading Sentence-BERT model...
Encoding class descriptions...
Generating labels for 29487 samples...


  1%|          | 5/461 [00:04<06:59,  1.09it/s]


KeyboardInterrupt: 