In [None]:
import os
import csv
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- 1. Configuration ---
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
MODEL_NAME = "bert-base-uncased"  # checkpoint used for both tokenizer and base model
BATCH_SIZE = 32  # samples per inference batch
NUM_CLASSES = 531  # number of output labels of the classification head

# Dataset locations
BASE_DIR = "../Amazon_products"
TEST_CORPUS_PATH = os.path.join(BASE_DIR, "test/test_corpus.txt")
HIERARCHY_PATH = os.path.join(BASE_DIR, "class_hierarchy.txt")

# Inputs: first-round model weights and the existing training data
INPUT_MODEL_PATH = "saved_model/best_model.pt"
ORIGINAL_TRAIN_CSV = "silver_labels_train.csv"

# Output: training data for round 2
OUTPUT_TRAIN_CSV = "train_round_2.csv"

# Self-training hyperparameter: minimum top-class probability a test sample
# needs in order to be accepted as a pseudo-labelled training example.
CONFIDENCE_THRESHOLD = 0.85

# --- 2. Data loaders ---
def load_test_corpus(path):
    """Read a tab-separated corpus file into parallel id and text lists.

    Every line is stripped of surrounding whitespace and split once on the
    first tab. Lines that do not yield exactly two fields (no tab, or
    nothing left after the tab once stripped) are skipped.

    Args:
        path: Location of the UTF-8 encoded corpus file.

    Returns:
        A ``(pids, texts)`` tuple of equally long lists of strings, in
        file order.
    """
    pids = []
    texts = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t", 1)
            if len(fields) != 2:
                continue
            pid, text = fields
            pids.append(pid)
            texts.append(text)
    return pids, texts

def load_hierarchy(path):
    """Read the class hierarchy file into a child -> parent mapping.

    Each line is split on whitespace; the first token is taken as the
    parent id and the second as the child id. Lines with fewer than two
    tokens are ignored and tokens beyond the second are unused. If a child
    appears on several lines, the last one wins.

    Args:
        path: Location of the UTF-8 encoded hierarchy file.

    Returns:
        A dict mapping each child class id (int) to its parent class id (int).

    Raises:
        ValueError: If either of the first two tokens is not an integer.
    """
    child_to_parent = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if len(tokens) < 2:
                continue
            parent_id = int(tokens[0])
            child_id = int(tokens[1])
            child_to_parent[child_id] = parent_id
    return child_to_parent

class InferenceDataset(Dataset):
    """Label-free dataset that tokenizes raw texts on demand for inference.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask``
    tensors, padded or truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, tokenizer, max_len=256):
        """Store the inputs; tokenization happens lazily in ``__getitem__``.

        Args:
            texts: Indexable collection of texts (items are passed through
                ``str`` before tokenization).
            tokenizer: Callable tokenizer accepting the Hugging Face
                tokenizer keyword arguments used in ``__getitem__``.
            max_len: Fixed sequence length every sample is padded or
                truncated to.
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at index ``item`` and return its tensors."""
        text = str(self.texts[item])
        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # Hugging Face Transformers in favour of ``__call__``, which accepts
        # the same keyword arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Flatten to 1-D so that batching stacks samples into
        # (batch, max_len) tensors.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

# --- 3. Main logic ---
def _select_label_ids(sample_prob, min_prob=0.5, min_labels=2, max_labels=3):
    """Choose the class ids to use as pseudo-labels for a single sample.

    Classes are ranked by descending probability. Up to ``max_labels``
    classes whose probability is strictly greater than ``min_prob`` are
    kept; if fewer than ``min_labels`` qualify, the top ``min_labels``
    classes are used instead regardless of their probability.

    Args:
        sample_prob: 1-D NumPy array of per-class probabilities.
        min_prob: Probability a class must exceed to be selected.
        min_labels: Minimum number of labels to return.
        max_labels: Maximum number of labels selected via ``min_prob``.

    Returns:
        The selected class ids as plain ints, sorted ascending.
    """
    ranked = sample_prob.argsort()[::-1]
    chosen = [idx for idx in ranked if sample_prob[idx] > min_prob][:max_labels]
    if len(chosen) < min_labels:
        chosen = ranked[:min_labels]
    # Plain ints keep the label string independent of NumPy scalar formatting.
    return sorted(int(idx) for idx in chosen)


def generate_pseudo_labels():
    """Pseudo-label confident test samples and merge them into the train set.

    Loads the fine-tuned weights from ``INPUT_MODEL_PATH``, scores every
    text in ``TEST_CORPUS_PATH`` with per-class sigmoid probabilities, and
    keeps samples whose highest probability is at least
    ``CONFIDENCE_THRESHOLD``. Each kept sample receives two to three labels
    (see ``_select_label_ids``). The resulting ``pid``/``labels`` rows are
    appended to the rows of ``ORIGINAL_TRAIN_CSV`` and written to
    ``OUTPUT_TRAIN_CSV``. Progress is reported on stdout.
    """
    print("1. Loading Model & Data...")
    test_pids, test_texts = load_test_corpus(TEST_CORPUS_PATH)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_CLASSES)
    # NOTE(security): torch.load unpickles the checkpoint; only load files
    # from a trusted source. On PyTorch versions that support it, consider
    # passing weights_only=True.
    model.load_state_dict(torch.load(INPUT_MODEL_PATH, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()

    dataset = InferenceDataset(test_texts, tokenizer)
    # shuffle=False keeps batches in corpus order, which the pid pairing
    # below relies on.
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    pseudo_labels = []

    print("2. Scanning Test Data for Confident Predictions...")
    # Running position in test_pids; advancing by the actual batch length
    # keeps ids and predictions aligned whatever batch size the loader uses.
    offset = 0
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Independent per-class probabilities (multi-label setting).
            batch_probs = torch.sigmoid(outputs.logits).cpu().numpy()

            batch_pids = test_pids[offset:offset + len(batch_probs)]
            offset += len(batch_probs)

            for pid, sample_prob in zip(batch_pids, batch_probs):
                # Only samples whose best class is confident enough are kept.
                if sample_prob.max() >= CONFIDENCE_THRESHOLD:
                    # The model's predictions are trusted as-is; no parent
                    # classes from the hierarchy are added here.
                    label_ids = _select_label_ids(sample_prob)
                    label_str = ",".join(map(str, label_ids))
                    pseudo_labels.append([pid, label_str])

    print(f"   -> Found {len(pseudo_labels)} high-confidence samples out of {len(test_texts)}.")

    # --- 4. Merge and save ---
    print("3. Merging with Original Train Data...")

    # Load the existing training data.
    # NOTE(review): the merge assumes this CSV has 'pid' and 'labels'
    # columns; otherwise concat produces NaN-padded columns - confirm.
    df_train = pd.read_csv(ORIGINAL_TRAIN_CSV)
    print(f"   Original Train Size: {len(df_train)}")

    # Build the pseudo-label frame.
    df_pseudo = pd.DataFrame(pseudo_labels, columns=['pid', 'labels'])
    print(f"   Pseudo Label Size: {len(df_pseudo)}")

    # Append the pseudo-labelled rows after the original rows.
    df_round2 = pd.concat([df_train, df_pseudo], ignore_index=True)

    # Save the round-2 training set.
    df_round2.to_csv(OUTPUT_TRAIN_CSV, index=False)
    print("-" * 30)
    print(f"✅ Round 2 Training Data Saved: {OUTPUT_TRAIN_CSV}")
    print(f"   Total Training Samples: {len(df_round2)} (Increased by {len(df_pseudo)})")
    print("-" * 30)
    print("Now, run 'train.py' again with this new CSV file!")

# Run the pseudo-labelling pipeline only when this module is executed as the
# main program, not when it is imported.
if __name__ == "__main__":
    generate_pseudo_labels()

  from .autonotebook import tqdm as notebook_tqdm


1. Loading Model & Data...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2. Scanning Test Data for Confident Predictions...


  0%|          | 0/615 [00:00<?, ?it/s]