In [3]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

# --- 1. 환경 및 하이퍼파라미터 설정 ---
def seed_everything(seed=42):
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 설정 (성능을 위해 Batch Size는 GPU 메모리가 허용하는 한 크게 잡으세요)
MODEL_NAME = "bert-base-uncased" # 성능을 더 높이려면 'microsoft/deberta-v3-base' 고려 가능
BATCH_SIZE = 16  # OOM(Out of Memory) 나면 8로 줄이세요
EPOCHS = 5
LEARNING_RATE = 2e-5
MAX_LEN = 256
NUM_CLASSES = 531

# 경로 설정
BASE_DIR = "Amazon_products"
TRAIN_CORPUS_PATH = os.path.join(BASE_DIR, "train/train_corpus.txt")
SILVER_LABELS_PATH = "silver_labels_train.csv" # 방금 만든 파일
OUTPUT_MODEL_DIR = "saved_model"

if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

# --- 2. 데이터셋 클래스 정의 ---
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        # Multi-hot Encoding
        label_list = [int(x) for x in str(self.labels[item]).split(",") if x]
        label_tensor = torch.zeros(NUM_CLASSES)
        label_tensor[label_list] = 1.0

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label_tensor
        }

# --- 3. 데이터 로드 및 전처리 ---
print("Loading data...")
# 텍스트 로드
pid2text = {}
with open(TRAIN_CORPUS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t", 1)
        if len(parts) == 2:
            pid2text[int(parts[0])] = parts[1]

# 라벨 로드
df = pd.read_csv(SILVER_LABELS_PATH)
df['text'] = df['pid'].map(pid2text)

# 결측치 제거 (혹시 모를 오류 방지)
df = df.dropna(subset=['text', 'labels'])

# Train / Validation Split (9:1)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# DataLoader
train_dataset = ReviewDataset(
    texts=train_df.text.to_numpy(),
    labels=train_df.labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
val_dataset = ReviewDataset(
    texts=val_df.text.to_numpy(),
    labels=val_df.labels.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# --- 4. 모델 초기화 ---
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=NUM_CLASSES,
    problem_type="multi_label_classification"
)
model = model.to(device)

# Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = torch.nn.BCEWithLogitsLoss()

# --- 5. 학습 루프 (Training Loop) ---
best_val_loss = float('inf')

print("Starting training...")
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Train
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fn(logits, labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Gradient Clipping
        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Average Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    pred_list = []
    true_list = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)
            val_loss += loss.item()
            
            # 성능 지표 계산을 위한 저장 (Sigmoid 적용)
            preds = torch.sigmoid(logits).cpu().numpy()
            true_labels = labels.cpu().numpy()
            
            pred_list.extend(preds)
            true_list.extend(true_labels)

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Checkpoint 저장
    if avg_val_loss < best_val_loss:
        print(f"Validation Loss Improved ({best_val_loss:.4f} -> {avg_val_loss:.4f}). Saving model...")
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), os.path.join(OUTPUT_MODEL_DIR, "best_model.pt"))
    
    print()

print("Training complete. Best model saved.")

ImportError: cannot import name 'AdamW' from 'transformers' (C:\Users\user\AppData\Roaming\Python\Python314\site-packages\transformers\__init__.py)