In [1]:
"""
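train_lora_en2ko.py
- Fine-tunes NHNDQ/nllb-finetuned-en2ko for en→ko translation with LoRA adapters.
- Targets roughly 85% utilisation of a 48 GB GPU. Helpers for an automatic
  per-device batch-size search are defined, but the recorded run uses a fixed batch size.
- Saves intermediate checkpoints and resumes from the most recent one.

Required packages:
  pip install -U "torch>=2.6" transformers datasets peft accelerate sacrebleu sentencepiece

Data assumptions:
  ./popsongData/build_dataset_enko_more_train.csv
  ./popsongData/build_dataset_enko_more_valid.csv
  ./popsongData/build_dataset_enko_more_test.csv
  Columns: en (input), ko (reference)  ← change SRC_COL/TGT_COL if they differ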
"""

import os, math, time, gc, torch, pandas as pd
# Restrict this process to a single GPU: devices are ordered by PCI bus id and
# only index 1 is made visible.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)
# Disabled toggle, kept for reference:
# os.environ["TRANSFORMERS_NO_ACCELERATE"] = "1"

# Disabled CPU thread limits (would have to run once, before any parallel work starts):
# torch.set_num_interop_threads(2)
# torch.set_num_threads(6)

import sys, platform

# Report which interpreter is executing the notebook.
print(f"PY: {sys.executable}")
print(f"VER: {platform.python_version()}")

PY: /home/j-j13c104/.conda/envs/c104-env/bin/python
VER: 3.9.23


In [2]:
from datasets import load_dataset
from transformers import ( AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback )
from peft import LoraConfig, get_peft_model

In [3]:
# =========================
# 0) Paths / dataset column settings
# =========================
DATA_DIR = "./popsongData"

# All three splits share one file-name stem and differ only in the suffix.
# (Previously used files: parsed_en2ko_{train,valid,test}.csv in the same directory.)
_SPLIT_STEM = f"{DATA_DIR}/build_dataset_enko_more"
TRAIN_CSV = _SPLIT_STEM + "_train.csv"
VALID_CSV = _SPLIT_STEM + "_valid.csv"
TEST_CSV = _SPLIT_STEM + "_test.csv"

# CSV column names; these are the only two values to change if the headers differ.
SRC_COL = "en"  # input sentence column
TGT_COL = "ko"  # reference (Korean) column

In [4]:
# =========================
# 1) Model / output / training hyperparameters
# =========================
# Checkpoint identifier handed to from_pretrained() for both tokenizer and model.
MODEL_ID = "NHNDQ/nllb-finetuned-en2ko"
OUT_DIR  = "./outputs/en2ko-nllb600m-lora-other-parms4"  # directory for checkpoints and the final saved model

In [5]:
# Maximum input / output length in tokens (previous setting: 112 / 112).
# NOTE(review): the original note said 128 is usually enough for one lyric line,
# yet the active value is 256.
MAX_SRC = 256
MAX_TGT = 256

# Basic optimisation settings.
LR = 1e-4            # learning rate for the LoRA parameters (1e-4 to 2e-4 suggested; previously 2e-4)
EPOCHS = 8           # NOTE(review): the original note suggested 1-2 epochs; the active value is 8
WARMUP_STEPS = 1000  # warm-up steps to stabilise early training
WEIGHT_DECAY = 0.01  # AdamW weight decay (regularisation)
SAVE_STEPS = 1000    # write a checkpoint every this many steps
EVAL_STEPS = 1000    # run validation every this many steps

In [6]:
# LoRA settings: adapters on q_proj and v_proj only (good cost/benefit).
# k_proj / out_proj can be added to TARGET_MODULES if needed.
LORA_R        = 8               # rank (expressiveness vs. memory trade-off)
LORA_ALPHA    = 16              # scaling factor (commonly about r or 2*r)
LORA_DROPOUT  = 0.05            # dropout applied on the LoRA branch
TARGET_MODULES = ["q_proj","v_proj"]  # matched by name in the encoder and decoder layers

In [7]:
# =========================
# 2) VRAM usage target and automatic batch-size search settings
# =========================
TARGET_VRAM_FRAC = 0.85   # usage target (about 40.8 GB on a 48 GB card)
MARGIN_FRAC      = 0.03   # headroom for usage growth during training; NOTE(review): not read anywhere in the visible notebook
START_BSZ        = 16     # first per-device batch size tried by the search
MAX_BSZ_CEIL     = 256    # upper bound for the per-device batch size (safety limit)
GRAD_ACC         = 8      # gradient accumulation steps (effective batch = per_device_bsz * GRAD_ACC)

In [8]:
# ====== Device / numeric precision ======
device = "cuda"
# bf16 is enabled only when CUDA is present and the device reports bf16 support.
use_bf16 = torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False

In [9]:
# ---------- 1) Tokenizer / model ----------
# Both are loaded from local files only (no download); weights come from safetensors.
tok = AutoTokenizer.from_pretrained(MODEL_ID, local_files_only=True)
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, use_safetensors=True, local_files_only=True)

# NLLB tokenizers add a language-code token to every encoded sequence.
# Set BOTH sides explicitly: src_lang is used for the inputs, tgt_lang for
# anything encoded through `text_target=` (the labels). Without tgt_lang the
# labels would carry whatever target code the tokenizer config happens to hold,
# which can disagree with the forced decoder start token set just below.
tok.src_lang = "eng_Latn"
tok.tgt_lang = "kor_Hang"

KOR_ID = tok.convert_tokens_to_ids("kor_Hang")
# An unknown token silently maps to the unk id; fail loudly instead of training
# with a wrong forced start token.
if KOR_ID is None or KOR_ID == tok.unk_token_id:
    raise ValueError("Tokenizer does not recognise the language code 'kor_Hang'")

# Force generation to begin with the Korean language code.
base.config.forced_bos_token_id = KOR_ID

In [10]:
# Attach LoRA adapters to the base model and move the wrapped model to the device.
peft_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
model = get_peft_model(base, peft_cfg)
model = model.to(device)

# Make the inputs require grad; the original note says this is needed when
# gradient checkpointing is used.
# NOTE(review): gradient checkpointing is not switched on in the visible
# training arguments.
model.enable_input_require_grads()
# Turn the decoder cache off (original note: avoids a clash with checkpointing).
model.config.use_cache = False
# Explicitly select training mode. As the last expression of the cell, this
# also displays the model structure.
model.train()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): M2M100ForConditionalGeneration(
      (model): M2M100Model(
        (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
        (encoder): M2M100Encoder(
          (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
          (embed_positions): M2M100SinusoidalPositionalEmbedding()
          (layers): ModuleList(
            (0-11): 12 x M2M100EncoderLayer(
              (self_attn): M2M100Attention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=8, bias=False)
                  )
                  (lor

In [11]:
# ---------- 2) Data ----------
# One CSV file per split; the keyword names become the split names of `raw`.
files = dict(train=TRAIN_CSV, validation=VALID_CSV, test=TEST_CSV)
raw = load_dataset("csv", data_files=files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
def preprocess(batch):
    """Tokenize one batch of sentence pairs for seq2seq training.

    The source column (SRC_COL) is encoded and truncated to MAX_SRC tokens; the
    target column (TGT_COL) is encoded through ``text_target`` and truncated to
    MAX_TGT tokens. No padding is applied here.

    Args:
        batch: mapping of column name to a list of values, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The source encoding with an extra ``"labels"`` entry holding the target
        token ids.
    """
    encoded = tok(batch[SRC_COL], truncation=True, max_length=MAX_SRC)
    target_ids = tok(text_target=batch[TGT_COL], truncation=True, max_length=MAX_TGT)["input_ids"]
    encoded["labels"] = target_ids
    return encoded

In [13]:
# Tokenizer parallelism is allowed for this preprocessing step only.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Tokenize every split in batches and drop the original text columns so that
# only the tokenizer outputs remain.
_text_columns = raw["train"].column_names
tokd = raw.map(preprocess, batched=True, remove_columns=_text_columns)

# Pads each batch dynamically to its longest sequence.
collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    model=model,
    padding="longest",
)

Map:   0%|          | 0/36195 [00:00<?, ? examples/s]

Map:   0%|          | 0/3217 [00:00<?, ? examples/s]

Map:   0%|          | 0/805 [00:00<?, ? examples/s]

In [14]:
# ---------- 3) VRAM 오토튜닝 (per-device batch 탐색) ----------
def current_vram_frac():
    """Return the fraction of device memory in use, as reported by ``torch.cuda.mem_get_info()``."""
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    return (total_bytes - free_bytes) / total_bytes

def fits_with_batch(bsz_try: int) -> bool:
    """Check whether one forward+backward pass with ``bsz_try`` samples fits in GPU memory.

    The probe batch is padded to the full MAX_SRC / MAX_TGT length so that it
    reflects the longest sequences training can produce. A probe built from a
    few-token sentence would fit at almost any batch size and tell nothing
    about the real memory need.

    Args:
        bsz_try: per-device batch size to probe.

    Returns:
        True if the pass completed, False if it ran out of GPU memory.

    Raises:
        RuntimeError: any runtime error other than an out-of-memory condition
            is re-raised unchanged.
    """
    dummy = labels = out = None
    fits = False
    try:
        # Worst-case dummy batch: every row padded to the maximum length.
        dummy = tok(["hello"] * bsz_try, return_tensors="pt", padding="max_length",
                    truncation=True, max_length=MAX_SRC).to(device)
        labels = tok(["안녕"] * bsz_try, return_tensors="pt", padding="max_length",
                     truncation=True, max_length=MAX_TGT)["input_ids"].to(device)
        dummy["labels"] = labels
        out = model(**dummy)
        (out.loss / GRAD_ACC).backward()
        torch.cuda.synchronize()
        fits = True
    except RuntimeError as e:
        # torch.cuda.OutOfMemoryError is a RuntimeError subclass; match on the
        # message so other allocation-failure wordings are caught as well.
        if "out of memory" not in str(e).lower():
            raise
    finally:
        # Clean up on BOTH paths. This runs after the except clause has ended,
        # so the exception (and the frames it kept alive) no longer pins the
        # tensors of the failed attempt and the cache can really be emptied.
        model.zero_grad(set_to_none=True)
        del dummy, labels, out
        gc.collect()
        torch.cuda.empty_cache()
    return fits

def autotune_bsz(start=START_BSZ, ceiling=MAX_BSZ_CEIL, target_frac=TARGET_VRAM_FRAC):
    """Search for the largest per-device batch size that stays within the VRAM target.

    The search first doubles the batch size from ``start`` until a probe fails
    or ``ceiling`` is passed, then binary-searches between the last passing and
    the first failing value.

    A candidate passes when ``fits_with_batch`` succeeds AND the estimated peak
    device usage during that probe is at most ``target_frac``. The peak is
    estimated as the resting device usage plus the growth of the allocator's
    reserved memory during the probe. Reading ``current_vram_frac()`` alone
    after the probe only sees the resting footprint, because the probe has
    already released its tensors by then.

    Args:
        start: first batch size to try (values below 1 are treated as 1).
        ceiling: largest batch size that may be returned.
        target_frac: maximum allowed fraction of device memory in use.

    Returns:
        The selected batch size, never below 1. A batch size of 1 is used as
        the floor without being probed.
    """
    def _within_budget(bsz: int) -> bool:
        # Restart peak tracking so the peak below belongs to this probe only.
        torch.cuda.reset_peak_memory_stats()
        if not fits_with_batch(bsz):
            return False
        _, total_bytes = torch.cuda.mem_get_info()
        probe_growth = torch.cuda.max_memory_reserved() - torch.cuda.memory_reserved()
        est_peak_frac = current_vram_frac() + max(0, probe_growth) / total_bytes
        return est_peak_frac <= target_frac

    # Phase 1: exponential growth. The clamp keeps the doubling from getting
    # stuck on a non-positive start value.
    lo, hi = 1, max(1, start)
    while hi <= ceiling and _within_budget(hi):
        lo = hi
        hi *= 2

    # Phase 2: binary search between the last good and the first bad value.
    left, right = lo, min(hi, ceiling)
    best = left
    while left <= right:
        mid = (left + right) // 2
        if _within_budget(mid):
            best = mid
            left = mid + 1
        else:
            right = mid - 1
    return max(1, best)


In [15]:
# Keep tokenizer parallelism off while training runs (safer).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): the batch size is fixed by hand here; autotune_bsz() is not
# invoked, although the log line below is still tagged "[AutoTune]".
PER_DEV_BSZ = 6
_effective_bsz = PER_DEV_BSZ * GRAD_ACC
print(
    "[AutoTune] per_device_train_batch_size = "
    f"{PER_DEV_BSZ} (grad_acc={GRAD_ACC}, eff_bsz={_effective_bsz})"
)
torch.cuda.empty_cache()

[AutoTune] per_device_train_batch_size = 6 (grad_acc=8, eff_bsz=48)


In [16]:
# Environment report: library versions and where the classes are loaded from.
import torch
import transformers

print(f"transformers: {transformers.__version__}")
print(f"torch: {torch.__version__}")
print(Seq2SeqTrainingArguments.__module__, Seq2SeqTrainingArguments.__name__, sep="\n")
print(transformers.__file__)

transformers: 4.56.2
torch: 2.5.1+cu121
transformers.training_args_seq2seq
Seq2SeqTrainingArguments
/home/j-j13c104/.conda/envs/c104-env/lib/python3.9/site-packages/transformers/__init__.py


In [17]:
# ---------- 4) Trainer / checkpointing and resume ----------
# The arguments are grouped by concern and merged into one
# Seq2SeqTrainingArguments call.

# Optimisation schedule.
_optim_kwargs = dict(
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="linear",
    warmup_steps=WARMUP_STEPS,
    weight_decay=WEIGHT_DECAY,
)

# Batch sizes; the eval batch size is capped at 64.
_batch_kwargs = dict(
    per_device_train_batch_size=PER_DEV_BSZ,
    per_device_eval_batch_size=min(PER_DEV_BSZ, 64),
    gradient_accumulation_steps=GRAD_ACC,
)

# Step-based evaluation and checkpointing; the best checkpoint (lowest
# eval_loss) is reloaded when training ends.
_eval_save_kwargs = dict(
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_safetensors=True,
)

# Generation, precision and runtime behaviour. Exactly one of bf16 / fp16 is
# enabled, depending on use_bf16.
_runtime_kwargs = dict(
    predict_with_generate=False,
    generation_max_length=MAX_TGT,
    bf16=use_bf16,
    fp16=not use_bf16,
    dataloader_num_workers=2,
    logging_steps=100,
    report_to="none",
)

args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    **_optim_kwargs,
    **_batch_kwargs,
    **_eval_save_kwargs,
    **_runtime_kwargs,
)

In [18]:

# Build the trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword is deprecated in the installed transformers release and
# triggers a warning on construction.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokd["train"],
    eval_dataset=tokd["validation"],
    processing_class=tok,
    data_collator=collator,
    # Stop when the tracked metric (eval_loss) has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

  trainer = Seq2SeqTrainer(


In [19]:
# Resume support: if a previous run was interrupted, continue from its newest checkpoint.
last_ckpt = None
if os.path.isdir(OUT_DIR):
    # Map step number -> checkpoint directory. Only real directories named
    # exactly "checkpoint-<digits>" count, so stray files or folders such as
    # "checkpoint-tmp" are skipped instead of raising ValueError in int().
    ckpt_by_step = {}
    for entry in os.listdir(OUT_DIR):
        prefix, _, step = entry.rpartition("-")
        ckpt_path = os.path.join(OUT_DIR, entry)
        if (prefix == "checkpoint" and step.isascii() and step.isdigit()
                and os.path.isdir(ckpt_path)):
            ckpt_by_step[int(step)] = ckpt_path
    if ckpt_by_step:
        last_ckpt = ckpt_by_step[max(ckpt_by_step)]
        print(f"[Resume] from {last_ckpt}")

# With last_ckpt=None training starts from scratch.
trainer.train(resume_from_checkpoint=last_ckpt)

# Save the final model and the tokenizer side by side in OUT_DIR.
trainer.save_model(OUT_DIR)
tok.save_pretrained(OUT_DIR)

print("== DONE ==")

Step,Training Loss,Validation Loss
1000,1.299,1.215894
2000,1.2212,1.158332
3000,1.1848,1.135449
4000,1.1718,1.12384
5000,1.1669,1.118778
6000,1.1457,1.11629


== DONE ==
