In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # GPU 1번만 사용

import json
import torch
import gc
import numpy as np
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from tqdm import tqdm
from datasets import Dataset, DatasetDict, Audio
from torch.utils.data import DataLoader
import evaluate

# ========================================
# 1. 데이터셋 로드 및 전처리
# ========================================

def load_my_dataset(folder_path):
    audio_paths = []
    transcripts = []
    
    for fname in sorted(os.listdir(folder_path)):
        if fname.endswith(".wav"):
            wav_path = os.path.join(folder_path, fname)
            base_name = fname.replace(".wav", "")
            txt_path = os.path.join(folder_path, base_name + ".txt")
            json_path = os.path.join(folder_path, base_name + ".json")
    
            # 텍스트 읽기 (txt > json 우선순위)
            if os.path.exists(txt_path):
                with open(txt_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()
            elif os.path.exists(json_path):
                with open(json_path, "r", encoding="utf-8") as f:
                    j = json.load(f)
                    text = j["06_transcription"]["1_text"]
            else:
                continue  # 해당 텍스트가 없으면 건너뜀
    
            audio_paths.append(wav_path)
            transcripts.append(text)
    
    return Dataset.from_dict({
        "audio": audio_paths,
        "sentence": transcripts,
    })

# DatasetDict 생성: 학습/검증 데이터 (예시에서는 kr_univ 데이터를 사용)
common_voice = DatasetDict()
common_voice["train"] = load_my_dataset("/data/seungmin/dataset/kr_univ_training_processed/number_and_english/")
common_voice["test"]  = load_my_dataset("/data/seungmin/dataset/kr_univ_validation_processed/number_and_english/")
print(common_voice)

# ========================================
# 2. Whisper 모델 관련 모듈 로드
# ========================================

model_name_or_path = "openai/whisper-large-v2"  # base 모델 경로
language = "korean"
task = "transcribe"

from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

# Audio 컬럼 샘플링레이트 강제 16kHz 변환
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    # 오디오를 16kHz로 리샘플링 후 log-Mel feature 추출
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # 텍스트를 label id로 변환
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# train과 test 모두에 대해 전처리 적용
common_voice["train"] = common_voice["train"].map(
    prepare_dataset, 
    remove_columns=common_voice["train"].column_names,
    num_proc=None
)
common_voice["test"] = common_voice["test"].map(
    prepare_dataset, 
    remove_columns=common_voice["test"].column_names,
    num_proc=None
)

# ========================================
# 3. Data Collator 정의 (패딩 처리)
# ========================================

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 오디오 입력에 대해서 패딩 처리
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
    
        # 라벨에 대해서 패딩 처리 (pad 후 -100으로 마스킹)
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
    
        # 만약 bos 토큰이 앞에 추가되었다면 제거 (모델 generate 시 재추가됨)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
    
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# ========================================
# 4. 모델 및 PEFT(LoRA) 어댑터 로드
# ========================================

from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# base 모델을 8bit 양자화 및 device_map 자동 설정으로 로드
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path, 
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# PEFT 적용을 위한 준비
model = prepare_model_for_kbit_training(model)

# LoRA adapter 구성 및 적용
config = LoraConfig(
    r=32, 
    lora_alpha=64, 
    target_modules=["q_proj", "v_proj"], 
    lora_dropout=0.05, 
    bias="none"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()  # adapter 파라미터 확인

# ========================================
# 5. 학습 설정 및 Trainer 구성
# ========================================

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

training_args = Seq2SeqTrainingArguments(
    output_dir="/data/seungmin/trial2_whisper_large/train_result2/0416-2",  # 기존 경로 사용
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 배치 크기에 맞게 그대로 유지
    learning_rate=5e-6,             # 학습률 낮춤
    lr_scheduler_type="cosine",
    warmup_steps=100,               # warmup step 늘림
    num_train_epochs=10,            # epoch 수 증가
    eval_strategy="steps",
    save_strategy="steps",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=25,
    remove_unused_columns=False,   # PEFT 모델 맞춤
    label_names=["labels"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# 체크포인트 저장 시 PEFT adapter만 별도 저장하는 Callback
class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)
    
        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)
        return control

# Trainer 생성 (compute_metrics는 필요에 따라 추가 가능)
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,  # 평가 시 metric 함수 정의 가능
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
)

model.config.use_cache = False  # inference 시 cache 사용 안함 (경고 메시지 방지)

# ========================================
# 6. 재학습(재개) 실행
# ========================================

# 이전 학습의 체크포인트 경로 (반드시 해당 경로에 checkpoint 폴더가 존재해야 함)
checkpoint_path = "/data/seungmin/trial2_whisper_large/train_result2/0416/checkpoint-749"  # 예: checkpoint-5892

# resume_from_checkpoint 인자를 통해 학습 재개
trainer.train(resume_from_checkpoint=checkpoint_path)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 6840
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 917
    })
})


Map:   0%|          | 0/6840 [00:00<?, ? examples/s]

Map:   0%|          | 0/917 [00:00<?, ? examples/s]

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhandsomemin[0m ([33mhandsomemin-kookmin-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
750,2.0195,1.994824


OutOfMemoryError: CUDA out of memory. Tried to allocate 726.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 6.81 MiB is free. Process 527129 has 12.48 GiB memory in use. Process 538260 has 5.68 GiB memory in use. Including non-PyTorch memory, this process has 5.50 GiB memory in use. Of the allocated memory 4.22 GiB is allocated by PyTorch, and 982.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model_name_or_path = "openai/whisper-large-v2-kr_univ-0416"  # 모델 이름

# 문자열로 안전하게 변환 (Enum → str)
peft_type = str(model.peft_config["default"].peft_type).split(".")[-1]  # "LORA"

# 모델 ID 생성
peft_model_id = "handsomemin/" + f"{model_name_or_path}-{peft_type}".replace("/", "-")

# 업로드
model.push_to_hub(peft_model_id)
print("Uploaded to:", peft_model_id)
