# Import

In [1]:
import os
import yaml
import tqdm

from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer
import torch
import wandb
import pandas as pd
import pytorch_lightning as pl
from rouge import Rouge # 모델의 성능을 평가하기 위한 라이브러리입니다.
from torch.utils.data import Dataset , DataLoader
from transformers import EarlyStoppingCallback



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


# Config

In [2]:
config_data = {
    "general": {
        "data_path": "../data/", # 모델 생성에 필요한 데이터 경로를 사용자 환경에 맞게 지정합니다.
        "model_name": "Qwen3:8b", #"digit82/kobart-summarization", # 불러올 모델의 이름을 사용자 환경에 맞게 지정할 수 있습니다.
        "output_dir": "../prdiction" # 모델의 최종 출력 값을 저장할 경로를 설정합니다.
    },
    "tokenizer": {
        "max_len": 512,
        "eos_token": "<|im_end|>",
        # 특정 단어들이 분해되어 tokenization이 수행되지 않도록 special_tokens을 지정해줍니다.
        "special_tokens": ["[SUMM]", '#Person1#', '#Person2#', '#Person3#', '#PhoneNumber#', '#Address#', '#PassportNumber#']
    },
    "training": {
        "overwrite_output_dir": True,
        "num_train_epochs": 20,
        "learning_rate": 1e-5,
        "per_device_train_batch_size": 10,
        "per_device_eval_batch_size": 10,
        "warmup_ratio": 0.1,
        "weight_decay": 0.01,
        "lr_scheduler_type": 'cosine',
        "optim": 'adamw_torch',
        "gradient_accumulation_steps": 1,
        "evaluation_strategy": 'epoch',
        "save_strategy": 'epoch',
        "save_total_limit": 5,
        "fp16": True,
        "load_best_model_at_end": True,
        "seed": 42,
        "logging_dir": "./logs",
        "logging_strategy": "epoch",
        "predict_with_generate": True,
        "generation_max_length": 100,
        "generation_max_new_tokens": 100,
        "do_train": True,
        "do_eval": True,
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.001,
        "report_to": "wandb" # (선택) wandb를 사용할 때 설정합니다.
    },
    # (선택) wandb 홈페이지에 가입하여 얻은 정보를 기반으로 작성합니다.
    "wandb": {
        "project": "dialogSUM",
        "name": "baseline002"
    },
    "inference": {
        "ckt_path": "model ckt path", # 사전 학습이 진행된 모델의 checkpoint를 저장할 경로를 설정합니다.
        "result_path": "./prediction/",
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
        "generate_max_length": 100,
        "num_beams": 4,
        "batch_size" : 10,
        # 정확한 모델 평가를 위해 제거할 불필요한 생성 토큰들을 정의합니다.
        "remove_tokens": ['<usr>', "<|im_end|>", "<|vision_pad|>"]
    }
}

# 모델의 구성 정보를 YAML 파일로 저장합니다.
config_path = "./config.yaml"
with open(config_path, "w") as file:
    yaml.dump(config_data, file, allow_unicode=True)

# 저장된 config 파일을 불러옵니다.
config_path = "./config.yaml"

with open(config_path, "r") as file:
    loaded_config = yaml.safe_load(file)

# 불러온 config 파일의 전체 내용을 확인합니다.
print(loaded_config)

{'general': {'data_path': '../data/', 'model_name': 'Qwen3:8b', 'output_dir': '../prdiction'}, 'inference': {'batch_size': 10, 'ckt_path': 'model ckt path', 'early_stopping': True, 'generate_max_length': 100, 'no_repeat_ngram_size': 2, 'num_beams': 4, 'remove_tokens': ['<usr>', '<|im_end|>', '<|vision_pad|>'], 'result_path': './prediction/'}, 'tokenizer': {'eos_token': '<|im_end|>', 'max_len': 512, 'special_tokens': ['[SUMM]', '#Person1#', '#Person2#', '#Person3#', '#PhoneNumber#', '#Address#', '#PassportNumber#']}, 'training': {'do_eval': True, 'do_train': True, 'early_stopping_patience': 3, 'early_stopping_threshold': 0.001, 'evaluation_strategy': 'epoch', 'fp16': True, 'generation_max_length': 100, 'generation_max_new_tokens': 100, 'gradient_accumulation_steps': 1, 'learning_rate': 1e-05, 'load_best_model_at_end': True, 'logging_dir': './logs', 'logging_strategy': 'epoch', 'lr_scheduler_type': 'cosine', 'num_train_epochs': 20, 'optim': 'adamw_torch', 'overwrite_output_dir': True, 'p

# Dataset

In [5]:
# 1. 데이터 전처리를 위한 클래스로, 데이터셋을 데이터프레임으로 변환하고 인코더와 디코더의 입력을 생성합니다.

class Preprocess:
    def __init__(self,
            eos_token: str,
        ) -> None:

        self.eos_token = eos_token

    @staticmethod
    # 실험에 필요한 컬럼을 가져옵니다.
    def make_set_as_df(file_path, is_train = True):
        if is_train:
            df = pd.read_csv(file_path)
            train_df = df[['fname','dialogue','summary', 'topic']]
            return train_df
        else:
            df = pd.read_csv(file_path)
            test_df = df[['fname','dialogue']]
            return test_df
    
    def make_input(self, dataset, is_test=False):
        if is_test:
            # 테스트 시에는 입력 문장만 준비
            inputs = []
            for idx, row in dataset.iterrows():
                input_text = str(row["dialogue"]) + " [SUMM] " 
                inputs.append(input_text)
            return inputs
            # inputs = dataset['dialogue'].tolist()
            # return inputs
        else:
            inputs = []
            labels = []
            for idx, row in dataset.iterrows():
                # 모델이 요약까지 포함된 전체 문장을 보고 다음 토큰 학습 가능하게 만듬
                input_text = str(row["dialogue"]) + " [SUMM] " + str(row["summary"]) + " " + self.eos_token
                inputs.append(input_text)
                # 라벨은 입력 시퀀스와 동일하게 하면 causal LM 에서는 shift 라벨링은 트레이너가 처리해서 label prediction 가능
                labels.append(input_text)
            return inputs, labels
    '''
    def make_input(self, dataset, is_test=False):
        if is_test:
            # 테스트 시: 디코더 입력이 필요 없다면 빈문자열 또는 None으로 전달
            encoder_inputs = dataset['dialogue'].tolist()
            decoder_inputs = [""] * len(encoder_inputs)  # 또는 None도 가능
            return encoder_inputs, decoder_inputs
        else:
            encoder_inputs = dataset['dialogue'].tolist()

            # BOS 없이 그냥 정답 요약문만 활용
            decoder_inputs = [str(summary) for summary in dataset['summary']]
            decoder_outputs = [str(summary) + self.eos_token for summary in dataset['summary']]
            return encoder_inputs, decoder_inputs, decoder_outputs
    '''     
        
# 2. datasets 

class DatasetForTrain(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
        self.len = inputs['input_ids'].size(0)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.inputs.items():
            item[key] = val[idx]
        item['labels'] = self.labels['input_ids'][idx]
        return item

    def __len__(self):
        return self.len


class DatasetForVal(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
        self.len = inputs['input_ids'].size(0)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.inputs.items():
            item[key] = val[idx]
        item['labels'] = self.labels['input_ids'][idx]
        return item

    def __len__(self):
        return self.len


class DatasetForInference(Dataset):
    def __init__(self, inputs, ids):
        self.inputs = inputs
        self.ids = ids
        self.len = inputs['input_ids'].size(0)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.inputs.items():
            item[key] = val[idx]
        item['ID'] = self.ids[idx]
        return item

    def __len__(self):
        return self.len

        

In [6]:

def prepare_train_dataset(config, preprocessor, data_path, tokenizer):
    train_file_path = os.path.join(data_path, 'train.csv')
    val_file_path = os.path.join(data_path, 'dev.csv')

    # 데이터프레임 생성
    train_data = preprocessor.make_set_as_df(train_file_path)
    val_data = preprocessor.make_set_as_df(val_file_path)

    print('-'*150)
    print(f'train_data:\n {train_data["dialogue"][0]}')
    print(f'train_label:\n {train_data["summary"][0]}')

    print('-'*150)
    print(f'val_data:\n {val_data["dialogue"][0]}')
    print(f'val_label:\n {val_data["summary"][0]}')

    # GPT-2용으로 inputs, labels 생성 (make_input은 이전에 GPT-2용으로 수정한 구조 가정)
    train_texts, train_labels = preprocessor.make_input(train_data, is_test=False)
    val_texts, val_labels = preprocessor.make_input(val_data, is_test=False)

    print('-'*10, 'Load data complete', '-'*10)

    # tokenizer 호출 (padding, truncation, max_length를 설정)
    tokenized_train_inputs = tokenizer(
        train_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],  # encoder_max_len == max length로 통일해도 됨
        add_special_tokens=True,
        return_token_type_ids=False
    )
    tokenized_train_labels = tokenizer(
        train_labels,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False
    )

    tokenized_val_inputs = tokenizer(
        val_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False
    )
    tokenized_val_labels = tokenizer(
        val_labels,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False
    )

    # Dataset 객체 생성 (GPT-2용 DatasetForTrain, DatasetForVal 사용)
    train_dataset = DatasetForTrain(tokenized_train_inputs, tokenized_train_labels)
    val_dataset = DatasetForVal(tokenized_val_inputs, tokenized_val_labels)

    print('-'*10, 'Make dataset complete', '-'*10)
    return train_dataset, val_dataset

# Metric

In [7]:
# 모델 성능에 대한 평가 지표를 정의합니다. 본 대회에서는 ROUGE 점수를 통해 모델의 성능을 평가합니다.
def compute_metrics(config,tokenizer,pred):
    rouge = Rouge()
    predictions = pred.predictions
    labels = pred.label_ids

    predictions[predictions == -100] = tokenizer.pad_token_id
    labels[labels == -100] = tokenizer.pad_token_id

    decoded_preds = tokenizer.batch_decode(predictions, clean_up_tokenization_spaces=True)
    labels = tokenizer.batch_decode(labels, clean_up_tokenization_spaces=True)

    # 정확한 평가를 위해 미리 정의된 불필요한 생성토큰들을 제거합니다.
    replaced_predictions = decoded_preds.copy()
    replaced_labels = labels.copy()
    remove_tokens = config['inference']['remove_tokens']
    for token in remove_tokens:
        replaced_predictions = [sentence.replace(token," ") for sentence in replaced_predictions]
        replaced_labels = [sentence.replace(token," ") for sentence in replaced_labels]

    print('-'*150)
    print(f"PRED: {replaced_predictions[0]}")
    print(f"GOLD: {replaced_labels[0]}")
    print('-'*150)
    print(f"PRED: {replaced_predictions[1]}")
    print(f"GOLD: {replaced_labels[1]}")
    print('-'*150)
    print(f"PRED: {replaced_predictions[2]}")
    print(f"GOLD: {replaced_labels[2]}")

    # 최종적인 ROUGE 점수를 계산합니다.
    results = rouge.get_scores(replaced_predictions, replaced_labels,avg=True)

    # ROUGE 점수 중 F-1 score를 통해 평가합니다.
    result = {key: value["f"] for key, value in results.items()}
    return result

# Trainer

In [9]:
def load_trainer_for_train(config, model, tokenizer, train_inputs_dataset, val_inputs_dataset):
    print('-'*10, 'Make training arguments', '-'*10,)

    # wandb 초기화
    wandb.init(
        project=config['wandb']['project'],
        name=config['wandb']['name'],
    )
    # SFTConfig 객체 생성 (TRL 공식 설정 객체)

    # 1) SFTConfig 객체 생성
    training_args = SFTConfig(
        output_dir="./output_dir",
        max_length=512,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=1,
        learning_rate=2e-4,
        # eval_steps=1000,
        # save_steps=1,
        # save_total_limit=3,
        gradient_accumulation_steps=4,
        load_best_model_at_end=False,
        metric_for_best_model="rougeL",
        greater_is_better=True,
        report_to="wandb",
        save_strategy='steps',
        eval_strategy='no',
        fp16=False,
        bf16=True,
        weight_decay = 0.01,

    )
    '''
    training_args = SFTConfig(
        output_dir=config['general']['output_dir'],
        max_length=config['tokenizer']['max_len'],  # 최대 토큰 길이
        per_device_train_batch_size=config['training']['per_device_train_batch_size'],
        per_device_eval_batch_size=config['training']['per_device_eval_batch_size'],
        num_train_epochs=config['training']['num_train_epochs'],
        learning_rate=config['training']['learning_rate'],
        weight_decay=config['training']['weight_decay'],
        warmup_ratio=config['training']['warmup_ratio'],
        logging_dir=config['training']['logging_dir'],
        logging_strategy=config['training']['logging_strategy'],  # ex) 'steps'
        logging_steps=config['training'].get('logging_steps', 10),
        evaluation_strategy=config['training']['evaluation_strategy'],  # ex) 'steps'
        eval_steps=config['training'].get('eval_steps', None),
        save_strategy=config['training']['save_strategy'],  # ex) 'steps'
        save_steps=config['training'].get('save_steps', None),
        save_total_limit=config['training']['save_total_limit'],
        gradient_accumulation_steps=config['training']['gradient_accumulation_steps'],
        fp16=config['training']['fp16'],
        seed=config['training']['seed'],
        load_best_model_at_end=config['training']['load_best_model_at_end'],
        metric_for_best_model=config['training'].get('metric_for_best_model', None),
        greater_is_better=config['training'].get('greater_is_better', True),
        report_to=config['training']['report_to'],  # e.g. 'wandb' or None
    )
    '''
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_inputs_dataset,
        # eval_dataset=val_inputs_dataset,
        tokenizer=tokenizer,
        # compute_metrics=lambda pred: compute_metrics(pred, tokenizer, config),
    )
  
    print('-'*10, 'Make trainer complete', '-'*10,)

    return trainer


In [10]:
# 학습을 위한 tokenizer와 사전 학습된 모델을 불러옵니다.
def load_tokenizer_and_model_for_train(config, device):
    
    model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/qwen3-8B",
    max_seq_length = 512,   # Context length - can be longer, but uses more memory : config['tokenizer']['encoder_max_len']
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
    )

    model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
    )
    
    special_tokens_dict={'additional_special_tokens':config['tokenizer']['special_tokens']}
    tokenizer.add_special_tokens(special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer)) # 사전에 special token을 추가했으므로 재구성 해줍니다.
    model.to(device)

    return model , tokenizer

In [11]:
def prepare_test_dataset(config,preprocessor, tokenizer):

    test_file_path = os.path.join(config['general']['data_path'],'train.csv')

    test_data = preprocessor.make_set_as_df(test_file_path,is_train=False)
    test_ids = test_data['fname']

    inputs = preprocessor.make_input(test_data,is_test=True)

    inputs = tokenizer(
        inputs,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False,
    )

    # DatasetForInference 생성: 토크나이저 결과 + id 리스트를 함께 넘깁니다.
    test_dataset = DatasetForInference(inputs, test_ids)

    return test_data, test_dataset

In [None]:
def main(config):
    # 사용할 device를 정의합니다.
    device = torch.device('cuda:0' if torch.cuda.is_available()  else 'cpu')
    print('-'*10, f'device : {device}', '-'*10,)
    print(torch.__version__)

    # 사용할 모델과 tokenizer를 불러옵니다.
    model , tokenizer = load_tokenizer_and_model_for_train(config, device)
    print('-'*10,"tokenizer special tokens : ",tokenizer.special_tokens_map,'-'*10)

    # 학습에 사용할 데이터셋을 불러옵니다.

    preprocessor = Preprocess(eos_token=config['tokenizer']['eos_token']) # decoder_start_token: str, eos_token: str
    data_path = config['general']['data_path']
    train_inputs_dataset, val_inputs_dataset = prepare_train_dataset(config,preprocessor, data_path, tokenizer)

    # Trainer 클래스를 불러옵니다.
    trainer = load_trainer_for_train(config, model,tokenizer,train_inputs_dataset,val_inputs_dataset)
    trainer.train()   # 모델 학습을 시작합니다.

    model.save_pretrained("../output_dir/model")
    tokenizer.save_pretrained("../output_dir/tokenizer")

    # (선택) 모델 학습이 완료된 후 wandb를 종료합니다.
    wandb.finish()
    preprocessor = Preprocess(eos_token=config['tokenizer']['eos_token']) 


    test_data, test_dataset = prepare_test_dataset(config, preprocessor, tokenizer)
    dataloader = DataLoader(test_dataset, batch_size=1)

    summary = []
    text_ids = []
    with torch.no_grad():
        for item in tqdm.tqdm(dataloader):
            text_ids.extend(item['ID'])
            generated_ids = model.generate(input_ids=item['input_ids'].to("cuda"),
                            attention_mask=item['attention_mask'].to("cuda"),
                            no_repeat_ngram_size=config['inference']['no_repeat_ngram_size'],
                            early_stopping=config['inference']['early_stopping'],
                            max_new_tokens=config['inference']['generate_max_length'],
                            num_beams=1,#loaded_config['inference']['num_beams'],
                            pad_token_id=tokenizer.pad_token_id,
                            eos_token_id=tokenizer.eos_token_id,
                        )
            for ids in generated_ids:
                result = tokenizer.decode(ids, skip_special_tokens=False)
                summary.append(result)
                print(result)
                print("---"*50)

    # 정확한 평가를 위하여 노이즈에 해당되는 스페셜 토큰을 제거합니다.
    remove_tokens = config['inference']['remove_tokens']
    preprocessed_summary = summary.copy()
    for token in remove_tokens:
        preprocessed_summary = [sentence.replace(token," ") for sentence in preprocessed_summary]

    output = pd.DataFrame(
        {
            "fname": test_data['fname'],
            "summary" : preprocessed_summary,
        }
    )
    result_path = config['inference']['result_path']
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    output.to_csv(os.path.join(result_path, "output_qwen.csv"), index=False)

In [None]:
if __name__ == "__main__":
    main(loaded_config)

---------- device : cuda:0 ----------
2.7.1+cu126
==((====))==  Unsloth 2025.7.8: Fast Qwen3 patching. Transformers: 4.53.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
Unsloth 2025.7.8 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


---------- tokenizer special tokens :  {'eos_token': '<|im_end|>', 'pad_token': '<|vision_pad|>', 'additional_special_tokens': ['[SUMM]', '#Person1#', '#Person2#', '#Person3#', '#PhoneNumber#', '#Address#', '#PassportNumber#']} ----------
------------------------------------------------------------------------------------------------------------------------------------------------------
train_data:
 #Person1#: 안녕하세요, Mr. Smith. 저는 Dr. Hawkins입니다. 오늘 무슨 일로 오셨어요? 
#Person2#: 건강검진을 받으려고 왔어요. 
#Person1#: 네, 5년 동안 검진을 안 받으셨네요. 매년 한 번씩 받으셔야 해요. 
#Person2#: 알죠. 특별히 아픈 데가 없으면 굳이 갈 필요가 없다고 생각했어요. 
#Person1#: 음, 심각한 질병을 피하려면 미리 발견하는 게 제일 좋거든요. 본인을 위해서라도 매년 한 번은 오세요. 
#Person2#: 알겠습니다. 
#Person1#: 여기 좀 볼까요. 눈과 귀는 괜찮으시네요. 깊게 숨 한 번 쉬어보세요. Mr. Smith, 담배 피우세요? 
#Person2#: 네. 
#Person1#: 담배가 폐암하고 심장병의 주된 원인인 거 아시죠? 끊으셔야 해요. 
#Person2#: 수백 번 시도했는데, 도저히 습관이 안 끊어져요. 
#Person1#: 음, 도움 될만한 수업과 약물들이 있습니다. 가시기 전에 더 정보를 드릴게요. 
#Person2#: 네, 고맙습니다, 의사 선생님.
train_label:
 Mr. Smith는 Dr. Hawkins에게 건강검진을 받으러 와서, 매

[34m[1mwandb[0m: Currently logged in as: [33mhoppure[0m ([33mhoppure-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


---------- Make trainer complete ----------


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,457 | Num Epochs = 1 | Total steps = 49
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 4 x 1) = 256
 "-____-"     Trainable parameters = 87,293,952 of 8,275,899,392 (1.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,11.396
2,11.3673
3,11.0795
4,10.1571
5,8.6442
6,8.2861
7,7.089
8,6.4308
9,6.2768
10,5.9085


# 추론

In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel

model, tokenizer = FastLanguageModel.from_pretrained("unsloth/qwen3-8B")
model.resize_token_embeddings(151676)

model = PeftModel.from_pretrained(model, '/data/ephemeral/home/dev/output_dir/model', is_trainable=True)

from transformers import AutoTokenizer

def load_tokenizer_from_directory(tokenizer_dir):
    """
    tokenizer_dir: str
        tokenizer가 저장된 디렉토리 경로 (예: 체크포인트 경로)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    return tokenizer
tokenizer = load_tokenizer_from_directory("/data/ephemeral/home/dev/output_dir/tokenizer")


In [67]:
preprocessor = Preprocess(eos_token=loaded_config['tokenizer']['eos_token']) 
# tokenization 과정까지 진행된 최종적으로 모델에 입력될 데이터를 출력합니다.
def prepare_test_dataset(config,preprocessor, tokenizer):

    test_file_path = os.path.join(config['general']['data_path'],'train.csv')

    test_data = preprocessor.make_set_as_df(test_file_path,is_train=False)
    test_ids = test_data['fname']

    inputs = preprocessor.make_input(test_data,is_test=True)

    inputs = tokenizer(
        inputs,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False,
    )

    # DatasetForInference 생성: 토크나이저 결과 + id 리스트를 함께 넘깁니다.
    test_dataset = DatasetForInference(inputs, test_ids)

    return test_data, test_dataset

test_data, test_dataset = prepare_test_dataset(loaded_config, preprocessor, tokenizer)
dataloader = DataLoader(test_dataset, batch_size=1)

summary = []
text_ids = []
with torch.no_grad():
    for item in tqdm.tqdm(dataloader):
        text_ids.extend(item['ID'])
        generated_ids = model.generate(input_ids=item['input_ids'].to("cuda"),
                        attention_mask=item['attention_mask'].to("cuda"),
                        no_repeat_ngram_size=loaded_config['inference']['no_repeat_ngram_size'],
                        early_stopping=loaded_config['inference']['early_stopping'],
                        max_new_tokens=loaded_config['inference']['generate_max_length'],
                        num_beams=1,#loaded_config['inference']['num_beams'],
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                    )
        for ids in generated_ids:
            result = tokenizer.decode(ids, skip_special_tokens=False)
            summary.append(result)
            print(result)
            print("---"*50)

# 정확한 평가를 위하여 노이즈에 해당되는 스페셜 토큰을 제거합니다.
remove_tokens = loaded_config['inference']['remove_tokens']
preprocessed_summary = summary.copy()
for token in remove_tokens:
    preprocessed_summary = [sentence.replace(token," ") for sentence in preprocessed_summary]

output = pd.DataFrame(
    {
        "fname": test_data['fname'],
        "summary" : preprocessed_summary,
    }
)
result_path = loaded_config['inference']['result_path']
if not os.path.exists(result_path):
    os.makedirs(result_path)
output.to_csv(os.path.join(result_path, "output_qwen.csv"), index=False)

  0%|          | 0/12457 [00:00<?, ?it/s]


AttributeError: 'Parameter' object has no attribute '_fast_lora'

# Generation

In [10]:

# tokenization 과정까지 진행된 최종적으로 모델에 입력될 데이터를 출력합니다.
def prepare_test_dataset(config,preprocessor, tokenizer):

    test_file_path = os.path.join(config['general']['data_path'],'test.csv')

    test_data = preprocessor.make_set_as_df(test_file_path,is_train=False)
    test_ids = test_data['fname']

    print('-'*150)
    print(f'test_data:\n{test_data["dialogue"][0]}')
    print('-'*150)

    inputs = preprocessor.make_input(test_data,is_test=True)
    print('-'*10, 'Load data complete', '-'*10,)

     # Qwen3는 causal LM, 디코더-온리 모델이므로 인코더/디코더 구분 X → 입력 토크나이징만 수행
    inputs = tokenizer(
        inputs,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=config['tokenizer']['max_len'],
        add_special_tokens=True,
        return_token_type_ids=False,
    )

    # DatasetForInference 생성: 토크나이저 결과 + id 리스트를 함께 넘깁니다.
    test_dataset = DatasetForInference(inputs, test_ids)

    print('-'*10, 'Make dataset complete', '-'*10)
    return test_data, test_dataset


In [11]:
# from transformers import AutoTokenizer

# 추론을 위한 tokenizer와 학습시킨 모델을 불러옵니다.
def load_tokenizer_and_model_for_test(config, device):
    print('-'*10, 'Load tokenizer & model', '-'*10,)

    ckt_path = config['inference']['ckt_path']

    model, tokenizer = FastLanguageModel.from_pretrained(ckt_path)
    # tokenizer = AutoTokenizer.from_pretrained(ckt_path)
    
    # special_tokens_dict={'additional_special_tokens':config['tokenizer']['special_tokens']}
    # tokenizer.add_special_tokens(special_tokens_dict)

    # model.resize_token_embeddings(len(tokenizer)) # 사전에 special token을 추가했으므로 재구성 해줍니다.
    model.to(device)
    print('-'*10, 'Load tokenizer & model complete', '-'*10,)

    return model , tokenizer

In [None]:
# 학습된 모델이 생성한 요약문의 출력 결과를 보여줍니다.
def inference(config):
    device = torch.device('cuda:0' if torch.cuda.is_available()  else 'cpu')
    print('-'*10, f'device : {device}', '-'*10,)
    print(torch.__version__)

    model , tokenizer = load_tokenizer_and_model_for_test(config, device)

    preprocessor = Preprocess(config['tokenizer']['eos_token'])

    test_data, test_dataset = prepare_test_dataset(config, preprocessor, tokenizer)
    dataloader = DataLoader(test_dataset, batch_size=config['inference']['batch_size'])

    summary = []
    text_ids = []
    with torch.no_grad():
        for item in tqdm(dataloader):
            text_ids.extend(item['ID'])
            generated_ids = model.generate(input_ids=item['input_ids'].to(device),
                            no_repeat_ngram_size=config['inference']['no_repeat_ngram_size'],
                            early_stopping=config['inference']['early_stopping'],
                            max_length=config['inference']['generate_max_length'],
                            num_beams=config['inference']['num_beams'],
                        )
            for ids in generated_ids:
                result = tokenizer.decode(ids)
                summary.append(result)

    # 정확한 평가를 위하여 노이즈에 해당되는 스페셜 토큰을 제거합니다.
    remove_tokens = config['inference']['remove_tokens']
    preprocessed_summary = summary.copy()
    for token in remove_tokens:
        preprocessed_summary = [sentence.replace(token," ") for sentence in preprocessed_summary]

    output = pd.DataFrame(
        {
            "fname": test_data['fname'],
            "summary" : preprocessed_summary,
        }
    )
    result_path = config['inference']['result_path']
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    output.to_csv(os.path.join(result_path, "output.csv"), index=False)

    return output

In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel

model, tokenizer = FastLanguageModel.from_pretrained("unsloth/qwen3-8B")
model.resize_token_embeddings(151676)

model = PeftModel.from_pretrained(model, '/data/ephemeral/home/dev/output_dir/model', is_trainable=True)

from transformers import AutoTokenizer

def load_tokenizer_from_directory(tokenizer_dir):
    """
    tokenizer_dir: str
        tokenizer가 저장된 디렉토리 경로 (예: 체크포인트 경로)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    return tokenizer
tokenizer = load_tokenizer_from_directory("/data/ephemeral/home/dev/output_dir/tokenizer")


==((====))==  Unsloth 2025.7.8: Fast Qwen3 patching. Transformers: 4.53.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s]


In [36]:
from transformers import AutoTokenizer

def load_tokenizer_from_directory(tokenizer_dir):
    """
    tokenizer_dir: str
        tokenizer가 저장된 디렉토리 경로 (예: 체크포인트 경로)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    return tokenizer
tokenizer = load_tokenizer_from_directory("/data/ephemeral/home/dev/output_dir/tokenizer")


In [13]:
# 이곳에 내가 사용할 wandb config 설정
loaded_config['inference']['ckt_path'] = "/data/ephemeral/home/dev/code/output_dir/checkpoint-98"

# 학습된 모델의 test를 진행합니다.
if __name__ == "__main__":
    output = inference(loaded_config)

---------- device : cuda:0 ----------
2.7.1+cu126
---------- Load tokenizer & model ----------
==((====))==  Unsloth 2025.7.8: Fast Qwen3 patching. Transformers: 4.53.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s]


RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
	size mismatch for base_model.model.model.embed_tokens.weight: copying a param with shape torch.Size([151676, 4096]) from checkpoint, the shape in current model is torch.Size([151936, 4096]).
	size mismatch for base_model.model.lm_head.weight: copying a param with shape torch.Size([151676, 4096]) from checkpoint, the shape in current model is torch.Size([151936, 4096]).

In [31]:
import torch
import gc

for k, v in list(globals().items()):
    # GPU에 있고, 텐서/모델이면
    try:
        if isinstance(v, torch.Tensor) and v.is_cuda:
            del globals()[k]
        elif isinstance(v, torch.nn.Module) and v.is_cuda:
            del globals()[k]
    except Exception:
        pass

gc.collect()
torch.cuda.empty_cache()


In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
)

device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": 0,  # 예시: lm_head만 CPU에 올림
    "transformer.h": 0,
    "transformer.ln_f": 0,
}

model, tokenizer = AutoModelForCausalLM.from_pretrained(
    "/data/ephemeral/home/dev/code/output_dir/checkpoint-3",
    quantization_config=quantization_config,
    device_map=device_map,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


ValueError: model.embed_tokens.weight doesn't have any device set.

In [None]:


# 1. PEFT(LoRA) 적용
peft_model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=False,
    loftq_config=None,
)

# 2. 옵티마이저 설정 (예: AdamW)
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=2e-4)

# 3. 데이터로더 정의 (batch_size는 VRAM에 맞게 조절)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=1)

# 4. SFTTrainer 생성
trainer = SFTTrainer(
    model=peft_model,
    tokenizer=tokenizer,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    max_epochs=3,          # 원하는 학습 epoch 수
    gradient_accumulation_steps=8,  # 배치가 너무 작다면 누적 gradient 사용
    max_seq_length=2048,   # 입력 최대 길이 (tokenizer 설정과 일치해야 함)
    save_dir="./peft_finetuned_model",  # 체크포인트 저장 경로
    save_interval=1000,   # step 단위 저장 (필요시 설정)
    logging_dir="./logs", # 로그 저장 경로
    fp16=True,            # mixed precision 사용 권장 (하드웨어에 따라)
)

# 5. 학습 실행
trainer.fit()
