In [2]:
# !pip install transformers datasets peft bitsandbytes accelerate matplotlib

import time
import re
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model


In [3]:
def formatting_prompts_func(examples):
    """Render a batch of KorQuAD rows as question/context/answer training prompts.

    Args:
        examples: Batched mapping with the columns ``question`` (list of str),
            ``context`` (list of str) and ``answers`` (list of dicts whose
            ``text`` entry is a list of answer strings).

    Returns:
        ``{"text": [...]}`` with one prompt per row. Only the first answer of
        each row is used, and the end-of-text marker is appended to every prompt.
    """
    eos_token = '<|end_of_text|>'  # end-of-text marker appended to every sample
    # Everything between the triple quotes, indentation included, becomes part of
    # the training text. The explanatory comment that used to sit inside the
    # literal was therefore being fed to the model with every sample; it now
    # lives out here. The remaining layout is kept as it was.
    korQuAD_prompt = """
        ### Question:
        {}

        ### Context:
        {}

        ### Answer:
        {}
    """
    questions = examples["question"]
    contexts = examples["context"]
    first_answers = [item['text'][0] for item in examples["answers"]]
    texts = [
        korQuAD_prompt.format(question, context, answer) + eos_token
        for question, context, answer in zip(questions, contexts, first_answers)
    ]
    return {"text": texts}

# Load KorQuAD v1; the code below expects "train" and "validation" splits.
raw_datasets = load_dataset("KorQuAD/squad_kor_v1")

# Render each split into prompt text, dropping that split's original columns.
formatted_train, formatted_valid = (
    raw_datasets[split_name].map(
        formatting_prompts_func,
        batched=True,
        remove_columns=raw_datasets[split_name].column_names,
    )
    for split_name in ("train", "validation")
)


Map:   0%|          | 0/60407 [00:00<?, ? examples/s]

Map:   0%|          | 0/5774 [00:00<?, ? examples/s]

In [6]:
# Tokenizer setup for the base checkpoint.
max_length = 512  # token budget per sample (author's note: 1024 does not work)
model_name = "meta-llama/Llama-3.2-1B-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of prompt strings into fixed-length inputs with labels.

    Args:
        examples: Batched mapping with a ``text`` column (list of str).

    Returns:
        The tokenizer output, truncated/padded to ``max_length``, with an added
        ``labels`` column that mirrors ``input_ids``.
    """
    encoded = tokenizer(examples["text"], truncation=True, max_length=max_length, padding="max_length")
    # Without a "labels" column the evaluation loop has no targets to score, so
    # provide them here. Padding positions keep the pad token id (rather than
    # -100) so that a loss which ignores `tokenizer.pad_token_id` still applies.
    encoded["labels"] = [list(token_ids) for token_ids in encoded["input_ids"]]
    # NOTE(review): the prompt places the answer last, so truncation to
    # `max_length` can cut the answer off for long contexts — confirm whether
    # the context should be shortened instead.
    return encoded

# Tokenize both splits; the raw "text" column is dropped once it has been encoded.
tokenized_train, tokenized_valid = (
    formatted_split.map(tokenize_function, batched=True, remove_columns=["text"])
    for formatted_split in (formatted_train, formatted_valid)
)


Map:   0%|          | 0/60407 [00:00<?, ? examples/s]

Map:   0%|          | 0/5774 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

# 4-bit quantization. Author's note: chosen over 8-bit to favour shorter
# training time rather than model quality.
config = BitsAndBytesConfig(load_in_4bit=True)

# Load the quantized base model and prepare it for QLoRA training in one step.
# Author's note on device_map: quantization and LoRA are understood to need a
# GPU, so training would likely stop if the GPU allocation is lost; "auto" is
# kept anyway.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=config,
        device_map="auto",
    )
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [8]:
# Print the model's reported memory footprint, divided by 1024**2 and labelled MB.
check_memory = model.get_memory_footprint()
print("vram 사용량 : ", check_memory / 1024**2, "MB")

2024022144


In [9]:
# LoRA hyperparameters
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # language-modelling task
    inference_mode=False,
    r=8,  # author's note: the default value
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    # Adapters are attached to the attention query and value projections only.
    # Candidates the author left disabled: o_proj, k_proj, gate_proj, up_proj,
    # down_proj.
    target_modules=["q_proj", "v_proj"],
)


In [31]:
# Wrap the prepared base model with the LoRA adapters described by `lora_config`.
# NOTE(review): `model` is rebound to the wrapped model, so running this cell a
# second time would wrap the already-wrapped model — confirm before re-running.
model = get_peft_model(model, lora_config)

model.config.use_cache = False # author's note: with quantization, cache-related settings can reportedly cause the loss computation to be skipped

# Print the footprint again after the adapters were added, for comparison.
after_lora_config = model.get_memory_footprint()
print("after lora vram usage : ", after_lora_config / 1024**2, "MB")

2027430016


In [32]:
# Training configuration
training_args = TrainingArguments(
    run_name="experiment_2934",
    output_dir="./outputs",
    logging_dir="./logs",
    num_train_epochs=1,  # a single pass over the training set
    per_device_train_batch_size=4,  # training batch size per device
    per_device_eval_batch_size=4,  # evaluation batch size per device
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    fp16=True,  # mixed-precision training (author's note: faster, less VRAM)
    push_to_hub=False,
)


In [33]:
# Collator that assembles tokenized samples into batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [34]:
def normalize_text(s):
    """Lower-case ``s``, trim both ends, and collapse each whitespace run to one space."""
    return re.sub(r'\s+', ' ', s.lower().strip())

def compute_em_f1(pred, truth):
    """Score one prediction against one reference.

    Args:
        pred: Predicted answer string.
        truth: Reference answer string.

    Returns:
        ``(em, f1)`` where ``em`` is 1 when the normalized strings are equal and
        0 otherwise, and ``f1`` is the whitespace-token F1 as a float.
    """
    from collections import Counter  # local import: not among this notebook's top-level imports

    pred, truth = normalize_text(pred), normalize_text(truth)
    em = 1 if pred == truth else 0

    pred_tokens = pred.split()
    truth_tokens = truth.split()
    # With an empty side, precision/recall are undefined: score 1.0 only when
    # both sides are empty (so an exact match never reports F1 = 0).
    if not pred_tokens or not truth_tokens:
        return em, float(pred_tokens == truth_tokens)

    # Multiset overlap: a token repeated on both sides counts once per matched
    # occurrence. A plain set would count it once in the numerator while the
    # denominators count every occurrence, pushing F1 below 1.0 for identical
    # strings that contain a repeated word.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return em, 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return em, f1

def compute_metrics(eval_pred):
    """Compute mean exact-match and F1 over an evaluation set.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be token
            ids of shape (batch, seq), logits of shape (batch, seq, vocab), or a
            tuple whose first entry is one of those. ``labels`` holds token ids
            in which -100 marks ignored positions.

    Returns:
        ``{"exact_match": float, "f1": float}``; both are 0.0 for an empty set.
    """
    import numpy as np  # local import: numpy is not among this notebook's top-level imports

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits cannot be decoded; reduce them to the most likely token id first.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id, so replace it with the pad token on both
    # sides before decoding (skip_special_tokens then drops it).
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    if predictions.shape == labels.shape and labels.ndim == 2 and labels.shape[1] > 1:
        # Per-position outputs of a causal LM: the output at position t is the
        # guess for token t + 1, so align outputs[:-1] with targets[1:].
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]
        # Outputs that line up with padded targets carry no signal; blank them
        # so they do not decode into spurious text.
        predictions = np.where(labels == pad_id, pad_id, predictions)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scores = [compute_em_f1(pred, truth) for pred, truth in zip(decoded_preds, decoded_labels)]
    if not scores:
        return {"exact_match": 0.0, "f1": 0.0}
    count = len(scores)
    total_em = sum(em for em, _ in scores)
    total_f1 = sum(f1 for _, f1 in scores)
    return {"exact_match": total_em / count, "f1": total_f1 / count}


In [45]:
import torch
import torch.nn.functional as F
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is next-token cross-entropy with padded/ignored targets skipped."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the shifted causal-LM loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping. ``labels`` is used when present, otherwise
                ``input_ids`` doubles as the target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: Accepted and ignored, so extra keyword arguments from the
                caller do not raise.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        if labels is None:
            labels = inputs["input_ids"]

        # Fold pad-token targets into -100 so a single ignore_index covers both
        # padding and labels that arrive already masked with -100. Using the pad
        # id itself as ignore_index would leave -100 as an out-of-range class.
        # masked_fill returns a new tensor, so the batch itself is not modified.
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, -100)

        # Forward without labels: the loss is computed below, so asking the
        # model to compute its own loss as well would be redundant work.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [46]:
# Build the trainer.

def _logits_to_token_ids(logits, labels):
    """Collapse logits to argmax token ids before the evaluation loop stores them.

    The evaluation loop keeps every batch's predictions until metrics are
    computed. Full-vocabulary logits grow by seq_len * vocab_size values per
    sample, and a recorded run of this notebook ended in a CUDA out-of-memory
    error; token ids need only seq_len values per sample.

    Args:
        logits: Model output for one evaluation batch, or a tuple whose first
            entry is that tensor.
        labels: Labels for the batch; unused here.

    Returns:
        Tensor of token ids with the vocabulary dimension removed.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)


  trainer = CustomTrainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [47]:
# GPU memory allocated before training starts (skipped when CUDA is unavailable).

if torch.cuda.is_available():
    vram_before = torch.cuda.memory_allocated()
    print(f"학습 전 VRAM 사용량: {vram_before / (1024**2):.2f} MB")


학습 전 VRAM 사용량: 4729.25 MB


In [29]:
!pip install weave

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: http://repo.ai.gato/registry/repository/pypi-proxy/simple
Collecting graphql-core<3.2.4,>=3.2 (from gql[aiohttp,requests]->weave)
  Using cached http://repo.ai.gato/registry/repository/pypi-proxy/packages/graphql-core/3.2.3/graphql_core-3.2.3-py3-none-any.whl (202 kB)
Installing collected packages: graphql-core
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 2.20.1 requires pyarrow<19,>=4.0.0, but you have pyarrow 19.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed graphql-core-3.2.3


In [20]:
# Run the Weights & Biases CLI login from the notebook's shell.
!wandb login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mhandsomemin[0m ([33mhandsomemin-kookmin-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [48]:
# Run training and report the elapsed wall-clock time in seconds.
start_time = time.time()
train_result = trainer.train()
end_time = time.time()
training_time = end_time - start_time
print(f"학습 시간: {training_time:.2f} 초")


Epoch,Training Loss,Validation Loss




OutOfMemoryError: CUDA out of memory. Tried to allocate 8.81 GiB. GPU 0 has a total capacity of 23.68 GiB of which 3.98 GiB is free. Process 61661 has 28.06 MiB memory in use. Process 46130 has 19.67 GiB memory in use. Of the allocated memory 13.44 GiB is allocated by PyTorch, and 5.91 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)