<a href="https://colab.research.google.com/github/hajeong67/nlp_LoRA/blob/main/llm_finetuning_eng_lab_20221346.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PEFT lab (fine-tuning `gemma-2b-it`)



## Overview
- **Food Order Analysis Dataset**: 3,000 records
   - **Goal**: Extract food names, option names, and quantities from order sentences
- **Solution**: Fine-tuning gemma-2b-it
- **Methodology**:
   - Load model with 4-bit quantization
   - Attach LoRA adapter
   - Train using **SFTTrainer**: Sentence -> next token prediction
   - Process dataset with **ConstantLengthDataset**


In [None]:
pip install transformers accelerate datasets peft trl bitsandbytes wandb

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
import os
from dataclasses import dataclass, field
from typing import Optional
import re

import torch
import sys
# import tyro
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import (
    HfArgumentParser,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TextStreamer,
    logging as hf_logging,
)
import logging
from trl import SFTTrainer, SFTConfig

from trl.trainer import ConstantLengthDataset

# Settings

In [None]:
# --- Model and data locations ---
base_model_id = "google/gemma-2b-it"       # checkpoint id passed to from_pretrained
dataset_name = "./llm-modeling-lab.jsonl"  # JSON file read by Dataset.from_json
output_dir = "./gemma-order-analysis"      # training output / checkpoint directory

# --- Runtime settings ---
seq_length = 512                # length of each packed training sequence, in tokens
torch_dtype = torch.bfloat16    # NOTE(review): not referenced by the visible cells
device_map = "cuda"             # NOTE(review): the model loads below pass "auto" directly

# Raw Dataset

In [None]:
# Read every record of the JSON-lines file into a `datasets.Dataset`.
full_dataset = Dataset.from_json(dataset_name)

# Loading the Tokenizer

In [None]:
# Tokenizer that belongs to the base checkpoint; padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = "right"

# Loading the Base Model

In [None]:
# 4-bit NF4 weight quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Load the base checkpoint with the 4-bit quantization config.
# device_map="auto" is used here; a per-process alternative would be
# {"": Accelerator().local_process_index}.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id, device_map="auto", quantization_config=bnb_config
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Turn off `use_cache` on the model config before training
# (wrapper_generate passes use_cache=True explicitly at inference time).
base_model.config.use_cache = False

In [None]:
# Ensure the tokenizer has a pad token (falling back to EOS) and keep the
# model config's pad id in sync with it.
tokenizer_lacks_pad = getattr(tokenizer, "pad_token", None) is None
if tokenizer_lacks_pad:
    tokenizer.pad_token, tokenizer.pad_token_id = (
        tokenizer.eos_token,
        tokenizer.eos_token_id,
    )
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

model_config = base_model.config
if model_config.pad_token_id != tokenizer.pad_token_id:
    model_config.pad_token_id = tokenizer.pad_token_id

# Utilities

In [None]:
def chars_token_ratio(dataset, tokenizer, prepare_sample_text, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    Args:
        dataset: Iterable of examples; at most the first ``nb_examples`` are read.
        tokenizer: Tokenizer exposing ``is_fast`` and either a callable
            interface whose result has ``.tokens()`` (fast tokenizers) or a
            ``tokenize(text)`` method (slow tokenizers).
        prepare_sample_text: Callable mapping one example to its text.
        nb_examples: Maximum number of examples to sample.

    Returns:
        Total characters divided by total tokens over the sampled examples.

    Raises:
        ValueError: If the sampled examples produce no tokens (for example an
            empty dataset), since the ratio is undefined in that case.
    """
    from itertools import islice

    # Report an accurate progress total when the dataset is shorter than
    # nb_examples; iterable datasets without a length keep the requested total.
    try:
        expected = min(nb_examples, len(dataset))
    except TypeError:
        expected = nb_examples

    total_characters, total_tokens = 0, 0
    for example in tqdm(islice(dataset, nb_examples), total=expected):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    if total_tokens == 0:
        raise ValueError(
            "Cannot estimate chars-per-token: the sampled examples produced no tokens."
        )

    return total_characters / total_tokens

In [None]:
def function_prepare_sample_text(tokenizer, for_train=True):
    """Return a formatter that renders one dataset record as chat-template text.

    Args:
        tokenizer: Object providing ``apply_chat_template``.
        for_train: When true, the record's ``output`` is appended as the
            assistant turn and no generation prompt is added. When false,
            only the user turn is rendered and the generation prompt is added.

    Returns:
        A function taking an example dict (``input`` and, for training,
        ``output``) and returning the templated text.
    """
    instruction = "너는 사용자가 입력한 주문 문장을 분석하는 에이전트이다. 주문으로부터 이를 구성하는 음식명, 옵션명, 수량을 차례대로 추출해야 한다.\n### 주문 문장: "

    def _prepare_sample_text(example):
        """Render `example` through the tokenizer's chat template."""
        conversation = [
            {"role": "user", "content": f"{instruction}{example['input']}"},
        ]
        if for_train:
            conversation.append(
                {"role": "assistant", "content": f"{example['output']}"}
            )

        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=not for_train,
        )

    return _prepare_sample_text

In [None]:
def create_datasets(tokenizer, dataset, seq_length, add_special_tokens=False):
    """Build the packed training dataset used for supervised fine-tuning.

    Each example is rendered with the training formatter from
    ``function_prepare_sample_text``, the chars-per-token ratio is estimated
    with ``chars_token_ratio`` and printed, and the result is wrapped in a
    ``ConstantLengthDataset`` created with ``infinite=True``.

    Args:
        tokenizer: Tokenizer used both for formatting and for packing.
        dataset: Source dataset of examples with ``input`` / ``output`` fields.
        seq_length: Token length of every packed sequence.
        add_special_tokens: Forwarded to ``ConstantLengthDataset``. Defaults to
            False because the chat-template text already starts with the BOS
            token; tokenizing with special tokens enabled produced a duplicated
            ``<bos><bos>`` in front of every packed example.

    Returns:
        The ``ConstantLengthDataset`` wrapping ``dataset``.
    """
    prepare_sample_text = function_prepare_sample_text(tokenizer)

    chars_per_token = chars_token_ratio(dataset, tokenizer, prepare_sample_text)
    print(
        f"The character to token ratio of the dataset is: {chars_per_token:.2f}"
    )

    cl_dataset = ConstantLengthDataset(
        tokenizer,
        dataset,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
        add_special_tokens=add_special_tokens,
    )

    return cl_dataset

# Build a dataset

In [None]:
# Build the packed training dataset; this also prints the estimated chars-per-token ratio.
ds = create_datasets(tokenizer, full_dataset, seq_length)

100%|██████████| 400/400 [00:00<00:00, 2015.97it/s]

The character to token ratio of the dataset is: 1.81





In [None]:
# Iterator over the packed dataset, used below to peek at a sample.
it = iter(ds)

In [None]:
# Decode the next packed sample back to text to inspect how examples are concatenated.
tokenizer.decode(next(it)['input_ids'])



'<bos><start_of_turn>user\n너는 사용자가 입력한 주문 문장을 분석하는 에이전트이다. 주문으로부터 이를 구성하는 음식명, 옵션명, 수량을 차례대로 추출해야 한다.\n### 주문 문장: 후루룩 어우동 덜맵게 해주세요. 그리고 콜드브루라떼는 아이스로, 두 잔 주세요.<end_of_turn>\n<start_of_turn>model\n- 분석 결과 0: 음식명:후루룩 어우동,옵션:덜맵게\n- 분석 결과 1: 음식명:콜드브루라떼,옵션:아이스,수량:두 잔<end_of_turn>\n<eos><bos><bos><start_of_turn>user\n너는 사용자가 입력한 주문 문장을 분석하는 에이전트이다. 주문으로부터 이를 구성하는 음식명, 옵션명, 수량을 차례대로 추출해야 한다.\n### 주문 문장: 오뎅탕 중자하고 가지크림카레파스타 두 그릇 주세요.<end_of_turn>\n<start_of_turn>model\n- 분석 결과 0: 음식명:오뎅탕,옵션:중자\n- 분석 결과 1: 음식명:가지크림카레파스타,수량:두그릇<end_of_turn>\n<eos><bos><bos><start_of_turn>user\n너는 사용자가 입력한 주문 문장을 분석하는 에이전트이다. 주문으로부터 이를 구성하는 음식명, 옵션명, 수량을 차례대로 추출해야 한다.\n### 주문 문장: 알밥 점심으로 주시고, 히비스커스 핫으로 한잔 주세요.<end_of_turn>\n<start_of_turn>model\n- 분석 결과 0: 음식명:알밥,옵션:점심\n- 분석 결과 1: 음식명:히비스커스,옵션:핫,수량:한잔<end_of_turn>\n<eos><bos><bos><start_of_turn>user\n너는 사용자가 입력한 주문 문장을 분석하는 에이전트이다. 주문으로부터 이를 구성하는 음식명, 옵션명, 수량을 차례대로 추출해야 한다.\n### 주문 문장: 버블밀크티 볼륨 UP 하려고 ICE XL로 주세요, 그리고 플레인 크로아상 한개 해주세요.<end_of_turn>\n<start_of_turn>

# Fine Tuning



## Training Time (1 Epoch)
- **T4**: 1 hour 20 minutes
- **RTX 4090**: 10 minutes

## Loss
- **500 steps**: 0.552
- **1500 steps**: 0.432

## Settings for a LoRA adapter

In [None]:
# Layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "down_proj", "up_proj", "gate_proj",
]

# Rank-8 LoRA (alpha 16, dropout 0.05, no bias terms) for causal language modelling.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

In [None]:
# Alias for the LoRA config; this is the name passed to SFTTrainer.
peft_config = lora_config

In [None]:
import wandb

# Look for the API key in the environment first so the cell also works outside
# Colab, then prefer the Colab secret store when it is available.
wandb_api_key = os.environ.get("WANDB_API_KEY")

try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running inside Colab

if userdata is not None:
    try:
        wandb_api_key = userdata.get('WANDB_API_KEY') or wandb_api_key
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # A missing or inaccessible secret raises instead of returning None;
        # keep the environment fallback (if any) and report below.
        pass

if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("Successfully logged in to Weights & Biases")
else:
    print("WANDB_API_KEY not found in Colab secrets or environment")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjangmin-o[0m ([33mozlab[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Successfully logged in to Weights & Biases


In [None]:
sft_config = SFTConfig(
    # Output and checkpointing: keep only the two most recent checkpoints.
    output_dir=output_dir,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    # Batching and sequence length.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    max_seq_length=seq_length,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.05,
    max_grad_norm=0.3,
    # Logging; no evaluation is run during training.
    logging_steps=20,
    eval_strategy="no",
    report_to="wandb",
    run_name="gemma-2b-fine-tuning",
)

In [None]:
# Attach the LoRA adapter to the quantized base model and train on the packed dataset.
trainer = SFTTrainer(
    model=base_model,
    args=sft_config,
    train_dataset=ds,
    eval_dataset=None,
    peft_config=peft_config,
    # `tokenizer=` is deprecated as of TRL 0.12 (the version installed above);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
)

In [None]:
# Run fine-tuning; the training loss is reported every `logging_steps` steps.
trainer.train()



Step,Training Loss
20,4.6362
40,3.6537
60,2.3497
80,1.5774
100,1.1472
120,0.931
140,0.8458
160,0.7979
180,0.7441


# Evaluation

## Utilities for Evaluation

In [None]:
def wrapper_generate(
    tokenizer,
    model,
    input_prompt,
    do_stream=False,
    max_new_tokens=128,
    add_special_tokens=True,
):
    """Greedy-decode a completion for `input_prompt` and return the model turn.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``generate`` and ``device``.
        input_prompt: Prompt text, already rendered with the chat template.
        do_stream: When true, generated text is also streamed to stdout
            through a ``TextStreamer``.
        max_new_tokens: Upper bound on the number of generated tokens.
        add_special_tokens: Forwarded to the tokenizer call. Set to False when
            the prompt text already contains the BOS token and a second one
            should not be prepended.

    Returns:
        The stripped text between ``<start_of_turn>model\\n`` and the next
        ``<end_of_turn>`` in the decoded sequence, or a fixed "no match"
        message when that pattern is absent (for example when generation stops
        before ``<end_of_turn>`` is produced).
    """
    def get_text_after_prompt(text):
        """Extract the first model turn from the decoded sequence."""
        pattern = r'<start_of_turn>model\n(.*?)<end_of_turn>'
        match = re.search(pattern, text, re.DOTALL)

        if match:
            return match.group(1).strip()
        return "매칭되는 텍스트가 없습니다."

    data = tokenizer(
        input_prompt, return_tensors="pt", add_special_tokens=add_special_tokens
    )

    # Follow the model's own device instead of assuming CUDA.
    target_device = model.device
    # NOTE(review): the last prompt token is dropped before generation, as in
    # the original code — presumably the trailing newline after "model".
    # Confirm this is intended.
    input_ids = data.input_ids[..., :-1].to(target_device)
    # Pass the matching attention mask explicitly rather than letting
    # generate() infer it.
    attention_mask = data.attention_mask[..., :-1].to(target_device)

    # Only build the streamer when streaming was requested.
    streamer = (
        TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        if do_stream
        else None
    )

    with torch.no_grad():
        pred = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            streamer=streamer,
            use_cache=True,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Special tokens are kept so the turn markers are available to the regex.
    decoded_text = tokenizer.batch_decode(pred, skip_special_tokens=False)

    # Gemma-specific post-processing: keep only the model turn.
    return get_text_after_prompt(decoded_text[0])

## Load the Fine-Tuned Model

## Release GPU memory

In [None]:
import gc

# Drop the training objects so their GPU memory can be reclaimed. Popping the
# names (instead of `del`) keeps this cell re-runnable once they are gone.
for _stale_name in ("trainer", "base_model"):
    globals().pop(_stale_name, None)

gc.collect()
torch.cuda.empty_cache()

To evaluate a different checkpoint, change the checkpoint path in the cell below.

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# With save_steps=50 and save_total_limit=2 only the two newest checkpoint-<step>
# folders survive a full run, so a fixed step such as checkpoint-100 may no
# longer exist. Load the newest one by default; to evaluate a specific step,
# set checkpoint_dir = f"{output_dir}/checkpoint-<step>" instead.
checkpoint_dir = get_last_checkpoint(output_dir)
if checkpoint_dir is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found under {output_dir!r}; run training first."
    )

# Reload the base model in 4-bit and attach the trained LoRA adapter.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_dir,
    quantization_config=bnb_config,
    device_map="auto",
)

## Test

In [None]:
# Inference-time formatter: renders only the user turn and adds the generation prompt.
preprocessor = function_prepare_sample_text(tokenizer, for_train=False)

In [None]:
# Preview the prompt text produced for a single order sentence.
preprocessor({'input':'아이스아메리카노 그랑데 한잔 주세요'})

In [None]:
# Format a two-item order as an inference prompt, then stream and return the model's analysis.
two_item_order = {'input': '아이스아메리카노 그랑데 한잔 주세요. 그리고 베이글 두개요.'}
wrapper_generate(
    tokenizer=tokenizer,
    model=trained_model,
    input_prompt=preprocessor(two_item_order),
    do_stream=True,
)