<a href="https://colab.research.google.com/github/hengjiUSTC/learn-llm/blob/main/learn_qlora_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets trl sentencepiece protobuf

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# Prepare data

- load 366 rows of Chinese poetry
- format data to conform llama2 chat.

In [1]:
import pandas as pd
import requests
from datasets import Dataset

def load_json_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception("Failed to load data from URL")

def generate_prompt(data_point):
    tags = ';'.join(data_point['tags'])
    paragraph = '\n'.join(data_point['paragraphs'])
    return f"""[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:{data_point["author"]}
标签:{tags}
[/INST]{data_point['title']}
{paragraph}
""".strip()

def generate_text(data_point):
    full_prompt = generate_prompt(data_point)
    return {"text": full_prompt}

# URL of the JSON file
url = "https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/%E5%85%A8%E5%94%90%E8%AF%97/%E5%94%90%E8%AF%97%E4%B8%89%E7%99%BE%E9%A6%96.json"
data = load_json_from_url(url)

# Convert data to Pandas DataFrame
df = pd.DataFrame(data=data)

# Create Dataset and apply transformations
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle().map(generate_text)

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

In [2]:
print(dataset[0]['text'])
dataset

[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:劉禹錫
标签:唐诗三百首;怀古;七言绝句;带有地名;地名
[/INST]金陵五題 烏衣巷 
朱雀橋邊野草花，烏衣巷口夕陽斜。
舊時王謝堂前燕，飛入尋常百姓家。


Dataset({
    features: ['author', 'paragraphs', 'tags', 'title', 'id', 'text'],
    num_rows: 366
})

In [5]:
from transformers import AutoTokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Parameters for finetune process

In [6]:
model_id = "NousResearch/Llama-2-7b-chat-hf"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = torch.float16

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "results"
save_dir = "qlora-result"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 3

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Log every X updates steps
logging_steps = 10

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 2048

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load model

- load llama2 chat model in int4 format (quantization)

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,
    torch_dtype=torch.float16,
    use_cache=False
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Build tokenizer

- cautious I added special token handling `tokenizer.pad_token_id = 18610`. Without this change finetuning process will cause none stopping issue for generated model.

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code=True,
                                          add_eos_token=True,
                                          use_fast=False)
tokenizer.add_special_tokens({
    "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
    "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
    "unk_token": tokenizer.convert_ids_to_tokens(
        model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
    ),
})
#ref:
# https://github.com/huggingface/transformers/issues/22794#issuecomment-1616258519
# https://www.reddit.com/r/LocalLLaMA/comments/15hz7gl/my_finetuning_based_on_llama27bchathf_model/
tokenizer.pad_token_id = 18610 #_***
tokenizer.padding_side = "right"
tokenizer

LlamaTokenizer(name_or_path='NousResearch/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '▁***'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [9]:
# Util function for generate result
def test_model(model, tokenizer, prompt, max_new_tokens=400, top_p=0.9, temperature=0.7):
    """
    Generates text using a provided model and tokenizer.

    Args:
    - model: The language model to use for generation.
    - tokenizer: The tokenizer associated with the model.
    - prompt: The prompt to feed to the model.
    - max_new_tokens: The maximum number of new tokens to generate. Default is 400.
    - top_p: Nucleus sampling's cumulative probability cutoff. Default is 0.9.
    - temperature: Controls randomness in generation. Lower values make text less random. Default is 0.7.

    Returns:
    A string containing the generated text.
    """

    # Tokenize the prompt
    tmp_eos = tokenizer.add_eos_token
    tokenizer.add_eos_token = False
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    tokenizer.add_eos_token = tmp_eos

    # Generate the output
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        do_sample=True,
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode and clean up the output
    generated_output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]
    print(f"Prompt:\n{prompt}\n")
    print(f"Generated output:\n{generated_output}")
    return

# Test original model

In [10]:
prompt = f"""[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:駱賓王
标签:思念;七言律诗;秋天;咏物
[/INST]
"""

test_model(model, tokenizer, prompt)



Prompt:
[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:駱賓王
标签:思念;七言律诗;秋天;咏物
[/INST]


Generated output:
凛冬风雨晨，思念萧瑟潇湿。
Autumn winds howl through the night,
Memories of you, my heart's delight.

In the morning dew, I wander alone,
Missing you, my dear, like a forgotten tone.
The trees stand tall, their leaves now gold,
But without you, my heart's cold.

Seven lines, a perfect rhyme,
A poem of longing, a heart's crime.
In autumn's hue, my thoughts are cast,
Longing for you, my love, at last.


# Initilize training parameters

In [11]:
from peft import LoraConfig, PeftModel, PeftConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
)



In [12]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=packing,
    dataset_text_field="text",
    args=args,
)

trainable params: 159,907,840 || all params: 6,898,323,456 || trainable%: 2.3180681656919973


Map:   0%|          | 0/366 [00:00<?, ? examples/s]

In [13]:
trainer.train()



Step,Training Loss
10,3.6785
20,2.4519
30,1.9756
40,1.8281
50,1.8357
60,1.6478
70,1.6741
80,1.5853
90,1.6098
100,1.6075


TrainOutput(global_step=366, training_loss=1.4362254742064762, metrics={'train_runtime': 1709.92, 'train_samples_per_second': 0.642, 'train_steps_per_second': 0.214, 'total_flos': 1.0621756074811392e+16, 'train_loss': 1.4362254742064762, 'epoch': 3.0})

In [14]:
# save model
trainer.save_model(save_dir)

In [15]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()
torch.cuda.empty_cache()

In [16]:
# load saved model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(save_dir)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model = PeftModel.from_pretrained(model, save_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Check finetune result

In [20]:
prompt = f"""<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李白
标签:乐府;赞美;近代曲辞
[/INST]"""
test_model(model, tokenizer, prompt)

Prompt:
<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李白
标签:乐府;赞美;近代曲辞
[/INST]

Generated output:
辭 清平調 三 
樂聖樂天下，妾心中欲妝。
誰調彈劒輕，一把琴聽行。
鳴琴吞聲賦，感懷滋潤勞。
欲妝妾心怯，欲慰心欲殘。


In [21]:
prompt = f"""[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李商隱
标签:黄河;咏物;抒情;鼓吹曲辞;乐府;咏物诗
[/INST]
"""
test_model(model, tokenizer, prompt)

Prompt:
[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李商隱
标签:黄河;咏物;抒情;鼓吹曲辞;乐府;咏物诗
[/INST]


Generated output:
鼓吹曲辭 黄河 
黃河出山東，兩岸胡時雨。
輕舟橫漠漠，閑坐覽夕陽。
兩岸廢舊村，一片荒涼草。
黃河又何處，千里寒雲滿。


In [19]:
prompt = f"""<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:杜甫
标签:乐府;赞美;近代曲辞
[/INST]
"""
test_model(model, tokenizer, prompt)

Prompt:
<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:杜甫
标签:乐府;赞美;近代曲辞
[/INST]


Generated output:
辭 清平調 一 
清平調，欲往舟卒，終於晚泊。
獨立鳳雛，萬里長風。
昔日青春，不可復來。
潮自復興，海燕自遁散。
