<a href="https://colab.research.google.com/github/hengjiUSTC/learn-llm/blob/main/inference_qlora_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets trl sentencepiece protobuf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━

In [2]:
def test_model(model, tokenizer, prompt, max_new_tokens=400, top_p=0.9, temperature=0.7):
    """
    Generates text using a provided model and tokenizer.

    Args:
    - model: The language model to use for generation.
    - tokenizer: The tokenizer associated with the model.
    - prompt: The prompt to feed to the model.
    - max_new_tokens: The maximum number of new tokens to generate. Default is 400.
    - top_p: Nucleus sampling's cumulative probability cutoff. Default is 0.9.
    - temperature: Controls randomness in generation. Lower values make text less random. Default is 0.7.

    Returns:
    A string containing the generated text.
    """

    # Tokenize the prompt
    tmp_eos = tokenizer.add_eos_token
    tokenizer.add_eos_token = False
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    tokenizer.add_eos_token = tmp_eos

    # Generate the output
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        do_sample=True,
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode and clean up the output
    generated_output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]
    print(f"Prompt:\n{prompt}\n")
    print(f"Generated output:\n{generated_output}")
    return

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from transformers import TrainingArguments

# Hub id of the QLoRA adapter to load on top of its base model.
PEFT_MODEL = 'HenryJJ/tangshi-llama2-7b-chat-qlora'

# Vocabulary id repurposed as the padding token (the "_***" entry).
# Background for choosing an existing vocabulary entry as pad:
#   https://github.com/huggingface/transformers/issues/22794#issuecomment-1616258519
#   https://www.reddit.com/r/LocalLLaMA/comments/15hz7gl/my_finetuning_based_on_llama27bchathf_model/
PAD_TOKEN_ID = 18610

# Load the base weights in 4-bit NF4 with double quantization and a
# bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The adapter config records which base checkpoint it was trained against.
config = PeftConfig.from_pretrained(PEFT_MODEL)
base_model_name = config.base_model_name_or_path

# Quantized base model first, then the adapter wrapped around it.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    return_dict=True,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, PEFT_MODEL)

# Slow tokenizer, configured to append EOS to encodings by default.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    use_fast=False,
    add_eos_token=True,
    trust_remote_code=True,
)

# Re-register the special tokens from the ids stored in the model config.
# The unk token is looked up from the config's pad id, falling back to the
# tokenizer's pad id when the config holds -1.
model_cfg = model.config
eos_token_text = tokenizer.convert_ids_to_tokens(model_cfg.eos_token_id)
bos_token_text = tokenizer.convert_ids_to_tokens(model_cfg.bos_token_id)
if model_cfg.pad_token_id != -1:
    unk_lookup_id = model_cfg.pad_token_id
else:
    unk_lookup_id = tokenizer.pad_token_id
unk_token_text = tokenizer.convert_ids_to_tokens(unk_lookup_id)

special_tokens = {
    "eos_token": eos_token_text,
    "bos_token": bos_token_text,
    "unk_token": unk_token_text,
}
tokenizer.add_special_tokens(special_tokens)

tokenizer.pad_token_id = PAD_TOKEN_ID
tokenizer.padding_side = "right"

# Show the final tokenizer configuration as the cell output.
tokenizer

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

LlamaTokenizer(name_or_path='NousResearch/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '▁***'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [5]:
# Request: system message "you are a Tang-poetry assistant, help the user
# write a Tang poem matching the requirements"; author Li Bai; tags
# yuefu / praise / "modern song lyrics".
# NOTE(review): the prompt spells out "<s>" literally and has no newline
# after "[/INST]", unlike the other prompt cells — confirm which variant
# matches the fine-tuning format.
prompt = (
    "<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>\n"
    "作者:李白\n"
    "标签:乐府;赞美;近代曲辞\n"
    "[/INST]"
)
# The trailing ';' keeps the cell output limited to what test_model prints.
test_model(model, tokenizer, prompt);



Prompt:
<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李白
标签:乐府;赞美;近代曲辞
[/INST]

Generated output:
辭 清平調 二 
秦時謠壑壑，漢時歌誦誦。
今人如吳中郎，吳中郎何爲來。
歌辭傳人家，讀應感遊子。


In [6]:
# Request: same Tang-poetry system message; author Li Shangyin; tags
# Yellow River / object-chanting / lyrical / drum-and-pipe song lyrics /
# yuefu / object-chanting poem.
# NOTE(review): this variant has no literal "<s>" prefix and ends with a
# newline after "[/INST]" — confirm which form matches the fine-tuning format.
prompt = (
    "[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>\n"
    "作者:李商隱\n"
    "标签:黄河;咏物;抒情;鼓吹曲辞;乐府;咏物诗\n"
    "[/INST]\n"
)
# The trailing ';' keeps the cell output limited to what test_model prints.
test_model(model, tokenizer, prompt);

Prompt:
[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:李商隱
标签:黄河;咏物;抒情;鼓吹曲辞;乐府;咏物诗
[/INST]


Generated output:
玉韻 一 
玉韻在長河，輕駈吹螢紋。
玉韻一絮聲，一聲悲涼奏。
誰見玉韻停，將坐覽黃河。


In [7]:
# Request: same Tang-poetry system message; author Du Fu; tags
# yuefu / praise / "modern song lyrics".
# NOTE(review): this variant spells out "<s>" literally and ends with a
# newline after "[/INST]" — confirm which form matches the fine-tuning format.
prompt = (
    "<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>\n"
    "作者:杜甫\n"
    "标签:乐府;赞美;近代曲辞\n"
    "[/INST]\n"
)
# The trailing ';' keeps the cell output limited to what test_model prints.
test_model(model, tokenizer, prompt);

Prompt:
<s>[INST] <<SYS>>你是一个唐诗助手,帮助用户写一首对应要求的唐诗<</SYS>>
作者:杜甫
标签:乐府;赞美;近代曲辞
[/INST]


Generated output:
辭 長干曲四首 二 
輞川長干曲四首，皆從孔融所著。
昔時曹植採藥，終南山雙遭乘。
君不見復臨湘，將軍尚在絳州。
