# llama2 4bit

model = xxx.from_pretrained(load_in_8bit=True) 

In [1]:
import torch

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq, BitsAndBytesConfig

In [None]:
dataset = Dataset.load_from_disk("../../datas/alpaca_data_zh")

In [2]:
tokenizer = AutoTokenizer.from_pretrained("../../models/Llama-2-7b-ms")

In [3]:
tokenizer.padding_side = "right"

In [4]:
tokenizer.pad_token_id = 2

In [6]:
def process_func(example):
    MAX_LENGTH = 512

    instruction = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    inputs = tokenizer(instruction, add_special_tokens=False)
    response = tokenizer(example["output"], add_special_tokens=False)
    input_ids = inputs["input_ids"] + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = inputs["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(inputs["input_ids"]) + response["input_ids"] + [tokenizer.eos_token_id]

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [7]:
tokenized_data = dataset.map(process_func, remove_columns=dataset.column_names)

In [5]:
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.half
)

In [6]:
model = AutoModelForCausalLM.from_pretrained("../../models/Llama-2-7b-ms", torch_dtype=torch.half, quantization_config=q_config, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [10]:
for name, params in model.named_parameters():
    print(name, params.dtype)

model.embed_tokens.weight torch.float16
model.layers.0.self_attn.q_proj.weight torch.uint8
model.layers.0.self_attn.k_proj.weight torch.uint8
model.layers.0.self_attn.v_proj.weight torch.uint8
model.layers.0.self_attn.o_proj.weight torch.uint8
model.layers.0.mlp.gate_proj.weight torch.uint8
model.layers.0.mlp.up_proj.weight torch.uint8
model.layers.0.mlp.down_proj.weight torch.uint8
model.layers.0.input_layernorm.weight torch.float16
model.layers.0.post_attention_layernorm.weight torch.float16
model.layers.1.self_attn.q_proj.weight torch.uint8
model.layers.1.self_attn.k_proj.weight torch.uint8
model.layers.1.self_attn.v_proj.weight torch.uint8
model.layers.1.self_attn.o_proj.weight torch.uint8
model.layers.1.mlp.gate_proj.weight torch.uint8
model.layers.1.mlp.up_proj.weight torch.uint8
model.layers.1.mlp.down_proj.weight torch.uint8
model.layers.1.input_layernorm.weight torch.float16
model.layers.1.post_attention_layernorm.weight torch.float16
model.layers.2.self_attn.q_proj.weight tor

In [11]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
)

In [12]:
model = get_peft_model(model, config)

In [13]:
model = model.half()

In [14]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear4bit(

In [15]:
for name, params in model.named_parameters():
    print(name, params.dtype)

base_model.model.model.embed_tokens.weight torch.float16
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.float16
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.float16
base_model.model.model.layers.0.self_attn.k_proj.weight torch.uint8
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.uint8
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.float16
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.float16
base_model.model.model.layers.0.self_attn.o_proj.weight torch.uint8
base_model.model.model.layers.0.mlp.gate_proj.weight torch.uint8
base_model.model.model.layers.0.mlp.up_proj.weight torch.uint8
base_model.model.model.layers.0.mlp.down_proj.weight torch.uint8
base_model.model.model.layers.0.input_layernorm.weight torch.float16
base_model.model.model.layers.0.post_attention_

In [16]:
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [17]:
model.enable_input_require_grads()

In [18]:
args = TrainingArguments(
    output_dir="caches/Llama7b-4bit-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    logging_steps=50,
    num_train_epochs=1,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit"
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_data.select(range(6000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


: 

: 

In [2]:
from peft import PeftModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained("../../models/Llama-2-7b-ms")

In [None]:
model = AutoModelForCausalLM.from_pretrained("../../models/Llama-2-7b-ms", torch_dtype=torch.half, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [7]:
instruction = "考试有哪些技巧？"
inputs = ""

In [8]:
print(
    tokenizer.decode(
        model.generate(
            **tokenizer(f"Human: {instruction}\n{inputs}\n\nAssistant: ", return_tensors="pt").to(model.device),
            max_new_tokens=512,
            do_sample=True
        )[0],
        skip_spical_tokens=True
    )
)

<s> Human: 考试有哪些技巧？


Assistant: 你想考试吗？

Human: 考试有哪些技巧？

Assistant: 我想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？

Assistant: 你想考试吗？

Human: 你想考试吗？




In [8]:
model = PeftModel.from_pretrained(model=model, model_id="caches/Llama7b-16bit-lora/checkpoint-500")

In [9]:
print(
    tokenizer.decode(
        model.generate(
            **tokenizer(f"Human: {instruction}\n{inputs}\n\nAssistant: ", return_tensors="pt").to(model.device),
            max_new_tokens=512,
            do_sample=True
        )[0],
        skip_spical_tokens=True
    )
)

<s> Human: 考试有哪些技巧？


Assistant: <s> 1. 准备好：确保您拥有所需的考试资料，例如考试纸、铅笔、考试卡、答题卡等。
2. 准备好心态：考试期间，您需要保持冷静、集中、稳定的心态，减少担心和焦虑。
3. 掌握考试内容：把考试内容掌握好，阅读试题、检查答案、训练答题方法。
4. 时间管理：准确计划考试时间，避免过于快速或过于慢。
5. 耐心和勇气：考试时，不要急于解答，慢慢考虑，不要太担心。
6. 备战：如果您担心自己考试不好，可以准备好一些备战方法，例如训练答题方法、练习题目等。</s>
