In [1]:
import subprocess
import os

def parse_env_lines(text):
    """Parse ``NAME=value`` lines (the format printed by ``env``) into a dict.

    Args:
        text: Multi-line string; each line is expected to look like ``NAME=value``.

    Returns:
        dict mapping each variable name to its value. Lines without ``=`` and
        lines with an empty name are skipped. Only the first ``=`` separates
        name from value, so values may themselves contain ``=``.
    """
    parsed = {}
    for line in text.splitlines():
        if '=' in line:
            name, value = line.split('=', 1)
            if name:
                parsed[name] = value
    return parsed


# Source /etc/network_turbo (presumably the hosting platform's proxy setup
# script -- confirm) in a bash subshell, then copy every environment variable
# whose line matches "proxy" into this kernel's environment.
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
if result.returncode != 0:
    # Previously a missing script or an empty grep result was silently ignored.
    print(f"warning: proxy setup command exited with code {result.returncode}: {result.stderr.strip()}")
output = result.stdout
os.environ.update(parse_env_lines(output))

In [2]:
# bitsandbytes：专为量化设计的库，重点在于减少大语言模型（尤其是在GPU上）的内存占用。
# peft：用于将LoRA适配器集成到大语言模型（LLMs）中。
# trl：该库包含一个SFT（监督微调）类，用于辅助微调模型。
# accelerate和xformers：这些库用于提高模型的推理速度，从而优化其性能。
# wandb：该工具作为一个监控平台，用于跟踪和观察训练过程。
# datasets：与Hugging Face一起使用，该库便于加载数据集。

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os
import wandb


In [3]:
# Sanity check: is a CUDA device visible to PyTorch?
torch.cuda.is_available()

True

In [4]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

## 1. 加载模型和Tokenizer

In [5]:
# Path of the pretrained base model (local directory).
model_name = "./model/Meta-Llama-3-8B"

# Path of the fine-tuning dataset (local directory).
dataset_name = "./guanaco-llama3-1k"


In [6]:
# Load the pretrained model (4-bit quantized) and its tokenizer.

# Quantization settings, see
# https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit format
    bnb_4bit_quant_type="nf4",             # 4-bit quantization type: nf4
    bnb_4bit_use_double_quant=False,       # double quantization disabled
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

# Load the quantized model onto device 0 (usually the first GPU), then run
# PEFT's preparation helper for k-bit training on it.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map={"": 0},
        quantization_config=bnb_config,
    )
)

# Tokenizer: the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Ask the tokenizer to append the EOS token automatically.
tokenizer.add_eos_token = True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Load the training split of the dataset and preview the text of its first record.
dataset = load_dataset(dataset_name, split="train")

dataset[0]["text"]

'<|start_header_id|>user<|end_header_id|>{{Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo?}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{{Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país.}}<|eot_id|>'

## 2. wandb配置

In [8]:
# Experiment monitoring with Weights & Biases (requires a registered wandb account).
#
# The API key is read from the WANDB_API_KEY environment variable instead of
# being embedded in the notebook. When the variable is unset, key=None is
# passed and wandb falls back to its own credential lookup / login prompt.
#
# SECURITY: this cell previously contained a literal API key. Treat that key
# as leaked and revoke/rotate it in the wandb account settings.

wandb.login(key=os.environ.get("WANDB_API_KEY"))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m1341355826[0m ([33m1341355826-shu[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
# Start a wandb run for this training job; the handle is kept in `run`.
run = wandb.init(
    project="finetune llama-3-8B",
    job_type = "training",
)

In [10]:
# Count trainable vs. total parameters.

def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Iterates over ``model.named_parameters()``, summing ``numel()`` for every
    parameter and, separately, for those with ``requires_grad`` set. The
    result is printed as a single line including the trainable percentage.

    A model without any parameters reports 0.00% instead of raising
    ZeroDivisionError.

    NOTE(review): the run recorded in this notebook reports ~4.5e9 total
    parameters for an 8B model, presumably because 4-bit quantized weights
    are stored packed -- confirm before quoting the total as a model size.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a torch.nn.Module).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not divide by zero.
    percent = 100 * (trainable_params / all_param) if all_param else 0.0
    print(
        f"训练参数量 : {trainable_params} || 总的参数量 : {all_param} || 训练参数量占比%: {percent:.2f}"
    )

## 3. LoRA与训练超参配置

In [11]:
# LoRA adapter configuration.
#
# The LoRA update is applied as: weight += (lora_B @ lora_A) * scaling, with
# scaling = lora_alpha / r -- a larger scaling gives the adapter more influence.
# Rule of thumb used here: set lora_alpha to twice the rank.
#
# Candidate target modules:
# ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj", "down_proj", "embed_tokens", "lm_head"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj"],
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    bias="none",
)

In [12]:
# Training hyperparameters.

training_arguments = TrainingArguments(
    # Output and reporting
    output_dir="./output",
    save_steps=100,        # save a checkpoint every 100 steps
    logging_steps=30,
    report_to="wandb",     # alternative: tensorboard

    # Schedule length
    num_train_epochs=5,
    max_steps=-1,          # -1: no explicit cap on the number of steps

    # Batch sizing: gradients are accumulated over 2 steps before each update,
    # giving a larger effective batch when GPU memory is limited.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    group_by_length=True,  # group samples of similar length for efficiency

    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",  # linear learning-rate schedule
    warmup_ratio=0.3,            # first 30% of the steps ramp the learning rate up
    weight_decay=0.001,          # weight-decay coefficient (regularization)
    max_grad_norm=0.3,           # gradient clipping threshold

    # Mixed precision is disabled in both variants
    fp16=False,
    bf16=False,
)

## 4. 模型微调

In [13]:
# Build the supervised fine-tuning trainer: the model, the LoRA config, and
# the raw-text dataset (column "text") are handed to SFTTrainer; sequence
# packing is turned off.

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [14]:
# Start training; the returned TrainOutput is displayed as the cell result.

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
30,2.1667
60,2.0739
90,1.8913
120,1.6769
150,1.6088
180,1.6354
210,1.5692
240,1.6067
270,1.576
300,1.5336




TrainOutput(global_step=625, training_loss=1.6413387390136718, metrics={'train_runtime': 4713.12, 'train_samples_per_second': 1.061, 'train_steps_per_second': 0.133, 'total_flos': 9.332874822057984e+16, 'train_loss': 1.6413387390136718, 'epoch': 5.0})

In [15]:
# Reuse the PEFT-wrapped model that SFTTrainer built and trained.
#
# The original code called get_peft_model(model, peft_config) here. SFTTrainer
# was constructed with peft_config, so it has already injected the LoRA layers
# into this same model object. Wrapping it a second time after training
# re-injects the "default" adapter into those layers, which (in the PEFT
# versions this notebook targets) re-initialises the LoRA weights -- the
# adapter saved in the next section would then no longer carry the training
# result.
model = trainer.model

# Report trainable vs. total parameter counts.
print_trainable_parameters(model)

训练参数量 : 851968 || 总的参数量 : 4541452288 || 训练参数量占比%: 0.02


## 5. 保存模型

In [16]:
# Save the fine-tuned model to ./model/lora_model (loaded again in the merge
# section via PeftModel.from_pretrained).

trainer.model.save_pretrained("./model/lora_model")

# Close the wandb run started earlier.
wandb.finish()

# Training logged that use_cache was switched off because of gradient
# checkpointing; turn it back on before generating.
model.config.use_cache = True

# Switch to evaluation mode; the module repr is displayed as the cell result.
model.eval()

VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
train/global_step,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
train/grad_norm,▇█▇▆▅▃▃▃▂▂▂▁▁▁▂▁▁▂▁▁
train/learning_rate,▂▃▄▆▇██▇▇▆▆▅▅▄▄▃▃▂▂▁
train/loss,█▇▅▃▂▂▂▂▂▁▁▂▁▁▁▂▂▁▁▁

0,1
total_flos,9.332874822057984e+16
train/epoch,5.0
train/global_step,625.0
train/grad_norm,0.20779
train/learning_rate,1e-05
train/loss,1.5378
train_loss,1.64134
train_runtime,4713.12
train_samples_per_second,1.061
train_steps_per_second,0.133


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

## 6. 模型推理

In [17]:
# Streaming inference helper. It uses the notebook-level `model` and
# `tokenizer`; when the cells are run in order, `model` is the PEFT-wrapped
# model at this point, not the plain base model.

def stream(user_input, max_new_tokens=128, device="cuda:0"):
    """Generate a reply to ``user_input`` and stream it to stdout.

    The input is wrapped in an instruction/response prompt template,
    tokenized with the global ``tokenizer``, moved to ``device`` and passed to
    the global ``model``'s ``generate``. Tokens are printed as they are
    produced through a ``TextStreamer`` (prompt and special tokens skipped).

    NOTE(review): the training sample displayed earlier in this notebook uses
    ``<|start_header_id|>...<|eot_id|>`` markers, not this
    ``### Instruction`` / ``### Response`` template -- confirm the prompt
    format matches what the adapter was trained on.

    Args:
        user_input: The instruction text; surrounding whitespace is stripped.
        max_new_tokens: Upper bound on generated tokens (default 128, the
            previously hard-coded value).
        device: Device the tokenized inputs are moved to (default "cuda:0",
            the previously hard-coded value).

    Returns:
        None. The generated text is only streamed to stdout.
    """
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [18]:
# Smoke-test generation with a sample instruction.
stream("Tell me something about the Great Wall.")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, and other materials, generally built along an east-to-west line across the historical northern borders of China. The oldest parts date from the 7th century BC, but most of them were built between the 2nd century BC and 16th century. Sections near Beijing and Hebei were built from 1400 BC to 1100 BC. The Great Wall is the largest construction project ever completed on the planet. It is estimated that 10 million people died in its construction. The Great Wall was originally conceived as a military defense


In [19]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb

## 7. 模型合并

In [20]:
# Path of the pretrained base model (same local directory as used for training).
model_name = "./model/Meta-Llama-3-8B"

In [21]:
# Merge the base model with the LoRA adapter.
# https://huggingface.co/docs/trl/main/en/use_model#use-adapters-peft
#
# Step 1: reload the base model in float16 (no quantization config is passed
# here) onto device 0.

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [22]:
# Attach the adapter saved under ./model/lora_model to the reloaded base model.
new_model = PeftModel.from_pretrained(base_model, "./model/lora_model")

In [23]:
# Merge the adapter into the base model; the result is used for generation below.

merged_model = new_model.merge_and_unload()

In [24]:
# Reload the tokenizer for the merged model; the EOS token doubles as the
# padding token and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [25]:
# Run the same sample instruction through the merged model, streaming the reply.
user_input = "Tell me something about the Great Wall."
device = "cuda:0"

# Prompt template: system preamble, instruction marker, user text, response marker.
system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
B_INST = "### Instruction:\n"
E_INST = "### Response:\n"
prompt = system_prompt + B_INST + user_input.strip() + "\n\n" + E_INST

inputs = tokenizer([prompt], return_tensors="pt").to(device)
# Print tokens as they are generated, hiding the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = merged_model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=128,
    num_return_sequences=1,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The Great Wall of China is a series of stone and earthen fortifications in northern China, built originally to protect the northern borders of the Chinese Empire against intrusions by various nomadic groups. It was built, rebuilt, and maintained between the 5th century BC and the 16th century. Several walls were being built as early as the 7th century BC; these, later joined together and made bigger, stronger, and unified are now collectively referred to as the Great Wall. Especially famous is the wall built between 220–206 BC by the first Emperor of China, Qin Shi Huang. Little of that wall remains.
