In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

## 1. 加载数据集

In [2]:
from datasets import load_dataset

train_dataset = load_dataset("gem/viggo", split="train")
eval_dataset = load_dataset("gem/viggo", split="validation")
test_dataset = load_dataset("gem/viggo", split="test")

print(train_dataset)
print(eval_dataset)
print(test_dataset)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 3.14k/3.14k [00:00<00:00, 4.40MB/s]
Downloading metadata: 100%|██████████| 3.81k/3.81k [00:00<00:00, 4.82MB/s]
Downloading readme: 100%|██████████| 24.5k/24.5k [00:00<00:00, 674kB/s]
Downloading data: 100%|██████████| 1.37M/1.37M [00:00<00:00, 25.2MB/s]
Downloading data: 100%|██████████| 187k/187k [00:00<00:00, 22.3MB/s]
Downloading data: 100%|██████████| 269k/269k [00:00<00:00, 22.2MB/s]
Downloading data: 100%|██████████| 11.2k/11.2k [00:00<00:00, 9.43MB/s]
Downloading data: 100%|██████████| 25.1k/25.1k [00:00<00:00, 706kB/s]
Downloading data: 100%|██████████| 64.3k/64.3k [00:00<00:00, 222kB/s]
Downloading data: 100%|██████████| 123k/123k [00:00<00:00, 271kB/s] 
Downloading data: 100%|████████

Dataset({
    features: ['gem_id', 'meaning_representation', 'target', 'references'],
    num_rows: 5103
})
Dataset({
    features: ['gem_id', 'meaning_representation', 'target', 'references'],
    num_rows: 714
})
Dataset({
    features: ['gem_id', 'meaning_representation', 'target', 'references'],
    num_rows: 1083
})


In [3]:
# 查看样本

train_dataset[0]

{'gem_id': 'viggo-train-0',
 'meaning_representation': 'inform(name[Dirt: Showdown], release_year[2012], esrb[E 10+ (for Everyone 10 and Older)], genres[driving/racing, sport], platforms[PlayStation, Xbox, PC], available_on_steam[no], has_linux_release[no], has_mac_release[no])',
 'target': "Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac.",
 'references': ["Dirt: Showdown from 2012 is a sport racing game for the PlayStation, Xbox, PC rated E 10+ (for Everyone 10 and Older). It's not available on Steam, Linux, or Mac."]}

## 2. 加载基模型

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# llama-3-8B
base_model_id = "/root/autodl-tmp/model/Meta-Llama-3-8B"

# 量化配置
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_type = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                             quantization_config = bnb_config)

Unused kwargs: ['bnb_4bit_use_double_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.69s/it]


## 3. 加载 Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length = 512,
    padding_side = "left",
    add_eos_token = True
)

tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def tokenize(prompt):
    '''分词器'''
    result = tokenizer(prompt,
                       truncation = True,
                       max_length = 512,
                       padding = "max_length")

    result["labels"] = result["input_ids"].copy()
    return result

In [7]:
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
                    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
                    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']
                    
                    ### Target sentence:
                    {data_point["target"]}
                    
                    ### Meaning representation:
                    {data_point["meaning_representation"]}
                  """
    return tokenize(full_prompt)

## 4. 对train和eval数据集进行tokenzier

In [8]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)

tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map: 100%|██████████| 5103/5103 [00:02<00:00, 1924.83 examples/s]
Map: 100%|██████████| 714/714 [00:00<00:00, 1946.46 examples/s]


In [9]:
# 查看样本

print(tokenized_train_dataset[1]["input_ids"])

[128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,

In [10]:
print(len(tokenized_train_dataset[1]["input_ids"]))

512


## 5. 基于 base model 进行测试

In [11]:
# 查看 test 数据集样本

print("目标语句: \n", test_dataset[1]["target"])

print("意义表示: \n", test_dataset[1]["meaning_representation"])


目标语句: 
 Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?
意义表示: 
 verify_attribute(name[Little Big Adventure], rating[average], has_multiplayer[no], platforms[PlayStation])


In [12]:
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

In [13]:
# 重新初始化 tokenizer，这样它就不会添加 padding 或 eos token

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token = True,
)

model_input = eval_tokenizer(eval_prompt, return_tensors = "pt").to("cuda")

model.eval()

with torch.no_grad():
    # 模型推理
    result = model.generate(**model_input, max_new_tokens=256)
    # 解码
    result = eval_tokenizer.decode(result[0], skip_special_tokens=True)
    print(result)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2024-05-24 09:41:33.731816: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-24 09:41:33.772823: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform','request', 'give_opinion', 'confirm','verify_attribute','suggest','request_explanation','recommend','request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date','release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release','specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
{'type': 'inform', 'attributes': [{'name': 'opinion', 'value': 'true'}], 'attributes_values': [{'name': 'opinion', 'value': 'true'}], 'at

## 6. 配置 LoRA

In [14]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [15]:
def print_trainable_parameters(model):
    '''计算训练的参数量'''
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"训练参数量：{trainable_params} || 所有参数量：{all_param} || 可训练参数量比例：{100 * trainable_params / all_param}"
    )

In [16]:
# 打印模型

# print(model)

In [17]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias = "none",
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM",
)

model = get_peft_model(model, config)

print_trainable_parameters(model)

训练参数量：22030336 || 所有参数量：4562630656 || 可训练参数量比例：0.4828428523143645


In [18]:
# 打印模型

# print(model)

## 7. wandb 配置

In [19]:
# 需要在WandB官网注册账号
import wandb

wandb.login(key="11a0ff012b65b101fdf6613d7c21f66a5960e623")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtommytang[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [20]:
run = wandb.init(
    project="llama-3-8B-QLoRA",
    job_type = "training",
)

## 8. 模型训练

In [21]:
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

In [22]:
import transformers
from datetime import datetime

run_name = "llama-3-8B-QLoRA"
output_dir = "./" + run_name

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    # 指定要训练的模型
    model = model,
    # 指定训练数据集
    train_dataset = tokenized_train_dataset,
    # 指定验证数据集
    eval_dataset = tokenized_val_dataset,
    # 训练参数配置
    args = transformers.TrainingArguments(
        output_dir = output_dir, # 训练输出的目录
        warmup_steps = 5, # 训练过程中的预热步骤数 
        # 解释：
        # warmup_steps ：在训练的初始阶段，学习率从一个较低的值逐步增加到设定的学习率。
        #                预热步骤的作用是避免模型在一开始就收到较大的梯度更新，从而有助于稳定训练过程。
        per_device_train_batch_size = 4, # 训练批次大小
        gradient_checkpointing = True, # 是否开启梯度检查点以节省内存
        # 解释：
        # gradient_checkpointing ： 这是一种技术，允许在训练过程中节省显存。
        #                          具体来说，它会在前向传播时保存某些中间结果，而不是所有中间结果，从而减少显存占用量。然后在反向传播时，必要时重新计算这些中间结果。
        gradient_accumulation_steps = 4, # 梯度累积的步数，实际 batch size = per_device_train_batch_size * gradient_accumulation_steps
        max_steps = 500, # 最大训练步数，1000,5000等
        learning_rate = 2.5e-5, # 学习率
        logging_steps = 50, # 每 50 步记录一次日志
        bf16 = True, # 使用 bfloat16 精度进行训练
        optim = "paged_adamw_8bit", # 使用8-bit的AdamW优化器
        # 解释：
        # paged_adamw_8bit ：这是一种优化器的实现，将参数和梯度压缩到8-bit表示，以减少内存和计算需求。
        #                    Paged表示则是指优化器分页处理数据，以进一步优化内存使用。
        logging_dir = "./logs", # 日志存储目录
        save_strategy = "steps", # 模型保存策略：每隔一定步数保存一次
        save_steps = 50, # 每50步保存一次模型检查点
        evaluation_strategy = "steps", # 评估策略：每隔一定步数进行评估
        eval_steps = 50, # 每50步进行一次评估
        do_eval = True, # 是否在训练结束后进行评估
        report_to = "wandb",
        run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"  # W&B 运行名称，包含当前时间戳
    ),
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), # 数据整理器
)

model.config.use_cache = False # 禁用缓存以避免警告。推理时请重新启用


max_steps is given, it will override any value given in num_train_epochs


In [23]:
trainer.train() # 开始训练



Step,Training Loss,Validation Loss
50,1.2584,0.391524
100,0.3422,0.295344
150,0.2611,0.240467
200,0.2327,0.226517
250,0.2206,0.217926
300,0.2092,0.21217
350,0.2051,0.208268
400,0.1982,0.206516
450,0.1954,0.204782
500,0.2015,0.204114




TrainOutput(global_step=500, training_loss=0.33244283866882324, metrics={'train_runtime': 3845.3729, 'train_samples_per_second': 2.08, 'train_steps_per_second': 0.13, 'total_flos': 1.8495932347082342e+17, 'train_loss': 0.33244283866882324, 'epoch': 1.567398119122257})

## 8. 基于base model和LoRA model进行推理

In [1]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "/root/autodl-tmp/model/Meta-Llama-3-8B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config = bnb_config,
    device_map = "auto",
    trust_remote_code = True,
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token = True,
    trust_remote_code = True,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# 加载 QLoRA adapter

from peft import PeftModel

best_qlora_checkpoint = "/root/autodl-tmp/project/2_QLoRA/notebook/llama-3-8B-QLoRA/checkpoint-500"

ft_model = PeftModel.from_pretrained(base_model,
                                     best_qlora_checkpoint)

In [3]:
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2024-05-24 11:05:37.558035: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-24 11:05:37.642115: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform','request', 'give_opinion', 'confirm','verify_attribute','suggest','request_explanation','recommend','request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date','release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release','specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
verify_attribute(name[Little Big Adventure], rating[average], has_multiplayer[no], platforms[PlayStation])

