In [1]:
import os

# os.environ["WANDB_API_KEY"] = '+++++++++++'  # 将引号内的+替换成自己在wandb上的一串值
# os.environ["WANDB_MODE"] = "offline"  # 离线  （此行代码不用修改）

import json

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from swanlab.integration.huggingface import SwanLabCallback
from transformers import DataCollatorForSeq2Seq, Trainer
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
glm4_model_path = '/home/LLM_para/para_glm4'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# 加载训练集和测试集
tokenizer = AutoTokenizer.from_pretrained(glm4_model_path, use_fast=False,
                                          trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(glm4_model_path,
                                             #quantization_config=bnb_config,
                                             device_map="auto", torch_dtype=torch.bfloat16,
#                                              attn_implementation="flash_attention_2",
                                             trust_remote_code=True)
model.config.use_cache = False

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.48it/s]


In [4]:
model.enable_input_require_grads()  #  开启梯度检查点
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
#结束标识符
print(tokenizer.eos_token)
tokenizer.encode('<|endoftext|>')

<|endoftext|>


[151331, 151333, 151329]

In [5]:
#获取最大toekn数
train_path = "./mydataset2_SQL/T2Q_GLM4_SFT_train_sql.jsonl"
max_len = 0
count = 0
with open(train_path, "r") as file:
        for line in file:
            count+=1
            # 解析每一行的json数据
            example = json.loads(line)
            if count == 1:
                print(f"<|system|>\n {example['instruction']}.<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n")
            instruction = tokenizer(
        f"<|system|>\n {example['instruction']}.<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False, )
            response = tokenizer(f"{example['output']}", add_special_tokens=False)
            input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
            attention_mask = (
            instruction["attention_mask"] + response["attention_mask"] + [1])
            labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
            if len(input_ids) > max_len:
                max_len = len(input_ids)
print("max_len:",max_len)   
#1810

<|system|>
  I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. 
##instruction:CREATE TABLE `Manufacturers` (
  Code INTEGER,
  Name VARCHAR(255),
  Headquarter VARCHAR(255),
  Founder VARCHAR(255),
  Revenue REAL
);

.<|endoftext|>
<|user|>
Find the total revenue of companies of each founder.<|endoftext|>
<|assistant|>

max_len: 407


In [6]:
#  和下面的max_len 作一个合并
#  max_seq_length = 2100

def process_func(example):
    """
    将数据集进行预处理
    """
    MAX_LENGTH = 700 
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|system|>\n {example['instruction']}.<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [7]:
#  set dataset
train_path = "./mydataset2_SQL/T2Q_GLM4_SFT_train_sql.jsonl"
val_path = "./mydataset2_SQL/T2Q_GLM4_SFT_val_sql.jsonl"

train_df = pd.read_json(train_path, lines=True)
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)


val_df = pd.read_json(val_path, lines=True)
val_ds = Dataset.from_pandas(val_df)
val_dataset = val_ds.map(process_func, remove_columns=val_ds.column_names)

Map: 100%|█████████████████████████████████████████████████████████████████| 1736/1736 [00:01<00:00, 1423.34 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 178/178 [00:00<00:00, 1386.22 examples/s]


In [8]:
#  train_config
#  示例里 r 为 8
lora_r = 64
lora_alpha = 32
lora_dropout = 0.1
output_dir = "./SFT_SQL_dataset2"
num_train_epochs = 8
bf16 = True
overwrite_output_dir = True
per_device_train_batch_size = 4
per_device_eval_batch_size = 2
gradient_accumulation_steps = 16
gradient_checkpointing = True
evaluation_strategy = "steps"
learning_rate = 5e-5
weight_decay = 0.01
lr_scheduler_type = "cosine"
warmup_ratio = 0.01
max_grad_norm = 0.3
group_by_length = True
auto_find_batch_size = False
save_steps = 40
logging_steps = 50
load_best_model_at_end= False
packing = False
save_total_limit=4
neftune_noise_alpha=5
# report_to="wandb"
max_seq_length = 700

In [9]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=[
       "query_key_value", "dense", "dense_h_to_4h", "activation_func", "dense_4h_to_h"
    ],
    task_type=TaskType.CAUSAL_LM,
)

In [10]:
swanlab_callback = SwanLabCallback(
    project="GLM4-SFT_T2QSQL_CN_val_train",
    experiment_name="GLM4-9B-Chat",
    description="使用智谱GLM4-9B-Chat模型在spider数据集上微调 生成SQL-CN。",
    config={
        "model": "/home/LLM_para/para_glm4",
        "dataset": "T2Q_GLM4_SFT_train_SQL.jsonl",
    },
)

In [11]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    load_best_model_at_end=load_best_model_at_end,
    per_device_train_batch_size=per_device_train_batch_size,
    evaluation_strategy=evaluation_strategy,
    max_grad_norm = max_grad_norm,
    auto_find_batch_size = auto_find_batch_size,
    save_total_limit = save_total_limit,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    bf16=bf16,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",
    neftune_noise_alpha= neftune_noise_alpha
)

In [12]:
# response_template = "### Response:"
# collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
# collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset['train'],
#     eval_dataset=dataset['validation'],
#     peft_config=peft_config,
#     data_collator=collator,
#     args=training_arguments,
#     max_seq_length=max_seq_length,
#     packing=packing
# )
model = get_peft_model(model, peft_config)
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train()

[1m[34mswanlab[0m[0m: swanlab version 0.3.20 is available!  Upgrade: `pip install -U swanlab`
[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.19                                  
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/home/code/chat_SQL/exp1/GLM4/swanlog/run-20240915_115339-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mwinhong[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mGLM4-9B-Chat_Sep15_11-53-39[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch /home/code/chat_SQL/exp1/GLM4/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@winhong/GLM4-SFT_T2QSQL_CN_val_train[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@winhong/GLM4-SFT_T2QSQL_CN_val_train/runs/xvaqwyyr17ai6dzrnz20r[0m[0m


Step,Training Loss,Validation Loss
50,0.8113,0.095398
100,0.0666,0.054755
150,0.0245,0.048457
200,0.0151,0.047991




[1m[33mswanlab[0m[0m: Step 50 on key train/epoch already exists, ignored.




[1m[33mswanlab[0m[0m: Step 100 on key train/epoch already exists, ignored.




[1m[33mswanlab[0m[0m: Step 150 on key train/epoch already exists, ignored.




[1m[33mswanlab[0m[0m: Step 200 on key train/epoch already exists, ignored.




TrainOutput(global_step=216, training_loss=0.21335852642854056, metrics={'train_runtime': 1004.1453, 'train_samples_per_second': 13.831, 'train_steps_per_second': 0.215, 'total_flos': 1.4420390486114304e+17, 'train_loss': 0.21335852642854056, 'epoch': 7.9631336405529956})

In [14]:
# output_dir = os.path.join("./", "final_checkpoint_newS_SFT_SQL_CN_TV")
output_dir = os.path.join("./", "final_checkpoint_dataset2_SQL")
trainer.model.save_pretrained(output_dir)



In [None]:
def predict(messages, model, tokenizer):
#     device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    print(response)
     
    return response

In [None]:
# 用测试集的前10条，测试模型
test_df = val_df[:10]
print(test_df)
print(1)

In [None]:
# import swanlab
test_text_list = []
for index, row in test_df.iterrows():
    instruction = row['instruction']
    input_value = row['input']
    
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"}
    ]

    response = predict(messages, model, tokenizer)
#     print(response)
#     messages.append({"role": "assistant", "content": f"{response}"})
#     result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
#     test_text_list.append(swanlab.Text(result_text, caption=response))
    
# swanlab.log({"Prediction": test_text_list})
# swanlab.finish()