In [None]:
# 导入库
from transformers import AutoTokenizer,AutoModelForCausalLM,TrainingArguments,Trainer
from datasets import load_dataset
import pprint
import torch
print(torch.__version__)

In [None]:
# Model and dataset configuration.
model_name = "EleutherAI/pythia-70m"  # Hugging Face model identifier
dataset_path = "lamini/lamini_docs"  # Hugging Face dataset identifier

In [None]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal language model.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Select the compute device: CUDA when at least one GPU is visible, else CPU.
device_count = torch.cuda.device_count()
device = torch.device("cuda" if device_count > 0 else "cpu")

# Move the model onto the selected device. Previously `device` was only
# printed, so the reported device was not necessarily the one holding the
# model's weights.
base_model = base_model.to(device)

print("设备类型：",device)
print("模型结构\n",base_model)


In [None]:
# Load the "train" and "test" splits of the dataset (train first, then test).
train_dataset, test_dataset = (
    load_dataset(path=dataset_path, split=split_name)
    for split_name in ("train", "test")
)

In [None]:
# Inference helper: generate an answer for a prompt (the "predict" step).
# It is used before training to look at the baseline behaviour.
def inference(text,model,tokenizer,max_input_tokens=1000,max_output_tokens=100):
    """Generate a continuation of ``text`` with ``model`` and return only the new text.

    Args:
        text: Prompt string.
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Object exposing ``.encode(...)``, ``.decode(...)``,
            ``.pad_token_id`` and ``.eos_token_id``.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of newly generated tokens. The prompt
            length does not count against this budget.

    Returns:
        The decoded text of the generated tokens, without the prompt and
        without special tokens.
    """
    # Tokenize the prompt into a (1, prompt_length) tensor of token ids.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    input_ids = input_ids.to(model.device)
    prompt_length = input_ids.shape[-1]

    # The padding token may be unset, or may be the same as the EOS token, so
    # pass it explicitly together with an all-ones attention mask (a single
    # unpadded sequence has no positions to mask out).
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # `max_new_tokens` limits only the generated part. The previous
    # `max_length` also counted the prompt, so a prompt of `max_output_tokens`
    # tokens or more left no room for an answer.
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_output_tokens,
        pad_token_id=pad_token_id,
    )

    # Drop the prompt at the token level. Slicing the decoded string by
    # len(text) breaks when the prompt was truncated or does not decode back
    # to exactly the same characters.
    answer_tokens = generated_tokens_with_prompt[0][prompt_length:]

    # Decode only the answer and leave out special tokens such as EOS.
    generated_text_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return generated_text_answer


In [None]:
# Try the base model on the first test example before any fine-tuning.
test_text = test_dataset[0]["question"]  # "question" column of row 0
print("问题为：", test_text, sep="\n")
print("默认回答为：", test_dataset[0]["answer"], sep="\n")

print("模型的推理结果为：")
pprint.pprint(inference(test_text, base_model, tokenizer=tokenizer))


In [None]:
# Fine-tuning hyperparameters.
## Run naming: the output directory is named after the number of steps.
max_steps = 10  # upper bound on the number of training steps
num_train_epochs = 10  # requested number of full passes over the training set
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

## Training arguments, grouped by purpose.
training_args = TrainingArguments(
    # --- Output location ---
    output_dir=output_dir,  # directory where model snapshots are written
    overwrite_output_dir=False,  # do not overwrite the contents of the output directory
    save_steps=120,  # save a checkpoint every 120 steps
    save_total_limit=1,  # keep at most one checkpoint on disk

    # --- Optimisation ---
    # A small learning rate keeps each update small, so fine-tuning does not
    # move far from the pretrained weights.
    learning_rate=1.0e-5,
    optim="adafactor",  # use the Adafactor optimizer
    warmup_steps=1,  # ramp the learning rate up over the first step
    # Accumulate gradients over 4 batches before each parameter update; this
    # gives a larger effective batch size without using more memory per step.
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,  # checkpointing would save memory but cost compute

    # --- Training length ---
    # Both limits are set here. Per the Transformers documentation, a positive
    # max_steps takes precedence over num_train_epochs.
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,

    # --- Batch sizes (per device) ---
    per_device_train_batch_size=1,  # one sample per device per training step
    per_device_eval_batch_size=1,  # one sample per device per evaluation step

    # --- Evaluation and logging ---
    eval_strategy="steps",  # evaluate on a step schedule
    # NOTE(review): eval_steps (120) is larger than max_steps (10), so no
    # step-based evaluation is reached with the current values; confirm
    # whether that is intended.
    eval_steps=120,
    logging_strategy="steps",  # log on a step schedule
    logging_steps=1,  # log every step
    disable_tqdm=False,  # keep the progress bar enabled

    # --- Best-model selection ---
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # metric used to rank checkpoints
    greater_is_better=False,  # a lower eval loss is better
)

In [None]:
# Summary of the training configuration.
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length": 2048,
    },
    "dataset": {"path": dataset_path},
    "verbose": True,
}

# Estimate floating-point operations for one input of `max_length` tokens
# (batch size 1), scaled by the number of gradient-accumulation steps.
flops_probe_inputs = {
    "input_ids": torch.zeros((1, training_config["model"]["max_length"]))
}
model_flops = (
    base_model.floating_point_ops(flops_probe_inputs)
    * training_args.gradient_accumulation_steps
)

# Report the model's memory footprint and the FLOPs estimate.
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")


In [None]:
# Build the trainer (the test split is used as the evaluation set) and run
# fine-tuning.
trainer = Trainer(
    args=training_args,
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

In [None]:
# Save the fine-tuned model under "<output_dir>/final".
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)

print("模型保存至：", save_dir)

In [None]:
# Reload the fine-tuned model from the local save directory.
# local_files_only=True prevents a download from the Hugging Face Hub.
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(
    save_dir,
    local_files_only=True,
)

In [None]:
# Test the locally saved, lightly fine-tuned model.
# Author's note: training was very short, so the output is expected to differ
# little from the model that was not fine-tuned.
test_text = test_dataset[0]["question"]  # "question" column of row 0
print("问题为：", test_text, sep="\n")
print("默认回答为：", test_dataset[0]["answer"], sep="\n")

print("模型的推理结果为：")
pprint.pprint(inference(test_text, finetuned_slightly_model, tokenizer=tokenizer))

In [None]:
# Compare against a model from the Hugging Face Hub that was fine-tuned for
# much longer.
finetuned_model_id = "lamini/lamini_docs_finetuned"

# Load the longer-trained model and its tokenizer.
# NOTE(review): this rebinds the notebook-level `tokenizer` name, so cells run
# after this one use this tokenizer instead of the base model's.
finetuned_longer_model = AutoModelForCausalLM.from_pretrained(finetuned_model_id)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)

# Run inference with the longer-trained model on the first test example.
test_text = test_dataset[0]["question"]  # "question" column of row 0
print("问题为：", test_text, sep="\n")
print("默认回答为：", test_dataset[0]["answer"], sep="\n")

print("模型的推理结果为：")
pprint.pprint(inference(test_text, finetuned_longer_model, tokenizer=tokenizer))


In [None]:
# Run inference with a Qwen ("Tongyi Qianwen") model through the Lamini client.
import os

import lamini

# SECURITY: never commit an API key with the notebook. The key is read from
# the LAMINI_API_KEY environment variable instead. The key that used to be
# hardcoded in this cell should be treated as leaked and revoked.
lamini_api_key = os.environ.get("LAMINI_API_KEY")
if not lamini_api_key:
    raise RuntimeError(
        "Set the LAMINI_API_KEY environment variable before running this cell."
    )
lamini.api_key = lamini_api_key

llm = lamini.Lamini("Qwen/Qwen2-7B-Instruct")
print(llm.generate(test_text))

In [None]:
# Fine-tuning with the Lamini framework.

# Build the fine-tuning data.
def get_data():
    """Return the fine-tuning examples as a new list of dicts.

    Each dict has two keys, in this order: "input" (the question) and
    "output" (the reference answer).
    """
    qa_pairs = [
        (
            "Are there any step-by-step tutorials or walkthroughs available in the documentation?",
            "Yes, there are step-by-step tutorials and walkthroughs available in the documentation section. Here\u2019s an example for using Lamini to get insights into any python SDK: https://lamini-ai.github.io/example/",
        ),
        (
            "Is the Lamini type system similar to a python type system?",
            "Yes, the Lamini type system is built using Pydantic BaseModel.",
        ),
        (
            "Does Lamini have a limit on the number of API requests I can make?",
            "Lamini provides each user with free tokens up front.",
        ),
        (
            "What does it mean to cancel a job using the `cancel_job()` function? Can we stop the machine from doing its task?",
            "The `cancel_job()` function is used to stop a tuning job that is currently running.",
        ),
        (
            "Can Lamini automatically handle hyperparameter tuning during the customization process? How does it optimize the model for a specific use case?",
            "Lamini is capable of automatically handling hyperparameter tuning during the model customization process. It employs an intelligent algorithm to explore the hyperparameter space and find the optimal combination of values. This is done through techniques such as heuristics, grid search, random search, Bayesian optimization, or genetic algorithms. Lamini efficiently utilizes computational resources to evaluate multiple model instances with different hyperparameter configurations. It incorporates techniques like cross-validation to prevent overfitting and ensure generalization. By automating hyperparameter tuning, Lamini streamlines the machine learning workflow and improves the chances of developing high-performing models for specific use cases.",
        ),
        (
            "Can you explain the CC-BY license mentioned? What does it allow me to do with the customized language model?",
            "Lamini allows for commercial use of their LLM technology under a permissive Apache 2.0 license unless otherwise specified. You keep access and ownership of your own data, and we don't use your data to tune models for anyone else but you. For more information, please reach out to Lamini directly.",
        ),
        (
            "Can Lamini be used on a regular computer, or do I need specialized hardware or software?",
            "Lamini can be used on a regular computer without any specialized hardware or software by using the python client or REST APIs. It is designed to be easily accessible and user-friendly.  The LLM Engine itself requires GPU accelerated servers capable of running large language models.  Lamini hosts such machines that can be easily connected to, e.g. from a jupyter notebook or python program.  Lamini also offers enterprise deployments of the LLM Engine on your infrastructure.",
        ),
        (
            "Does Lamini require an internet connection to work, or can I use it offline?",
            "Lamini requires an internet connection to work, as it is a cloud-based language model. However, you can use it offline by downloading and running a local instance of the Lamini API. To learn more about this deployment process, reach out to the Lamini team.",
        ),
        (
            "Can Lamini help me with tasks like translating text or answering questions, or is it focused on generating text?",
            "Lamini is primarily focused on generating text, and it can be used for tasks like summarization and paraphrasing. Lamini can also be used to tune a LLM for tasks like translation and question answering. You\u2019re talking to a model tuned using Lamini right now!",
        ),
        (
            "What is Lamini? Is it like a robot or a computer program?",
            "Lamini is a program for the execution of LLMs called a large language model engine. It is not a robot, but rather a tool for building and executing LLMs.",
        ),
    ]
    return [{"input": question, "output": answer} for question, answer in qa_pairs]

# Fine-tune on the data above.
## Create the client for the model to tune.
llm = lamini.Lamini(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

## Fetch the examples and submit them for tuning.
data = get_data()
llm.tune(data_or_dataset_id=data)

## Run inference.
print(llm.generate("What is Lamini? Is it like a robot or a computer program?"))

In [None]:
# Preparing your own dataset for topic steering.
## Print the rows of the author's dataset whose answer contains the
## "keep the discussion relevant to Lamini" phrase, to see how off-topic
## questions are handled, then print how many such rows there are.
count = 0
for i, row in enumerate(train_dataset):
    if "keep the discussion relevant to Lamini" not in row["answer"]:
        continue
    print(i, row["question"], row["answer"])
    count += 1
print(count)