In [None]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# 加载预训练的tokenizer和model
model_name = "/data/modelscope/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


# 示例数据
data = [
    {
        "prefix": "今天天气很好，我和朋友去公园散步，我们看到了",
        "middle": "很多人在那里玩耍。有的人在树荫下乘凉，有的人在野地里烧烤，还有的人在捉迷藏，看谁先看见金鱼。",
        "suffix": "太阳光耀着大地，万物生长得如此之茂盛，我和朋友去公园散步，",
    },
    {
        "prefix": "昨天我在图书馆借了一本书，书名是",
        "middle": "《活着》，",
        "suffix": "作者是余华。这本书非常值得一读。",
    },
]

# 将数据转换为Dataset对象
dataset = Dataset.from_list(data)


def preprocess_function(example):

    input_sequence = f"<__PREFIX__>{example['prefix']}\n\n<__MIDDLE__>[MASK]\n\n<__SUFFIX__>{example['suffix']}"
    target_sequence = f"<__PREFIX__>{example['prefix']}\n\n<__MIDDLE__>{example['middle']}\n\n<__SUFFIX__>"

    model_inputs = tokenizer(input_sequence, max_length=256, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_sequence, max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = dataset.map(preprocess_function, batched=False)

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# 定义Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# 开始微调
trainer.train()

# 保存微调后的模型
model.save_pretrained("./fine_tuned_qwen_2.5-0.5B")
tokenizer.save_pretrained("./fine_tuned_qwen_2.5-0.5B")

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,9.158321
2,No log,3.34331
3,No log,1.816067


('./fine_tuned_qwen_2.5-0.5B/tokenizer_config.json',
 './fine_tuned_qwen_2.5-0.5B/special_tokens_map.json',
 './fine_tuned_qwen_2.5-0.5B/vocab.json',
 './fine_tuned_qwen_2.5-0.5B/merges.txt',
 './fine_tuned_qwen_2.5-0.5B/added_tokens.json',
 './fine_tuned_qwen_2.5-0.5B/tokenizer.json')

In [9]:
# 使用微调后的模型进行预测
def predict_missing_text(prefix, suffix, fine_tuned_model, fine_tuned_tokenizer):
    input_sequence = f"<__PREFIX__>{prefix}\n\n<__MIDDLE__>[MASK]\n\n<__SUFFIX__>{suffix}"
    inputs = fine_tuned_tokenizer(input_sequence, return_tensors="pt")

    with torch.no_grad():
        outputs = fine_tuned_model.generate(
            inputs.input_ids,
            max_new_tokens=128,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

    predicted_text = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(predicted_text)

    start_index = predicted_text.find(suffix) + len(suffix)
    end_index = predicted_text.rfind("\n", start_index)
    if end_index == -1:
        end_index = len(predicted_text)

    predicted_middle_part = predicted_text[start_index:end_index].strip()

    return predicted_middle_part

# 加载微调后的模型和分词器
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_qwen_2.5-0.5B")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_qwen_2.5-0.5B")

# 示例用法
prefix = "昨天我在图书馆借了一本书，书名是"
suffix = "作者是余华。这本书非常值得一读。"

predicted_middle_part = predict_missing_text(prefix, suffix, fine_tuned_model, fine_tuned_tokenizer)
print(f"Predicted middle part: {predicted_middle_part}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<__PREFIX__>昨天我在图书馆借了一本书，书名是

<__MIDDLE__>[MASK]

<__SUFFIX__>作者是余华。这本书非常值得一读。
Predicted middle part: 
