In [5]:
import json
import random
from nltk.tokenize import sent_tokenize


def mask_random_phrase(text, mask_token="[MASK]", max_mask_length=8):
    words = text.split()
    if len(words) == 0:
        return text

    start_index = random.randint(0, len(words) - 1)
    end_index = min(start_index + random.randint(1, max_mask_length), len(words))

    masked_words = words[:start_index] + [mask_token] + words[end_index:]
    return " ".join(masked_words)


def process_content(content, num_masks=3):
    sentences = sent_tokenize(content)
    merged_sentences = []

    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            merged_text = sentences[i] + " " + sentences[i + 1]
        else:
            merged_text = sentences[i]

        original_text = merged_text
        for _ in range(num_masks):
            masked_text = mask_random_phrase(original_text)
            yield f"{masked_text}\t{original_text}"


def generate_dataset(input_file, output_file, num_masks=3):
    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)
            content = data.get("title", "") + " " + data.get("content", "")
            content = content.replace("\n", " ")
            for sample in process_content(content, num_masks=num_masks):
                outfile.write(sample + "\n")

# 使用示例
generate_dataset("data/bbc_news/bbc_news_2024-11.jsonl", "train.txt", num_masks=5)

In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Step 1: Load and preprocess the dataset
def load_and_preprocess_data(file_path):
    # Convert list of strings to a Hugging Face Dataset
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128,
    )

    return dataset


# Step 2: Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Step 3: Initialize the model and tokenizer
model_name =  "/data/modelscope/Qwen2.5-0.5B" # Replace with the correct model name if different
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add special tokens if necessary
special_tokens_dict = {"additional_special_tokens": ["[MASK]"]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

# Step 4: Prepare the dataset
file_path = "train_1k.txt"
dataset = load_and_preprocess_data(file_path)

# Step 5: Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Step 6: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Step 7: Train the model
trainer.train()

# Step 8: Save the model
model.save_pretrained("./fine-tuned-qwen")
tokenizer.save_pretrained("./fine-tuned-qwen")



Step,Training Loss
500,2.4594
1000,0.7681


('./fine-tuned-qwen/tokenizer_config.json',
 './fine-tuned-qwen/special_tokens_map.json',
 './fine-tuned-qwen/vocab.json',
 './fine-tuned-qwen/merges.txt',
 './fine-tuned-qwen/added_tokens.json',
 './fine-tuned-qwen/tokenizer.json')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: Load the tokenizer and model
model_name_or_path = "./fine-tuned-qwen"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# Ensure the model is in evaluation mode
model.eval()

In [None]:
# Step 2: Define a function to generate text
def generate_text(prompt, max_length=50):
    # Encode the input prompt
    encoding = tokenizer.encode_plus(
        prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length
    )
    
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    
    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2
        )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

prompt = "This is a document. It contains some [MASK]"
generated_text = generate_text(prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_text}")

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 加载预训练的tokenizer和model
# model_name = "/data/modelscope/Qwen2.5-0.5B"
model_name = "./fine-tuned-qwen"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


今天天气很好，我和朋友去公园散步，我们看到了 <MASK> 很多人在那里玩耍。有的人在树荫下乘凉，有的人在野地里烧烤，还有的人在捉迷藏，看谁先看见金鱼。	太阳光耀着大地，万物生长得如此之茂盛，我和朋友去公园散步，
Predicted middle part: >


In [17]:
def predict_missing_text(prefix, suffix):
    # 创建一个特殊的token表示缺失的部分
    mask_token = "[MASK]"

    # 构建输入序列，使用模板和提示
    input_sequence = (
        f"# 请根据上下文填补缺失的部分：\n上文: {prefix}\n\n下文: {suffix}\n给出上下文间缺失部分："
    )

    # 编码输入序列
    inputs = tokenizer(input_sequence, return_tensors="pt")

    # 获取模型的输出
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=20,  # 设置生成的新令牌数量
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )

    # 解码生成的文本
    predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(predicted_text)

    # 提取预测的缺失部分
    start_index = predicted_text.find(suffix) + len(suffix)
    end_index = predicted_text.rfind("\n", start_index)
    if end_index == -1:
        end_index = len(predicted_text)

    predicted_middle_part = predicted_text[start_index:end_index].strip()

    return predicted_middle_part


# 示例用法
prefix = "今天天气很好，我和朋友去公园散步，我们看到了"
suffix = "很多人在那里玩耍。有的人在树荫下乘凉，有的人在野地里烧烤，还有的人在捉迷藏，看谁先看见金鱼。太阳光耀着大地，万物生长得如此之茂盛，我和朋友去公园散步，"

predicted_middle_part = predict_missing_text(prefix, suffix)
print(f"Predicted middle part: {predicted_middle_part}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


# 请根据上下文填补缺失的部分：
上文: 今天天气很好，我和朋友去公园散步，我们看到了

下文: 很多人在那里玩耍。有的人在树荫下乘凉，有的人在野地里烧烤，还有的人在捉迷藏，看谁先看见金鱼。太阳光耀着大地，万物生长得如此之茂盛，我和朋友去公园散步，
给出上下文间缺失部分：“如果天气不给力，就没有这场宴席了”意思是，如果运气不好，就没有这场
Predicted middle part: 
