In [None]:
# Load the pre-trained GPT-2 model and tokenizer from a local directory.
# NOTE(review): GPT2Tokenizer / GPT2LMHeadModel must already be imported by a
# cell that runs before this one; GPT2LMHeadModel is not imported in the visible cells.
model_name = "/data/modelscope/GPT2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Register "<PAD>" as the tokenizer's pad token, then resize the model's token
# embeddings to the tokenizer's new length so the two stay in sync.
# The order matters: the tokenizer must be extended before len(tokenizer) is read.
special_tokens_dict = {"pad_token": "<PAD>"}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [25]:
# Configure the training run: 3 epochs, per-device batch size 2, output under
# ./results (overwriting any previous contents of that directory).
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize the Trainer with the model prepared above.
# NOTE(review): `data_collator` and `tokenized_datasets` are not defined in the
# visible cells; they must come from a cell executed earlier — confirm before
# relying on Restart & Run All.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [None]:
# Fine-tune the model using the Trainer configured above.
trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so the
# two can be reloaded together from "./fine-tuned-gpt2".
model.save_pretrained("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")

In [None]:
# Text completion with the fine-tuned model
def generate_text(prompt, model, tokenizer, max_length=50):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: The input text to complete.
        model: A causal language model exposing ``generate`` and ``device``.
        tokenizer: The tokenizer matching ``model``; it is called directly so
            that both ``input_ids`` and ``attention_mask`` are returned.
        max_length: Upper bound passed to ``model.generate`` as ``max_length``
            (in the transformers API this length includes the prompt tokens).

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Calling the tokenizer (instead of ``tokenizer.encode``) yields a mapping
    # with both "input_ids" and "attention_mask"; ``encode`` returns only the
    # id tensor, which cannot be indexed by those keys.
    encoded = tokenizer(prompt, return_tensors="pt")

    # After training the model may live on an accelerator; the inputs must be
    # on the same device or ``generate`` fails with a device mismatch.
    encoded = encoded.to(model.device)

    # Pass an explicit pad id so generation does not have to guess one;
    # fall back to EOS when the tokenizer has no pad token configured.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=False,  # early stopping intentionally disabled
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Smoke-test the model: complete a prompt that contains a literal "<MASK>" marker
# and print the generated text with surrounding whitespace stripped.
# NOTE(review): the visible setup registers only "<PAD>" as a special token, so
# "<MASK>" is presumably tokenized as ordinary text here — confirm this is intended.
test_document = "This is a document. It con<MASK>ns some information."
predicted_completion = generate_text(test_document, model, tokenizer)
print(predicted_completion.strip())

In [13]:
import pandas as pd
from transformers import GPT2Tokenizer
import nltk
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize

# Load the raw articles and clean them in one pass.
# The JSONL file is assumed to provide two columns: 'title' and 'content'.
jsonl_file_path = "data/bbc_news/bbc_news_2024-11.jsonl"
data = (
    pd.read_json(jsonl_file_path, lines=True)
    .dropna(subset=["title", "content"])  # drop rows missing a title or content
    .drop_duplicates(subset=["title", "content"])  # drop repeated title/content pairs
)

# Report how many articles survived cleaning
print(f"Total number of articles: {len(data)}")

# Combine title and content into a single text column
data = data.assign(text=data["title"] + " " + data["content"])


# Split text into sentences and merge adjacent sentences into segments
def split_and_merge_sentences(text, group_size=3):
    """Split ``text`` into sentences and join them into fixed-size segments.

    Sentences are produced by ``sent_tokenize`` and grouped in order, without
    overlap; the last segment may hold fewer than ``group_size`` sentences.

    Args:
        text: The text to split.
        group_size: Number of consecutive sentences per segment (default 3,
            matching the previous hard-coded behavior).

    Returns:
        A list of segments, each being its sentences joined by a single space.
        Segments that are empty after stripping whitespace are omitted.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    sentences = sent_tokenize(text)
    segments = (
        " ".join(sentences[start : start + group_size])
        for start in range(0, len(sentences), group_size)
    )
    # Keep only segments that contain something other than whitespace
    return [segment for segment in segments if segment.strip()]


# Flatten the per-article segment lists into one list of segments
merged_texts = [
    segment
    for article_text in data["text"]
    for segment in split_and_merge_sentences(article_text)
]

# Wrap the segments in a single-column DataFrame
data_merged = pd.DataFrame({"text": merged_texts})


# Randomly mask sentences inside each text
def mask_random_segments(texts, mask_ratio=0.2, seed=None):
    """Replace randomly chosen sentences of each text with ``"<MASK>"``.

    Each text is split with ``sent_tokenize``. At least one sentence is masked
    per non-empty text, and never more sentences than the text contains.

    Args:
        texts: Iterable of text strings.
        mask_ratio: Fraction of each text's sentences to mask.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so results are reproducible; when ``None`` the module-level
            ``random`` generator is used, as before.

    Returns:
        A ``(masked_texts, original_texts)`` tuple of two lists with one entry
        per input text. Both entries are the text's sentences joined by single
        spaces; in the masked version the selected sentences are replaced by
        ``"<MASK>"``. A text with no sentences yields two empty strings.
    """
    rng = random if seed is None else random.Random(seed)

    masked_texts = []
    original_texts = []

    for text in texts:
        sentences = sent_tokenize(text)
        sentence_count = len(sentences)

        # Mask at least one sentence, but clamp to the number available so that
        # empty texts or mask_ratio > 1 cannot make sample() raise ValueError.
        num_masks = min(sentence_count, max(1, int(sentence_count * mask_ratio)))

        # A set gives constant-time membership checks in the loop below
        mask_indices = set(rng.sample(range(sentence_count), num_masks))

        masked_sentences = [
            "<MASK>" if index in mask_indices else sentence
            for index, sentence in enumerate(sentences)
        ]

        masked_texts.append(" ".join(masked_sentences))
        original_texts.append(" ".join(sentences))

    return masked_texts, original_texts


# Build masked/original pairs for every merged segment.
# NOTE(review): no random seed is set in the visible cells, so the masked
# positions presumably differ between runs even though the split below is seeded.
masked_texts, original_texts = mask_random_segments(data_merged["text"])

# Create a DataFrame pairing each masked text with its original
data_masked = pd.DataFrame({"masked_text": masked_texts, "original_text": original_texts})

# Split into training and test sets (10% held out, fixed random_state)
train_data, test_data = train_test_split(data_masked, test_size=0.1, random_state=42)

# Output locations for the two splits (plain-text files)
train_file_path = "news_datasets/train.txt"
test_file_path = "news_datasets/test.txt"

# Create the output directory if it does not exist yet
import os

os.makedirs(os.path.dirname(train_file_path), exist_ok=True)


def _dump_pairs(frame, path):
    """Write every row of ``frame`` to ``path`` as a masked/original text pair.

    Each record is a "Masked Text: ..." line followed by an
    "Original Text: ..." line and a blank separator line.
    """
    with open(path, "w", encoding="utf-8") as out:
        for masked, original in zip(frame["masked_text"], frame["original_text"]):
            out.write(f"Masked Text: {masked}\n")
            out.write(f"Original Text: {original}\n\n")


# Write the training split, then the test split
_dump_pairs(train_data, train_file_path)
_dump_pairs(test_data, test_file_path)

print("数据集已保存到 ./news_datasets")

Total number of articles: 1875
数据集已保存到 ./news_datasets
