In [11]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd

# 检查可用的GPU设备
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    for i in range(num_gpus):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device = torch.device("cuda:0")  # 使用第一个可用的GPU设备
else:
    print("No GPU devices available. Using CPU.")
    device = torch.device("cpu")

# 加载预训练模型和分词器
model_name = "/data/modelscope/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 确保分词器支持 [MASK] 令牌
if tokenizer.mask_token is None:
    tokenizer.add_special_tokens({"mask_token": "[MASK]"})

model = AutoModelForCausalLM.from_pretrained(model_name)

# 调整模型以适应新的特殊令牌
model.resize_token_embeddings(len(tokenizer))

# 设置设备
model.to(device)

# 定义LORA配置
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# 将LORA应用到模型上
model = get_peft_model(model, lora_config)
print(model.print_trainable_parameters())


# 自定义数据集类
class CustomDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = []
        for example in examples:
            parts = example.strip().split("\t")
            if len(parts) == 2:
                context_before_mask, missing_text_and_context_after = parts
                context_before, _, context_after = context_before_mask.partition("[MASK]")
                if " " in missing_text_and_context_after:
                    missing_text, context_after = missing_text_and_context_after.split(" ", 1)
                    self.examples.append((context_before, missing_text, context_after))
                else:
                    print(f"Skipping malformed line: {example.strip()}")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        context_before, missing_text, context_after = self.examples[idx]

        # 创建完整的上下文
        full_context = f"{context_before} [MASK] {context_after}"
        input_ids = self.tokenizer.encode(
            full_context, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        ).squeeze()

        # 创建标签（即中间缺失的文本）
        labels = [-100] * len(input_ids)
        try:
            mask_start = input_ids.tolist().index(self.tokenizer.mask_token_id)
            mask_end = mask_start + len(missing_text.split())
            for i in range(mask_start, mask_end):
                labels[i] = input_ids[i]
        except ValueError:
            print(f"Mask token not found in example: {full_context}")
            raise

        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": torch.tensor(labels)}


# 加载自定义数据集
dataset_path = "train2_10k.txt"

# 分割数据集为训练集和验证集
with open(dataset_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
train_lines = lines[: int(0.8 * len(lines))]
eval_lines = lines[int(0.8 * len(lines)) :]

train_dataset = CustomDataset(train_lines, tokenizer)
eval_dataset = CustomDataset(eval_lines, tokenizer)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(eval_dataset, batch_size=8)

# 设置优化器和学习率调度器
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# 训练循环
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if step % 10 == 0:
            print(f"Step {step}, Loss: {loss.item()}")

# 评估模型
model.eval()
total_loss = 0
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        total_loss += outputs.loss.item()

avg_eval_loss = total_loss / len(eval_dataloader)
print(f"Evaluation Loss: {avg_eval_loss}")

# 保存模型
model.save_pretrained("./lora_qwen_2.5-0.5B_custom_fill_gpu0")
tokenizer.save_pretrained("./lora_qwen_2.5-0.5B_custom_fill_gpu0")

Number of available GPUs: 1
Device 0: NVIDIA GeForce RTX 4090
trainable params: 1,081,344 || all params: 494,872,192 || trainable%: 0.2185
None
Skipping malformed line: [MASK] : "Forbidden" song haunts South Korea's Suneung students - BBC News.Suneung, an eight-hour university placement exam billed as one of the toughest in the world, kicked off on Thursday  A brief yearly silence has once again enveloped South Korea, as half a million students across the country sit for the most important test of their lives.	APT.
Skipping malformed line: Inside Murder Trial - A Deadly Affair - 1.Well Ablaze - BBC Sounds. [MASK]	5.
Epoch 1/3
Step 0, Loss: 21.33559226989746
Step 10, Loss: 20.634693145751953
Step 20, Loss: 19.510395050048828
Step 30, Loss: 18.83188247680664
Step 40, Loss: 17.6241455078125
Step 50, Loss: 15.809399604797363
Step 60, Loss: 13.600440979003906
Step 70, Loss: 12.043617248535156
Step 80, Loss: 11.139253616333008
Step 90, Loss: 10.368639945983887
Step 100, Loss: 10.178379058837

KeyboardInterrupt: 

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.tensorboard import SummaryWriter
import pandas as pd

# 检查可用的GPU设备
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    for i in range(num_gpus):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
    device = torch.device("cuda:0")  # 使用第一个可用的GPU设备
else:
    print("No GPU devices available. Using CPU.")
    device = torch.device("cpu")

# 加载预训练模型和分词器
model_name = "/data/modelscope/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 确保分词器支持 [MASK] 令牌
if tokenizer.mask_token is None:
    tokenizer.add_special_tokens({"mask_token": "[MASK]"})

model = AutoModelForCausalLM.from_pretrained(model_name)

# 调整模型以适应新的特殊令牌
model.resize_token_embeddings(len(tokenizer))

# 设置设备
model.to(device)

# 定义LORA配置
lora_config = LoraConfig(
    r=8,  # 减少 r 的值
    lora_alpha=16,  # 减少 lora_alpha 的值
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,  # 减少 dropout 的值
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# 将LORA应用到模型上
model = get_peft_model(model, lora_config)
print(model.print_trainable_parameters())


# 自定义数据集类
class CustomDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = []
        for example in examples:
            parts = example.strip().split("\t")
            if len(parts) == 2:
                context_before_mask, missing_text_and_context_after = parts
                context_before, _, context_after = context_before_mask.partition("[MASK]")
                if " " in missing_text_and_context_after:
                    missing_text, context_after = missing_text_and_context_after.split(" ", 1)
                    self.examples.append((context_before, missing_text, context_after))
                else:
                    print(f"Skipping malformed line: {example.strip()}")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        context_before, missing_text, context_after = self.examples[idx]

        # 创建完整的上下文
        full_context = f"{context_before} [MASK] {context_after}"
        input_ids = self.tokenizer.encode(
            full_context, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        ).squeeze()

        # 创建标签（即中间缺失的文本）
        labels = [-100] * len(input_ids)
        try:
            mask_start = input_ids.tolist().index(self.tokenizer.mask_token_id)
            mask_end = mask_start + len(missing_text.split())
            for i in range(mask_start, mask_end):
                labels[i] = input_ids[i]
        except ValueError:
            print(f"Mask token not found in example: {full_context}")
            raise

        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": torch.tensor(labels)}


# 加载自定义数据集
dataset_path = "train2_10k.txt"

# 分割数据集为训练集和验证集
with open(dataset_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
train_lines = lines[: int(0.8 * len(lines))]
eval_lines = lines[int(0.8 * len(lines)) :]

train_dataset = CustomDataset(train_lines, tokenizer)
eval_dataset = CustomDataset(eval_lines, tokenizer)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(eval_dataset, batch_size=8)

# 设置优化器和学习率调度器
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)  # 尝试更低的学习率
num_epochs = 5  # 增加训练轮数
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# 初始化 TensorBoard 写入器
writer = SummaryWriter(log_dir="./runs/lora_qwen_2.5-0.5B_custom_fill_gpu0")

# 训练循环
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if step % 10 == 0:
            print(f"Step {step}, Loss: {loss.item()}")
            writer.add_scalar("Training Loss", loss.item(), epoch * len(train_dataloader) + step)

# 关闭 TensorBoard 写入器
writer.close()

# 评估模型
model.eval()
total_loss = 0
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        total_loss += outputs.loss.item()

avg_eval_loss = total_loss / len(eval_dataloader)
print(f"Evaluation Loss: {avg_eval_loss}")

# 保存模型
model.save_pretrained("./lora_qwen_2.5-0.5B_custom_fill_gpu0")
tokenizer.save_pretrained("./lora_qwen_2.5-0.5B_custom_fill_gpu0")

Number of available GPUs: 2
Device 0: NVIDIA GeForce RTX 4090
Device 1: NVIDIA GeForce RTX 4090
trainable params: 540,672 || all params: 494,331,520 || trainable%: 0.1094
None
Skipping malformed line: [MASK] : "Forbidden" song haunts South Korea's Suneung students - BBC News.Suneung, an eight-hour university placement exam billed as one of the toughest in the world, kicked off on Thursday  A brief yearly silence has once again enveloped South Korea, as half a million students across the country sit for the most important test of their lives.	APT.
Skipping malformed line: Inside Murder Trial - A Deadly Affair - 1.Well Ablaze - BBC Sounds. [MASK]	5.
Epoch 1/5
Step 0, Loss: 20.749778747558594
Step 10, Loss: 21.54783821105957
Step 20, Loss: 21.47522735595703
Step 30, Loss: 20.945316314697266
Step 40, Loss: 21.52081871032715
Step 50, Loss: 21.945276260375977
Step 60, Loss: 20.522262573242188
Step 70, Loss: 22.00442886352539
Step 80, Loss: 21.065034866333008
Step 90, Loss: 20.418201446533203

KeyboardInterrupt: 