In [4]:
from transformers.integrations import TensorBoardCallback
from torch.utils.tensorboard import SummaryWriter
from transformers import TrainingArguments
from transformers import Trainer
from data.rwkv_tokenizer import TRIE_TOKENIZER
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType
from dataclasses import dataclass, field
import datasets
import os

# Use a JSONL file as the dataset.

# Restrict this process to GPU index 0.
os.environ.update(CUDA_VISIBLE_DEVICES="0")


# Module-level tokenizer, built once from the RWKV vocabulary file.
tokenizer = TRIE_TOKENIZER('data/rwkv_vocab_v20230424.txt')


@dataclass
class FinetuneArguments:
    """Settings for the LoRA fine-tuning run; every field has a default."""

    # Directory passed to `datasets.load_from_disk` by `main`.
    dataset_path: str = "data/finetune_dataset"
    # Not read anywhere in the visible code -- presumably an output
    # location; confirm before relying on it.
    model_path: str = "output"
    # LoRA rank, forwarded as `r` to `LoraConfig` in `main`.
    lora_rank: int = 8


class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is cast to float32."""

    def forward(self, x):
        """Run the wrapped modules on `x` and return the result as float32."""
        result = super().forward(x)
        return result.float()


def data_collator(features: list, pad_token_id: int = 0) -> dict:
    """Collate tokenized samples into right-padded `input_ids` / `labels` tensors.

    Samples are ordered longest-first (ties keep their incoming order) and
    both `input_ids` and `labels` are padded on the right to the longest
    sample so they can be stacked into one 2-D tensor each.

    Args:
        features: Non-empty list of dicts, each with
            ``"input_ids"`` (sequence of token ids) and ``"seq_len"`` (int).
            `seq_len` is presumably the prompt length -- the first
            ``seq_len - 1`` label positions are masked out; confirm against
            the dataset-building script.
        pad_token_id: Token id used to pad `input_ids`. Defaults to 0.

    Returns:
        ``{"input_ids": LongTensor[batch, longest],
           "labels": LongTensor[batch, longest]}``.
        Masked and padded label positions hold -100, the default
        ``ignore_index`` of PyTorch's cross-entropy loss.

    Raises:
        ValueError: If `features` is empty.
    """
    if not features:
        raise ValueError("data_collator received an empty batch")

    lengths = [len(feature["input_ids"]) for feature in features]
    longest = max(lengths)
    input_ids = []
    labels_list = []

    # Sort by descending length; the key ignores the feature dict, and
    # sorted() is stable, so equal-length samples keep their order.
    for ids_l, feature in sorted(zip(lengths, features), key=lambda x: -x[0]):
        ids = list(feature["input_ids"])
        seq_len = feature["seq_len"]
        pad_len = longest - ids_l

        # Clamp so a seq_len of 0 (or one larger than the sample) cannot
        # produce labels whose length differs from the inputs.
        masked = min(max(seq_len - 1, 0), ids_l)

        labels = [-100] * masked + ids[masked:] + [-100] * pad_len
        # Inputs must be padded as well, otherwise torch.stack fails for
        # any batch that mixes sequence lengths.
        ids = ids + [pad_token_id] * pad_len

        input_ids.append(torch.LongTensor(ids))
        labels_list.append(torch.LongTensor(labels))

    return {
        "input_ids": torch.stack(input_ids),
        "labels": torch.stack(labels_list),
    }


class ModifiedTrainer(Trainer):
    """Trainer that feeds only `input_ids`/`labels` to the model and saves
    just the trainable parameters instead of the full model."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the model-computed loss.

        Args:
            model: Model to call; it must return an object with a `.loss`
                attribute when given `labels`.
            inputs: Batch dict containing `"input_ids"` and `"labels"`.
            return_outputs: When true, return `(loss, outputs)` as the base
                `Trainer` expects on its evaluation path.
            **kwargs: Extra keyword arguments passed by newer `Trainer`
                versions (e.g. `num_items_in_batch`); accepted and ignored.

        Returns:
            The loss, or `(loss, outputs)` if `return_outputs` is true.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

    def save_model(self, output_dir=None, _internal_call=False):
        """Save the training arguments and the trainable parameters only.

        Writes the training arguments under `TRAINING_ARGS_NAME` and a dict
        of every parameter with `requires_grad=True` to `adapter_model.bin`.

        Args:
            output_dir: Target directory; falls back to `self.args.output_dir`
                when omitted, as the base `Trainer.save_model` does.
            _internal_call: Accepted for signature compatibility; unused.
        """
        from transformers.trainer import TRAINING_ARGS_NAME

        if output_dir is None:
            output_dir = self.args.output_dir

        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        # detach() so the saved tensors carry no autograd history.
        saved_params = {
            k: v.detach().to("cpu")
            for k, v in self.model.named_parameters()
            if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))


def main():
    """Fine-tune RWKV-5 World 1.5B with LoRA and save the adapter.

    Steps: load the base model, wrap it with LoRA adapters on the
    `key`/`value`/`receptance`/`output` modules, load the dataset from
    `FinetuneArguments.dataset_path`, train with `ModifiedTrainer`, then
    save the adapter to the training output directory.

    NOTE(review): the recorded run of this cell ended in a CUDA
    out-of-memory error on a 6 GiB GPU. The base model is loaded below
    without an explicit dtype. Options to try: loading in half precision
    (with the trainable LoRA weights kept in float32), gradient
    checkpointing, or a smaller checkpoint. Each must be verified against
    the remote model code before being made the default.
    """
    # Instantiate the dataclass rather than reading defaults off the class.
    finetune_args = FinetuneArguments()

    # init model
    model = AutoModelForCausalLM.from_pretrained(
        "RWKV/rwkv-5-world-1b5", trust_remote_code=True
    )

    # setup peft
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=finetune_args.lora_rank,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=[
            'key',
            'value',
            'receptance',
            'output',
        ],
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    model = model.cuda()

    dataset = datasets.load_from_disk(finetune_args.dataset_path)

    # start train
    training_args = TrainingArguments(
        output_dir="lora-out",
        fp16=True,
        gradient_accumulation_steps=1,
        per_device_train_batch_size=1,
        learning_rate=1e-4,
        # NOTE(review): 100 steps looks like a short trial value (1500 and a
        # one-epoch run were tried before) -- confirm the intended length.
        max_steps=100,
        logging_steps=50,
        # The collator needs the `seq_len` column, so keep all columns.
        remove_unused_columns=False,
        seed=0,
        data_seed=0,
        group_by_length=False,
    )

    # Create the writer only once everything else is ready, and always close
    # it -- even if training raises (e.g. on CUDA OOM).
    writer = SummaryWriter()
    try:
        trainer = ModifiedTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            callbacks=[TensorBoardCallback(writer)],
            # Method 1: the GLM fine-tuning approach (custom collator).
            data_collator=data_collator,
        )
        trainer.train()
    finally:
        writer.close()

    # save model
    model.save_pretrained(training_args.output_dir)


# Entry point: start the fine-tuning run when this code is executed directly
# (as a script, or as a notebook cell where __name__ is also "__main__").
if __name__ == "__main__":
    main()

trainable params: 7471104 || all params: 1585225728 || trainable%: 0.471295908717424


OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 11.86 GiB is allocated by PyTorch, and 42.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [6]:
import torch
# Report the CUDA and cuDNN versions seen by this PyTorch build, one per line.
for reported_version in (torch.version.cuda, torch.backends.cudnn.version()):
    print(reported_version)

11.8
8700
