# 第一步，加载pretrained model and tokenizer

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PromptEncoderConfig, get_peft_model, TaskType

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-1_8b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-chat-1_8b", device_map="cuda",trust_remote_code=True, torch_dtype=torch.float16)
# model = model.eval()
config = PromptEncoderConfig(
    peft_type="P_TUNING",
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=10,
    token_dim=2048,
    # num_transformer_submodules=1,
    # num_attention_heads=12,
    # num_layers=12,
    encoder_reparameterization_type="MLP",
    encoder_hidden_size=2048,
)

model = get_peft_model(model, config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
import pandas as pd

# DataFrame to Json
df = pd.read_excel('./data/2024-02-28-公告测评集.xls', header=0)
df['answer'] = df['answer'].astype(str)
df.to_json('./data/data_short_ans_train.json',orient='records')

In [5]:
import pandas as pd
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
from typing import Dict
from datasets import Dataset, load_dataset
import numpy as np


def encode_fn(text, tokenizer, max_length,return_attention_mask=False):
    return tokenizer(text, max_length=max_length, padding="max_length", truncation=True,return_attention_mask=return_attention_mask)


def get_dataset(file: str, split: str, encode_fn: callable, encode_args: dict,  cache_dir: str='.cache') -> Dataset:
    """
    Load a dataset
    """
    eos_token = tokenizer.eos_token
    dataset = load_dataset('json', data_files=file, split=split, cache_dir=cache_dir)
    def merge_prompt_and_responses(sample: dict):
        # add an eos token note that end of sentence, using in generate.
        # encoded_prompt = tokenizer([e + eos_token for e in sample['question']], truncation=False, padding=True, return_attention_mask=True)
        # encoded_response = tokenizer([e + eos_token for e in sample['answer']], truncation=False, padding=True, return_attention_mask=False)
        encoded_prompt = tokenizer(sample['question'] + eos_token, truncation=False, padding=True, return_attention_mask=True)
        encoded_response = tokenizer(sample['answer'] + eos_token, truncation=False, padding=True, return_attention_mask=False)
        encoded_q_type = tokenizer(sample['type'] + eos_token, truncation=False, padding=True, return_attention_mask=True)
        # input_ids = [np.array(item + [eos_token_id], dtype=np.uint32) for item in encoded_prompt["input_ids"]]
        # labels = [np.array(item + [eos_token_id], dtype=np.uint32) for item in encoded_response["input_ids"]]
        # prompt = encode_fn(sample['question'] + '[EOS]', return_attention_mask=True)
        # answer = encode_fn(sample['answer'] + '[EOS]', return_attention_mask=False)
        # title = encode_fn(sample['title'] + '[EOS]', **encode_args)
        # print(type(encoded_prompt.input_ids),'\n',type(encoded_prompt.attention_mask),'\n',labels)
        return {
            'input_ids': encoded_prompt.input_ids,
            'attention_mask': encoded_prompt.attention_mask,
            'q_type': encoded_q_type.input_ids,
            'labels': encoded_response.input_ids,       
            'q_type_attention_mask' : encoded_q_type.attention_mask,
        }

    # dataset = dataset.map(merge_prompt_and_responses, batched=True, batch_size=1)
    dataset = dataset.map(merge_prompt_and_responses)
    return dataset
"""
return {
    'input_ids': prompt.input_ids,
    'input_mask': prompt.attention_mask,
    'labels': response.input_ids,
    # 'title_input_ids': title.input_ids,
    # 'title_input_mask': title.attention_mask
} """

dataset = get_dataset(
    file='./data/data_short_ans_train.json', 
    split="train", 
    encode_fn=encode_fn, 
    encode_args={"tokenizer": tokenizer, "max_length": 128}, 
    cache_dir=".cache"
)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
args = TrainingArguments(
    output_dir='./train_result/',
    per_device_train_batch_size=1,
    auto_find_batch_size=True,  # 防止OOM
    gradient_accumulation_steps=10,
    learning_rate=1e-3,
    logging_steps=10,
    num_train_epochs=10,
    log_level='info',
    # save_steps=50,
    # save_total_limit=3,
    # fp16=config.fp16,
    # logging_first_step=config.logging_first_step,
    warmup_steps=50,
    seed=42,
)

# trainer
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = dataset,
    data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)
    # data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)
)

# 训练模型
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: type, q_type_attention_mask, q_type, answer, question. If type, q_type_attention_mask, q_type, answer, question are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** 

Step,Training Loss
10,1.4666
20,1.4471
30,1.421
40,1.429
50,1.4059
60,1.3565
70,1.3081
80,1.2707
90,1.2307
100,1.2003




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=1.3535794067382811, metrics={'train_runtime': 128.1545, 'train_samples_per_second': 7.803, 'train_steps_per_second': 0.78, 'total_flos': 6195682573639680.0, 'train_loss': 1.3535794067382811, 'epoch': 10.0})