In [2]:
# conda 25.5.1
# python 3.10.10
# torch 2.2.2
# transformers 4.38.2
# accelerate 0.30.0
# peft 0.10.0 (older release, pinned for compatibility with transformers 4.38.2)

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Checkpoint to fine-tune.
model_name = "uer/gpt2-chinese-cluecorpussmall"

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base causal-LM weights; wrapped with LoRA adapters below.
model = AutoModelForCausalLM.from_pretrained(model_name)

lora_config = LoraConfig(
    r=8,    # lower rank for less memory usage
    lora_alpha=32,
    target_modules=[
        "c_attn", "c_proj",
        "mlp.c_fc", "mlp.c_proj",
    ],  # apply LoRA only to the attention and MLP projection layers
    lora_dropout=0.1,
    bias="none",
    # GPT-2 projections are Conv1D layers that store weights as (fan_in, fan_out);
    # PEFT documents that this flag should be True for them.
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the LoRA adapters exactly once: calling
# get_peft_model a second time on the result would wrap the already-wrapped
# PeftModel again instead of the base model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


  from .autonotebook import tqdm as notebook_tqdm


trainable params: 1,179,648 || all params: 103,248,384 || trainable%: 1.1425341049405675




In [2]:
# check device
import torch

# Pick the best available backend, in priority order: Apple MPS, then CUDA,
# then CPU. The conditional expression short-circuits, so CUDA is only probed
# when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")


Using device: mps


In [5]:
from datasets import load_dataset

# Plain-text training corpus; the path is relative, so the file is presumably
# expected next to the notebook / in the current working directory.
TRAIN_FILE = "cleaned_huagaiji.txt"
# Number of tokens per training block (consumed by group_texts).
BLOCK_SIZE = 128

# Load the raw corpus as a dataset with a single "train" split.
raw_dataset = load_dataset(
    "text",
    data_files={"train": TRAIN_FILE},
)

# Tokenization callback for the dataset.
def tokenize_function(samples):
    """Run the notebook-level ``tokenizer`` over the "text" field of ``samples``.

    Args:
        samples: Mapping with a "text" entry holding the text to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    texts = samples["text"]
    return tokenizer(texts)

# Tokenize the raw dataset.
#   batched=True            -> tokenize in batches, which speeds up processing
#   remove_columns=["text"] -> drop the original text column so only the
#                              tokenized data remains
tokenized_dataset = raw_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Pack tokenized text into fixed-size blocks.
# Example: with BLOCK_SIZE = 128, every block contains exactly 128 tokens.
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized sequences and split it into equal blocks.

    Args:
        examples: Mapping from feature name to a list of per-sample lists.
            Must contain an "input_ids" entry.
        block_size: Number of tokens per block. When ``None`` (the default),
            the module-level ``BLOCK_SIZE`` is used.

    Returns:
        A dict with the same keys as ``examples``, each mapped to a list of
        ``block_size``-long chunks, plus a "labels" entry that is a copy of
        the "input_ids" chunk list. Trailing tokens that do not fill a whole
        block are dropped.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import chain

    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each feature in linear time; sum(list_of_lists, []) re-copies
    # the accumulated list on every step and is quadratic in the batch size.
    concatenated = {
        name: list(chain.from_iterable(examples[name]))
        for name in examples.keys()
    }

    # The length is taken from the first feature and rounded down to a whole
    # number of blocks, discarding the remainder.
    first_feature = next(iter(concatenated.values()), [])
    usable_length = (len(first_feature) // block_size) * block_size

    # Slice every feature into consecutive chunks of block_size tokens.
    result = {
        name: [
            tokens[start : start + block_size]
            for start in range(0, usable_length, block_size)
        ]
        for name, tokens in concatenated.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result
    

FileNotFoundError: Unable to find '/Users/jiaronghe/Desktop/projects/light-weight-private-llm/chapter4/cleaned_huagaiji.txt'