In [2]:
# conda 25.5.1
# python 3.10.10
# torch 2.2.2
# transformers 4.38.2
# accelerate 0.30.0
# peft 0.10.0 (older release, kept for compatibility with transformers 4.38.2)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Base checkpoint that will be fine-tuned with LoRA.
BASE_MODEL_NAME = "uer/gpt2-chinese-cluecorpussmall"

# Load the tokenizer and the causal-LM weights for the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)

lora_config = LoraConfig(
    r=8,    # lower rank for less memory usage
    lora_alpha=32,
    target_modules=[
        "c_attn", "c_proj",
        "mlp.c_fc", "mlp.c_proj",
    ],  # apply LoRA only to the attention and MLP projection layers
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the LoRA adapter exactly once. Calling
# get_peft_model a second time on the already-wrapped model nests one PEFT
# wrapper inside another.
# NOTE(review): the nested wrapper is expected to change the adapter key
# prefixes written by save_pretrained, so a later PeftModel.from_pretrained
# may not pick the trained weights up -- confirm against the PEFT docs.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


  from .autonotebook import tqdm as notebook_tqdm


trainable params: 1,179,648 || all params: 103,248,384 || trainable%: 1.1425341049405675




In [2]:
# check device
import torch

# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")


Using device: mps


In [6]:
from datasets import load_dataset

# Dataset settings: tokens per training block and the plain-text corpus file.
BLOCK_SIZE = 128
TRAIN_FILE = "cleaned_huagaiji.txt"

# Load the corpus with the "text" builder; the file becomes the "train" split.
raw_dataset = load_dataset("text", data_files=dict(train=TRAIN_FILE))

# Callback for Dataset.map: tokenizes the "text" column of a batch.
def tokenize_function(samples):
    """Run the notebook-level ``tokenizer`` over ``samples["text"]`` and return its output."""
    texts = samples["text"]
    return tokenizer(texts)

# Tokenize the dataset.
# batched=True tokenizes rows in batches, which speeds up processing.
# remove_columns=["text"] drops the raw text column so only the tokenized data remains.
tokenized_dataset = raw_dataset.map(tokenize_function, remove_columns=["text"], batched=True)

# Regroup tokenized rows into fixed-size blocks,
# e.g. with BLOCK_SIZE = 128 every block holds exactly 128 tokens.
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized rows and split it into equal-sized blocks.

    Args:
        examples: Mapping of column name -> list of per-row lists. Must contain
            an "input_ids" column.
        block_size: Number of items per output block. Defaults to the
            notebook-level ``BLOCK_SIZE`` when omitted.

    Returns:
        A dict with the same columns as ``examples``, each re-split into chunks
        of ``block_size`` items, plus a "labels" column that is a shallow copy
        of the chunked "input_ids". Trailing items that do not fill a whole
        block are dropped.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
        KeyError: If ``examples`` has no "input_ids" column.
    """
    # Local import keeps the cell self-contained.
    from itertools import chain

    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # chain.from_iterable flattens in linear time; sum(rows, []) re-copies the
    # growing accumulator for every row, which is quadratic in the batch size.
    concatenated = {
        column: list(chain.from_iterable(rows)) for column, rows in examples.items()
    }

    # Length is taken from the first column, then rounded down to a whole
    # number of blocks.
    first_column = next(iter(concatenated.values()), [])
    total_length = (len(first_column) // block_size) * block_size

    result = {
        column: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for column, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Split the tokenized text into blocks with group_texts, 1000 rows per map batch.
lm_dataset = tokenized_dataset.map(group_texts, batch_size=1000, batched=True)

print(lm_dataset)

Generating train split: 1139 examples [00:00, 231548.67 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1139/1139 [00:00<00:00, 30326.36 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1139/1139 [00:00<00:00, 3902.13 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 461
    })
})





In [7]:
from transformers import TrainingArguments

# Tunable training settings.
OUTPUT_DIR = "./gpt2-luxun-finetuned-mps"
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 4
LEARNING_RATE = 5e-5
SAVE_STEPS = 500
OVERWRITE_OUTPUT_DIR = True

# Training configuration; e.g. NUM_TRAIN_EPOCHS sets how many passes to run.
#
# Periodic evaluation is intentionally left at the default: the Trainer in
# this notebook is built without an eval_dataset, so requesting step-based
# evaluation would fail as soon as a run reached the first evaluation step.
# To evaluate, hold out a split, pass it as eval_dataset, and set
# evaluation_strategy / eval_steps here.
#
# use_mps_device is also omitted: it is deprecated in transformers 4.38, where
# the MPS backend is selected automatically when available. Leaving it out
# lets this cell run on CUDA or CPU machines too.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    save_steps=SAVE_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=50,  # log every 50 steps
    save_total_limit=2,  # keep at most 2 checkpoints
    warmup_ratio=0.03,  # warm-up ratio
)




In [8]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Use DataCollatorForLanguageModeling as the data collator.
# It assembles each batch for the Trainer (the original note lists dynamic
# padding, mask generation, and label alignment).
# mlm=False means masked language modeling is not used for training,
# because this notebook trains a causal (autoregressive) language model.
# For a BERT-style model you would set mlm=True to train with masked language modeling.

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Create the Trainer from the LoRA-wrapped model, the training arguments,
# and the "train" split of the blocked dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run training, with banner lines printed before and after.
print("--- ÂºÄÂßãËÆ≠ÁªÉ ---")
trainer.train()
print("--- ËÆ≠ÁªÉÂÆåÊàê ---")


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


--- ÂºÄÂßãËÆ≠ÁªÉ ---


 15%|‚ñà‚ñç        | 51/348 [00:09<00:31,  9.29it/s]

{'loss': 4.1968, 'grad_norm': 1.2414604425430298, 'learning_rate': 4.421364985163205e-05, 'epoch': 0.43}


 29%|‚ñà‚ñà‚ñâ       | 101/348 [00:14<00:25,  9.54it/s]

{'loss': 3.8949, 'grad_norm': 1.1374152898788452, 'learning_rate': 3.679525222551929e-05, 'epoch': 0.86}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 151/348 [00:20<00:20,  9.50it/s]

{'loss': 3.8084, 'grad_norm': 1.5812174081802368, 'learning_rate': 2.937685459940653e-05, 'epoch': 1.29}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 201/348 [00:26<00:15,  9.32it/s]

{'loss': 3.7896, 'grad_norm': 1.6326978206634521, 'learning_rate': 2.195845697329377e-05, 'epoch': 1.72}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 251/348 [00:31<00:10,  9.11it/s]

{'loss': 3.7739, 'grad_norm': 1.4555388689041138, 'learning_rate': 1.454005934718101e-05, 'epoch': 2.16}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 301/348 [00:37<00:05,  9.15it/s]

{'loss': 3.7541, 'grad_norm': 1.8545022010803223, 'learning_rate': 7.12166172106825e-06, 'epoch': 2.59}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 348/348 [00:42<00:00,  8.19it/s]

{'train_runtime': 42.4751, 'train_samples_per_second': 32.56, 'train_steps_per_second': 8.193, 'train_loss': 3.8502285836756913, 'epoch': 3.0}
--- ËÆ≠ÁªÉÂÆåÊàê ---





In [9]:
# Save the model to OUTPUT_DIR.
print(f"--- ‰øùÂ≠òÊ®°Âûã: {OUTPUT_DIR} ---")
model.save_pretrained(OUTPUT_DIR)

# Save the tokenizer next to it so both can be reloaded from the same directory.
tokenizer.save_pretrained(OUTPUT_DIR)
print("--- ÂÆåÊàê ---")

--- ‰øùÂ≠òÊ®°Âûã: ./gpt2-luxun-finetuned-mps ---
--- ÂÆåÊàê ---


In [11]:
from peft import PeftModel

# Directory holding the saved adapter and tokenizer, and the base checkpoint
# the adapter is applied to.
MODEL_PATH = "./gpt2-luxun-finetuned-mps"
BASE_MODEL_NAME = "uer/gpt2-chinese-cluecorpussmall"

# Device priority: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")

# Tokenizer comes from the fine-tuned output directory.
print("--- Âä†ËΩΩÂàÜËØçÂô® ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Fresh copy of the base weights.
print("--- Âä†ËΩΩÂü∫Á°ÄÊ®°Âûã ---")
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)

# Attach the saved LoRA adapter to the base model, then move it to the device.
print("--- Âä†ËΩΩ LoRA adapter ---")
model = PeftModel.from_pretrained(base_model, MODEL_PATH)
model = model.to(device)
print("--- Ê®°ÂûãÂä†ËΩΩÂÆåÊàê ---")


Using device: mps
--- Âä†ËΩΩÂàÜËØçÂô® ---
--- Âä†ËΩΩÂü∫Á°ÄÊ®°Âûã ---
--- Âä†ËΩΩ LoRA adapter ---
--- Ê®°ÂûãÂä†ËΩΩÂÆåÊàê ---


In [14]:
def generate_text(prompt_text, max_length=150):
    """Sample a continuation of ``prompt_text`` from the loaded model and print it.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt_text: Seed text the model should continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``. Per the
            transformers generation docs this bounds the prompt plus the
            generated tokens together.

    Returns:
        None. The prompt banner and the decoded text are printed.
    """
    print(f"--- ÁîüÊàêÊñáÊú¨: {prompt_text} ---")
    # Encode: turn the prompt into model input ids and move them to the device.
    # NOTE(review): the tokenizer presumably adds its own special tokens around
    # the prompt; confirm that a trailing separator does not cut generation short.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)

    # Generate by calling the model's generate method.
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,  # how many different results to produce
        do_sample=True,  # sample instead of greedy decoding, for more varied text
        top_k=50,  # choose among the 50 most probable tokens
        top_p=0.95,  # choose among tokens whose cumulative probability is 95%
        temperature=0.9,  # randomness control; higher means more random
        repetition_penalty=1.5,  # higher values make repeated tokens less likely
        # Use the tokenizer's own pad id. Without it, generate() falls back to
        # the config's eos_token_id (logged as 50256) and warns on every call.
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode: turn the generated ids back into text.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("--- ÁîüÊàêÁªìÊûú ---")
    print(generated_text)

# Demo: continue a seed sentence, with max_length raised from the default 150 to 300.
prompt_text = "ÁßãÂ§©ÁöÑÂêéÂçäÂ§úÔºåÊúà‰∫Æ‰∏ãÂéª‰∫ÜÔºåÂ§™Èò≥ËøòÊ≤°ÊúâÂá∫ÔºåÂè™Ââ©‰∏ã‰∏ÄÁâá‰πåËìùÁöÑÂ§©Ôºõ"
generate_text(prompt_text, max_length=300)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- ÁîüÊàêÊñáÊú¨: ÁßãÂ§©ÁöÑÂêéÂçäÂ§úÔºåÊúà‰∫Æ‰∏ãÂéª‰∫ÜÔºåÂ§™Èò≥ËøòÊ≤°ÊúâÂá∫ÔºåÂè™Ââ©‰∏ã‰∏ÄÁâá‰πåËìùÁöÑÂ§©Ôºõ ---
--- ÁîüÊàêÁªìÊûú ---
Áßã Â§© ÁöÑ Âêé Âçä Â§ú Ôºå Êúà ‰∫Æ ‰∏ã Âéª ‰∫Ü Ôºå Â§™ Èò≥ Ëøò Ê≤° Êúâ Âá∫ Ôºå Âè™ Ââ© ‰∏ã ‰∏Ä Áâá ‰πå Ëìù ÁöÑ Â§© Ôºõ Áßã
