In [1]:
from peft import LoraConfig
from datasets import load_dataset, load_from_disk
import torch
from peft import (
        get_peft_model, 
        prepare_model_for_kbit_training, 
        LoraConfig
    )
from trl import SFTTrainer, SFTConfig


# Load dataset

run **huggingface-cli login** in terminal

In [2]:
# Load first 10 rows of dataset from the hub
dataset = load_dataset(
    "samsum",
    split={"train": "train[:10]", "test": "test[:10]", "validation": "validation[:10]"},
    trust_remote_code=True,
)
# Save to local disk
# dataset.save_to_disk("textdata")


# Load dataset from disk
# dataset = load_from_disk("textdata")

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

# Train dataset size: 14732
# Test dataset size: 819

Train dataset size: 10
Test dataset size: 10


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 10
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 10
    })
})

# load and save pretrain model from huggingface

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
model_path = "meta-llama/Llama-3.2-1B"
# model_path = "pretrain/meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)


## save model to local

In [5]:
# tokenizer.save_pretrained('pretrain/meta-llama/Llama-3.2-1B')
# model.save_pretrained('pretrain/meta-llama/Llama-3.2-1B')

## 告訴模型如果要padding要放eos

In [6]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

# traing config

In [7]:
from transformers import TrainingArguments

In [8]:
from datasets import concatenate_datasets
import numpy as np

# Max input sequence length post-tokenization.
max_input_length = 512
max_target_length = 50


def preprocess_function(examples):
    inputs = [ex for ex in examples['dialogue']]
    # Tokenize inputs with padding
    model_inputs = tokenizer(
        inputs, 
        max_length=max_input_length, 
        truncation=True, 
        padding="max_length"
    )
    # Tokenize targets with padding
    labels = tokenizer(
        examples['summary'], 
        max_length=max_target_length, 
        truncation=True, 
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


In [9]:
print(f"the len of input_ids is {len(tokenized_datasets["train"]['input_ids'][0])}")
print(f"the len of labels is {len(tokenized_datasets["train"]['labels'][0])}")

the len of input_ids is 512
the len of labels is 50


# tokenizer 轉換文字為數字代號

In [10]:
dataset["train"]["dialogue"][0]

"Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"

In [11]:
dataset["train"]["summary"][0]

'Amanda baked cookies and will bring Jerry some tomorrow.'

In [12]:
for example in tokenized_datasets[ "train"]:
    # 提取對話和摘要
    input_ids = example['input_ids']
    labels = example['labels']
    print(f"input_ids= \n{input_ids}")
    print(f"labels = \n{labels}")
    break

input_ids= 
[128000, 32, 36645, 25, 358, 41778, 220, 8443, 13, 3234, 499, 1390, 1063, 46449, 90757, 25, 23371, 46726, 32, 36645, 25, 358, 3358, 4546, 499, 16986, 92941, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,

In [13]:
print(tokenizer.decode([32, 36645, 25, 358, 41778]))
print(tokenizer.decode([32, 36645, 41778, 8443, 323]))

Amanda: I baked
Amanda baked cookies and


In [14]:
tokenizer.encode("王大明")

[128000, 101538, 27384, 31958]

In [15]:
tokenizer.decode([ 31958+i for i in range(15)])

'明 adversequotelev sacrific_side mutexAGIC occurring Communicationumar编 Treatment.person LC ech'

In [16]:

for example in dataset[ "train"]:
    # 提取對話和摘要
    id = example['id']
    dialogue = example['dialogue']
    summary = example['summary']
    print(f"ids= \n{id}")
    print(f"dialogue = \n{dialogue}")
    print(f"summary = \n{summary}")
    break

ids= 
13818513
dialogue = 
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
summary = 
Amanda baked cookies and will bring Jerry some tomorrow.


# 特殊標記符號

In [17]:
tokenizer.encode('<|begin_of_text|>')

[128000, 128000]

# setting trainer

In [18]:
from transformers import DataCollatorForSeq2Seq

In [19]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    padding=True,
    return_tensors="pt"
)

In [20]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=3,
)


# lora

In [21]:
# Define LoRA Config
lora_config = LoraConfig(
        r=1,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )


In [22]:
# prepare int-8 model for training
model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 212,992 || all params: 1,236,027,392 || trainable%: 0.0172


In [23]:
training_args = SFTConfig(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    max_seq_length=1024,
    save_steps=1,
    weight_decay=0.01,
    logging_steps=1
)

In [24]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_args
)


In [25]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.9083,2.917289
2,3.42,2.916554
3,3.3746,2.916253


TrainOutput(global_step=9, training_loss=2.7984157138400607, metrics={'train_runtime': 13.0247, 'train_samples_per_second': 2.303, 'train_steps_per_second': 0.691, 'total_flos': 89704775024640.0, 'train_loss': 2.7984157138400607, 'epoch': 3.0})

# 使用lora推論

In [27]:
from peft import PeftModel

In [28]:
peft_model_id = "results/checkpoint-9"

In [29]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)


In [30]:
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float32, offload_folder="lora_results/lora_7/temp")

In [31]:
inputs = "summary the following dialogue for me \n"+dataset["train"]['dialogue'][0]
inputs

"summary the following dialogue for me \nAmanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"

In [32]:
input_tokens = tokenizer(inputs, return_tensors="pt")["input_ids"].to('cuda')
attention_mask = tokenizer(inputs, return_tensors="pt")["attention_mask"].to('cuda')

In [33]:
generation_output = peft_model.generate(
    input_ids=input_tokens,
    attention_mask=attention_mask,
    max_new_tokens=250,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


summary the following dialogue for me 
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-) 
I'm a bit confused here, because it's not clear to me what exactly is going on in this conversation.
The first line says that Amanda has just finished baking and wants to give Jerry some of her freshly-baked cookies.
In response to this request, she offers to bake another batch (line two). She then asks if he would like them now or later (lines three & four).
So far so good - but there are still several things which aren't quite right:
1) Why does she say "Do you want some?"? It sounds as though she expects him to accept her offer without any further questions being asked.
2) What happens when he accepts her offer?
3) How do we know whether they actually get made at all?
4) If they don't get made, why doesn't she ask again after a few days?
5) Is it possible that someone else will have offered to make them before she did?
6) Does anyone ever eat these coo

# 合併lora

In [34]:
model = peft_model.merge_and_unload()

In [35]:
model.save_pretrained("./results_merged", safe_serialization = True)

In [36]:
tokenizer.save_pretrained("./results_merged")

('./results_merged/tokenizer_config.json',
 './results_merged/special_tokens_map.json',
 './results_merged/tokenizer.json')