In [1]:
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Tải dữ liệu
tokenized_dataset = load_from_disk("./tokenized_dataset")

In [3]:
# 2. Tải model và tokenizer
model_path = "./smollm2_model_1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.bfloat16
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [4]:
tokenized_dataset = tokenized_dataset.map(lambda x: tokenizer(x["input_text"], padding=True, truncation=True))

Map: 100%|██████████| 99698/99698 [00:15<00:00, 6258.77 examples/s]


In [5]:
# 3. Cấu hình LoRA
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 921,600 || all params: 135,436,608 || trainable%: 0.6805


In [6]:
# 4. Cấu hình TrainingArguments
training_args = TrainingArguments(
    output_dir="./lora_smollm2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_steps=500,
    logging_steps=100,
    remove_unused_columns=False,
    save_total_limit=3,
    push_to_hub=False,
    optim="paged_adamw_8bit",  # Sử dụng optimizer 8bit để tiết kiệm bộ nhớ
)

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, truncation=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


TypeError: DataCollatorWithPadding.__init__() got an unexpected keyword argument 'truncation'

In [None]:
trainer.train()

In [None]:
# 6. Lưu model đã fine-tune
model.save_pretrained("./lora_smollm2")