In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_from_disk
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
import torch
from tqdm import tqdm

# 1. Load model và tokenizer
model_path = "./smollm2_model_1"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Sử dụng eos_token làm pad_token
tokenizer.pad_token = tokenizer.eos_token

# 2. Load dataset
dataset = load_from_disk("./tokenized_dataset")
train_dataset = dataset['train']

# 3. DataCollator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,  # Sử dụng pad_token
    return_tensors="pt"
)

# 4. DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator
)

# 5. Optimizer và Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * 3  # 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# 6. Huấn luyện model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(3):  # 3 epochs
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# 7. Lưu model đã fine-tune
model.save_pretrained("./lora_finetuned_model")
tokenizer.save_pretrained("./lora_finetuned_model")


  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/18696 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).