In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_from_disk
import accelerate

In [None]:
# Sanity-check the GPU setup before loading the model.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# get_device_name(0) raises when no CUDA device is present, so only query it
# when one is actually available; otherwise report that explicitly.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device detected')

In [None]:
# The model weights and the tokenizer are both loaded from the same local
# checkpoint directory.
tokenizer_path = './smollm2_model_1'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForCausalLM.from_pretrained(tokenizer_path)
model = model.cuda()  # move the model onto the default CUDA device

In [None]:
# Load the dataset previously saved to disk; later cells index it by split
# name (e.g. dataset['train']).
dataset = load_from_disk('./tokenized_dataset')

In [None]:
# Make sure a padding token exists. EOS is reused as padding only when the
# tokenizer does not already define a pad token, so an existing dedicated pad
# token is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Take the id from the tokenizer so the model config and tokenizer always agree
# on which token is padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Data collator for causal language modeling. It batches (and pads) examples
# and builds the labels; it does NOT truncate -- truncation happens at
# tokenization time.
# NOTE(review): when pad == EOS the collator presumably masks real EOS tokens
# out of the loss together with padding -- confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective, no masked language modeling
)

In [None]:
# Training hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',           # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluation stays at its default ("no"). The explicit
    # `evaluation_strategy` kwarg was renamed to `eval_strategy` in newer
    # transformers releases, so omitting it keeps this cell working on both
    # old and new versions with identical behavior.
    fp16=True,                        # mixed precision; requires a CUDA GPU
    dataloader_num_workers=4,
    max_grad_norm=0.1,                # gradient clipping threshold
)

In [None]:
def preprocess_dataset(dataset, tokenizer, text_column='input_text', max_length=249):
    """Tokenize a text column of every split and drop the original columns.

    Args:
        dataset: Split-indexed dataset object exposing ``map`` and a
            ``'train'`` split whose ``column_names`` lists the columns to
            remove after tokenization.
        tokenizer: Callable tokenizer; invoked with a batch of texts plus
            ``truncation``, ``padding`` and ``max_length`` keyword arguments.
        text_column: Name of the column holding the raw text.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Whatever ``dataset.map`` returns: the dataset with the tokenizer
        outputs in place of the columns listed by the ``'train'`` split.
    """
    def tokenize_function(examples):
        # Pad to a fixed length so every example has the same size.
        return tokenizer(
            examples[text_column],
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )

    # The column list of the 'train' split is used for removal, so only the
    # tokenizer outputs remain afterwards.
    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset['train'].column_names,
    )

# Apply the preprocessing step to the loaded dataset.
processed_dataset = preprocess_dataset(dataset=dataset, tokenizer=tokenizer)

In [None]:
# Assemble the Trainer from the objects built in the previous cells. Only a
# training split is supplied; no eval_dataset is passed.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=processed_dataset['train'],
)

In [None]:
# Start fine-tuning with the configuration above; as the last expression of
# the cell, the value returned by train() is displayed as the cell output.
trainer.train()