In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_from_disk
import evaluate
import numpy as np
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_dir = './smollm2_model_1'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(49153, 576)

In [3]:
tokenized_dataset = load_from_disk('./tokenized_dataset')
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 99698
    })
})


In [4]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [5]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    perplexity = math.exp(np.mean([-np.log(pred[label]) for pred,label in zip(predictions,labels) if label != -100 ]))
    return {"perplexity": perplexity}

In [6]:
training_args = TrainingArguments(
    output_dir="./smollm2-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=False,
    learning_rate=5e-5,
    num_train_epochs=3,
    save_total_limit=3,
    logging_dir="./logs",
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)



In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['train'], # Bạn có thể thay bằng tập validation nếu có
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [8]:
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
trainer.save_model("./smollm2-finetuned")
tokenizer.save_pretrained("./smollm2-finetuned")
