Install dependencies

In [None]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Configuration

In [None]:
# Experiment settings shared by the cells below.
# Alternative base checkpoints that can be swapped in for "model_name":
#   "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
#   "unsloth/gemma-3-1b-it-unsloth-bnb-4bit"
config = dict(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",  # checkpoint handed to the model loader
    max_seq_length=128,  # token limit used by the tokenizer and the model loader
    batch_size=128,  # batch size of the dataset.map tokenization calls
    valid_ratio=0.2,  # share of the dataset lines held out as the validation split
    dataset_path="datasets/5k_bgl_train.txt",  # text file that is read line by line
)

Load the dataset

In [None]:
from datasets import load_dataset

# Read the file with the "text" builder; the result is accessed through its "train" split.
raw_dataset = load_dataset("text", data_files=config["dataset_path"])
print(raw_dataset)

# Carve out the validation part (exposed as the "test" split). shuffle=False keeps
# the rows in file order, so the split is the same on every run.
dataset = raw_dataset["train"].train_test_split(
    test_size=config["valid_ratio"],
    shuffle=False,
    seed=42,
)
print(dataset)

Creating the tokenizer

In [None]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors

# Collect every distinct whitespace-separated log key that occurs in the dataset file.
all_logkeys = set()
with open(config["dataset_path"], "r", encoding="utf-8") as f:
    for line in f:
        # str.split() without arguments already ignores leading/trailing whitespace.
        all_logkeys.update(line.split())

print(f"Found {len(all_logkeys)} unique log keys")


# The special tokens occupy the first ids and the log keys follow in sorted order.
# Iterating the set directly would give a different order after every kernel restart
# (string hashes are randomized per process), i.e. different token ids on every run.
# Keys that happen to equal a special token are skipped so the ids stay contiguous.
special_tokens = ["<bos>", "<eos>", "<pad>", "<unk>"]
vocab = special_tokens + sorted(all_logkeys.difference(special_tokens))

vocab_dict = {token: i for i, token in enumerate(vocab)}

# Word-level model: one id per vocabulary entry, anything else maps to <unk>.
raw_tokenizer = Tokenizer(models.WordLevel(vocab=vocab_dict, unk_token="<unk>"))

# Split the input on whitespace only, matching how the vocabulary was collected.
raw_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# Empty normalizer sequence: the text is passed through unchanged.
raw_tokenizer.normalizer = normalizers.Sequence([])

# Wrap every encoded sequence as "<bos> ... <eos>".
raw_tokenizer.post_processor = processors.TemplateProcessing(
    single="<bos> $A <eos>",
    special_tokens=[("<bos>", vocab_dict["<bos>"]), ("<eos>", vocab_dict["<eos>"])],
)

# Expose the raw tokenizer through the transformers tokenizer interface.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    bos_token="<bos>",
    eos_token="<eos>",
    pad_token="<pad>",
    unk_token="<unk>",
    model_max_length=config["max_seq_length"]
)

Tokenization of the dataset

In [None]:
def sliding_window_tokenize(examples):
    """Tokenize the "text" column of a batch with the custom tokenizer.

    Sequences are truncated to config["max_seq_length"] tokens. Note that,
    despite the function's name, no overlapping windows are produced here.

    Args:
        examples: batch mapping that provides the raw lines under "text".

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    max_len = config["max_seq_length"]
    return tokenizer(texts, truncation=True, max_length=max_len)

# Both splits are tokenized with identical settings; the raw "text" column is
# dropped so that only the tokenizer outputs remain.
map_kwargs = dict(
    batched=True,
    batch_size=config["batch_size"],
    remove_columns=["text"],
)

valid_tokenized_dataset = dataset["test"].map(sliding_window_tokenize, **map_kwargs)

train_tokenized_dataset = dataset["train"].map(sliding_window_tokenize, **map_kwargs)

Loading the model

In [None]:
from transformers import DataCollatorForLanguageModeling
from unsloth import FastModel
import torch
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# Collator for language modeling without masked-LM (mlm=False), built around the custom tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, seed=42)


# Load the configured checkpoint in 4-bit with bfloat16 as dtype. The second return
# value (presumably the checkpoint's own tokenizer) is discarded because the custom
# log-key tokenizer is used instead.
load_kwargs = dict(
    max_seq_length=config["max_seq_length"],
    load_in_4bit=True,
    device_map="auto",
    dtype=torch.bfloat16,
)
model, _ = FastModel.from_pretrained(config["model_name"], **load_kwargs)

# Match the size of the embedding table to the custom vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Wrap the model with PEFT adapters (r=32, lora_alpha=64) for training.
lora_kwargs = dict(
    r=32,
    lora_alpha=64,
    finetune_language_layers=True,
    finetune_mlp_modules=True,
    inference_mode=False,
)
model = FastModel.get_peft_model(model, **lora_kwargs)

Training the model

In [None]:
from transformers import EarlyStoppingCallback

# Stop training once the monitored metric (eval_loss) has not improved for
# 3 consecutive evaluations; evaluation runs once per epoch.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    args=SFTConfig(
        output_dir="./output",
        # Evaluation and checkpointing share the same schedule, which
        # load_best_model_at_end requires.
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-4,
        # Upper bound only; early stopping normally ends the run sooner.
        num_train_epochs=100,
        # NOTE(review): 1028 looks like it could be a typo for 1024, and
        # config["batch_size"] is not used here -- confirm the intended value.
        per_device_train_batch_size=1028,
        per_device_eval_batch_size=1028,
        metric_for_best_model="eval_loss",
        # Restore the checkpoint with the best eval_loss when training ends. Without
        # this, early stopping leaves the weights of the last epoch in `model`, i.e.
        # the ones obtained after the patience epochs without improvement.
        load_best_model_at_end=True,
        bf16=True,
    ),
)


# Start training
trainer.train()

Saving the model and tokenizer

In [None]:
# Output locations; change these paths to store the trained model and the tokenizer elsewhere.
model_save_dir = "trained_models/model"
tokenizer_save_dir = "tokenizers/tokenizer"

model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(tokenizer_save_dir)