Install dependencies

In [None]:
%%capture
# `%%capture` hides the (long) pip output for this cell.
# Step 1: install the released Unsloth package; this also pulls in its dependencies.
!pip install unsloth
# Step 2: swap the released package for the latest nightly build from GitHub.
# `--no-deps` leaves the dependencies installed in step 1 untouched, and the
# `&&` means the nightly is only installed if the uninstall succeeded.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Configuration

In [None]:
# Central run settings; the cells below read their parameters from this dict.
config = dict(
    # Base checkpoint to fine-tune. Alternatives kept for quick switching:
    #   "unsloth/gemma-3-1b-it-unsloth-bnb-4bit"
    #   "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
    model_name="unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit",
    # Maximum number of tokens per sequence (tokenizer truncation and model limit).
    max_seq_length=128,
    # Number of rows handed to the tokenizer per `Dataset.map` batch.
    batch_size=128,
    # Fraction of the dataset held out for validation.
    valid_ratio=0.2,
    # Text file with one log-key sequence per line.
    dataset_path="datasets/5k_bgl_train.txt",
)

Load the dataset

In [None]:
from datasets import load_dataset

# Read the log file (one example per line) and split it into train/test parts.
# shuffle=False requests an unshuffled split, so the seed has no shuffling to drive.
dataset = (
    load_dataset("text", data_files=config["dataset_path"])["train"]
    .train_test_split(
        test_size=config["valid_ratio"],
        shuffle=False,
        seed=42,
    )
)

Creating the tokenizer

In [None]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors

# Collect every distinct whitespace-separated log key in the dataset file.
# UTF-8 is given explicitly so the result does not depend on the platform's
# default encoding.
all_logkeys = set()
with open(config["dataset_path"], "r", encoding="utf-8") as f:
    for line in f:
        all_logkeys.update(line.split())

print(f"Found {len(all_logkeys)} unique log keys")

special_tokens = ["<bos>", "<eos>", "<pad>", "<unk>"]

# Sort the keys so token ids are identical on every run: iterating a set of
# strings yields a different order after each kernel restart (hash
# randomisation), which would silently change the id <-> log-key mapping.
# Keys that collide with a special-token string are dropped so the ids stay
# contiguous and the special tokens keep ids 0-3.
vocab = special_tokens + sorted(all_logkeys.difference(special_tokens))

vocab_dict = {token: i for i, token in enumerate(vocab)}

# Word-level model: each log key is exactly one token; unseen keys map to <unk>.
raw_tokenizer = Tokenizer(models.WordLevel(vocab=vocab_dict, unk_token="<unk>"))

# Split on whitespace only, and apply no text normalisation.
raw_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
raw_tokenizer.normalizer = normalizers.Sequence([])

# Wrap every encoded sequence as "<bos> ... <eos>".
raw_tokenizer.post_processor = processors.TemplateProcessing(
    single="<bos> $A <eos>",
    special_tokens=[("<bos>", vocab_dict["<bos>"]), ("<eos>", vocab_dict["<eos>"])],
)

# Expose the raw tokenizer through the Transformers tokenizer interface.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    bos_token="<bos>",
    eos_token="<eos>",
    pad_token="<pad>",
    unk_token="<unk>",
    model_max_length=config["max_seq_length"]
)

Tokenization of the dataset

In [None]:
def sliding_window_tokenize(examples):
    """Tokenize a batch of log lines, truncating each to the configured length.

    `examples` is a batch mapping whose "text" entry holds the raw lines; the
    return value is whatever the tokenizer produces for that batch.

    NOTE(review): despite the name, no sliding window is applied here. Lines
    longer than config["max_seq_length"] tokens are simply truncated.
    """
    texts = examples["text"]
    limit = config["max_seq_length"]
    return tokenizer(texts, truncation=True, max_length=limit)


# Tokenize both splits with identical settings (validation first, then train)
# and drop the raw "text" column from the results.
valid_tokenized_dataset, train_tokenized_dataset = [
    dataset[split_name].map(
        sliding_window_tokenize,
        batched=True,
        batch_size=config["batch_size"],
        remove_columns=["text"],
    )
    for split_name in ("test", "train")
]


Loading the model

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modelling; mlm=False turns masked-LM masking off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    seed=42,
)


from transformers import DataCollatorForLanguageModeling
from unsloth import FastModel
import torch
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig


# Load the 4-bit base checkpoint in bfloat16. The second return value is
# discarded because the custom log-key tokenizer is used instead.
model, _ = FastModel.from_pretrained(
    config["model_name"],
    dtype=torch.bfloat16,
    load_in_4bit=True,
    device_map="auto",
    max_seq_length=config["max_seq_length"],
)

# Resize the embedding matrix to the size of the custom vocabulary.
# This must run before the LoRA adapters are attached below.
model.resize_token_embeddings(len(tokenizer))

# Attach LoRA adapters (rank 32, alpha 64) in training mode.
model = FastModel.get_peft_model(
    model,
    r=32,
    lora_alpha=64,
    inference_mode=False,
    finetune_language_layers=True,  # should be left on
    finetune_mlp_modules=True,      # should always be left on
)


Training the model

In [None]:
from transformers import EarlyStoppingCallback

# Stop training once the monitored metric has failed to improve for
# 3 consecutive evaluations (evaluation runs once per epoch, see below).
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    args=SFTConfig(
      output_dir="./output",
      # Evaluate and checkpoint on the same schedule; load_best_model_at_end
      # requires the two strategies to match.
      eval_strategy="epoch",
      save_strategy="epoch",
      learning_rate=2e-4,
      num_train_epochs=200,  # upper bound; early stopping normally ends sooner
      per_device_train_batch_size=3072,
      per_device_eval_batch_size=3072,
      # Track the checkpoint with the lowest validation loss...
      metric_for_best_model="eval_loss",
      greater_is_better=False,
      # ...and restore it when training ends. EarlyStoppingCallback expects
      # this flag; without it the final (not the best) weights stay in memory.
      load_best_model_at_end=True,
      bf16=True,
    ),
)


trainer.train()

Saving the model and tokenizer

In [None]:
import shutil
import os

# Path of the best checkpoint recorded by the trainer, or None when no
# checkpoint was tracked. Kept as-is (not wrapped in str()) so that a missing
# checkpoint is never mistaken for a directory literally named "None".
best_model_source = trainer.state.best_model_checkpoint
destination = "drive/MyDrive/final"  # change this to the path where you want to save the model

# Copy the whole checkpoint directory to its final location.
if best_model_source is not None and os.path.isdir(best_model_source):
    try:
        shutil.copytree(best_model_source, destination, dirs_exist_ok=True)
        print(f"Best model saved to: {destination}")
    except OSError as e:
        # Filesystem failures (shutil.Error included) are reported, not raised,
        # so the rest of the notebook can still be inspected.
        print(f"Error copying best model: {e}")
else:
    print("Best model directory not found. Ensure training completed and best model saving was configured.")

Load the trained model for calculating k

In [None]:
from unsloth import FastModel
from transformers import AutoTokenizer
from peft import PeftModel, PeftConfig
# Reload the tokenizer from the exported model directory.
tokenizer = AutoTokenizer.from_pretrained(
    "drive/MyDrive/final"  # change this to where the model was saved
)

# Reload the fine-tuned model in 4-bit from the same directory, passing the
# tokenizer size as the target vocabulary size. The second return value is
# discarded in favour of the tokenizer loaded above.
model, _ = FastModel.from_pretrained(
    "drive/MyDrive/final",  # change this to where the model was saved
    device_map="auto",
    load_in_4bit=True,
    max_seq_length=config["max_seq_length"],
    resize_model_vocab=len(tokenizer),
)

# Switch the model to Unsloth's inference mode.
model = FastModel.for_inference(model)

Get k

In [None]:
import numpy as np
from tqdm import tqdm
import torch
def calculate_multiple_topk_miss_rates(sequence, topk_candidates):
    """Compute per-sequence top-k miss rates for several values of k at once.

    For each token sequence the model predicts every next token; a position
    is a "hit" for a given k when the true next token is among the k
    highest-scoring predictions. The miss rate of a sequence is
    ``1 - hits / positions``.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        sequence: Sized iterable of token-id lists (one list per sequence).
        topk_candidates: Iterable of k values to evaluate. Duplicates are
            removed; values larger than the model's output vocabulary are
            treated as "the whole vocabulary".

    Returns:
        Dict mapping each k to a NumPy array with one miss rate per input
        sequence, in input order. Empty dict if no candidates were given.

    Special cases (kept from the original implementation):
        * Sequences with no position to predict get a miss rate of 1.0.
        * Sequences with at most 2 positions to predict get 0.0.
        * Non-zero miss rates below 0.01 are raised to 0.01.
    """
    model.eval()
    device = model.device
    topk_candidates = sorted(set(topk_candidates))

    print(f"Starting evaluation with top_k values: {topk_candidates} on device: {device}")

    miss_rates_by_k = {k: [] for k in topk_candidates}
    if not topk_candidates:
        # Nothing to evaluate; avoids running the model for no result.
        return {}

    def _record_for_all_k(value):
        # Append the same miss rate for every k (used for degenerate sequences).
        for k in topk_candidates:
            miss_rates_by_k[k].append(value)

    # Loop-invariant values, computed once instead of once per sequence.
    largest_k = max(topk_candidates[-1], 1)
    unk_token_id = tokenizer.convert_tokens_to_ids("<unk>")

    with torch.no_grad():
        for token_list in tqdm(sequence, desc="Evaluation", unit="sequence"):
            token_tensor = torch.tensor(token_list, dtype=torch.long, device=device).unsqueeze(0)
            input_ids = token_tensor[:, :-1]  # everything but the last token
            labels = token_tensor[:, 1:]      # the token that follows each input position
            total_tokens_to_predict = labels.size(1)

            # Degenerate sequences are handled before the forward pass so no
            # model call is spent on results that would be discarded anyway.
            if total_tokens_to_predict == 0:
                _record_for_all_k(1.0)
                continue
            if total_tokens_to_predict <= 2:
                _record_for_all_k(0.0)
                continue

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids)

            logits = outputs.logits.to(torch.float32)

            # Never let <unk> be predicted.
            if unk_token_id is not None and 0 <= unk_token_id < logits.shape[-1]:
                logits[:, :, unk_token_id] = float('-inf')

            # Clamp k to the logits width; torch.topk raises for a larger k.
            max_k = min(largest_k, logits.shape[-1])
            _, topk_predictions = torch.topk(logits, k=max_k, dim=-1)

            # hits[i, j] is True when the true token at position i has rank j.
            # top-k indices are distinct, so each row has at most one True.
            hits = topk_predictions[0] == labels[0].unsqueeze(-1)
            # correct_within[j] = number of positions whose true token is
            # inside the top-(j + 1) predictions.
            correct_within = hits.sum(dim=0).cumsum(dim=0).tolist()

            for k in topk_candidates:
                correct = correct_within[min(k, max_k) - 1] if k > 0 else 0
                miss_rate = 1.0 - (correct / total_tokens_to_predict)
                # Floor tiny non-zero miss rates at 0.01.
                miss_rate = 0.01 if (0 < miss_rate < 0.01) else miss_rate
                miss_rates_by_k[k].append(miss_rate)

    return {k: np.array(miss_rates) for k, miss_rates in miss_rates_by_k.items()}

# Evaluate every k from 1 up to the tokenizer's vocabulary size on the
# validation sequences, then report the mean miss rate for each k.
tokenizer_vocab_size = len(tokenizer)
topk_candidates = [*range(1, tokenizer_vocab_size + 1)]
miss_rates_dict = calculate_multiple_topk_miss_rates(
    sequence=valid_tokenized_dataset["input_ids"],
    topk_candidates=topk_candidates,
)

for k in miss_rates_dict:
    miss_rates = miss_rates_dict[k]
    print(f"Top-{k} avg miss rate: {miss_rates.mean():.4f}")