In [1]:
base_model = "google/gemma-3-270m-it" # @param ["google/gemma-3-270m-it","google/gemma-3-1b-it","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it"] {"allow-input":true}
checkpoint_dir = "/mnt/d/ru_norm_gemma"
learning_rate = 5e-5

In [2]:
from datasets import load_dataset

# Load dataset from the Hub
dataset = load_dataset("kenenbek/gemma-russian-normalization-dataset")
print(dataset["train"][0]["messages"])
print(dataset["validation"][0]["messages"])

[{'content': 'После окончания японо - китайской войны Хуан Цзунъин получила приглашение от одной из шанхайских кинокомпаний сняться в картине « Преследование ».', 'role': 'user'}, {'content': 'после окончания японо  китайской войны хуан цзунъин получила приглашение от одной из шанхайских кинокомпаний сняться в картине  преследование .', 'role': 'assistant'}]
[{'content': '106. Чермоев Абдул Меджид Арцуевич / / БСЭ.', 'role': 'user'}, {'content': 'сто шесть. чермоев абдул меджид арцуевич   бсэ.', 'role': 'assistant'}]


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="cuda",
    attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

print(f"Device: {model.device}")
print(f"DType: {model.dtype}")

`torch_dtype` is deprecated! Use `dtype` instead!


Device: cuda:0
DType: torch.bfloat16


In [8]:
from trl import SFTConfig

torch_dtype = model.dtype

args = SFTConfig(
    output_dir=checkpoint_dir,              # directory to save and repository id
    max_length=512,                         # max sequence length for model and packing of the dataset
    packing=False,                          # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=5,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_checkpointing=False,           # Caching is incompatible with gradient checkpointing
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                        # log every step
    save_strategy="epoch",                  # save checkpoint every epoch
    eval_strategy="steps",                  # evaluate checkpoint every epoch
    learning_rate=learning_rate,            # learning rate
    fp16=True if torch_dtype == torch.float16 else False,   # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,  # use bfloat16 precision
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    report_to="wandb",                # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False, # Template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)

In [9]:
from trl import SFTTrainer

# Create Trainer object
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    processing_class=tokenizer,
)

In [None]:
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

# Save the final model again to the Hugging Face Hub
trainer.save_model()

Step,Training Loss,Validation Loss


In [10]:
tokenizer.pad_token

'<pad>'

In [11]:
if tokenizer.pad_token is None:
    print("Tokenizer does not have a pad_token. Setting it to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token
    # Important: Update the model's config to reflect this change
    model.config.pad_token_id = tokenizer.pad_token_id


# --- Print All Configurations for Comparison ---

print("\n" + "="*40)
print("         TOKEN CONFIGURATION OVERVIEW")
print("="*40 + "\n")

# 1. Tokenizer
print("--- 1. Tokenizer ---")
print(f"{'BOS token:':<15} '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"{'EOS token:':<15} '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"{'PAD token:':<15} '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"{'UNK token:':<15} '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
print("-" * 40)

# 2. Model Config
print("--- 2. Model Config (`model.config`) ---")
print(f"{'BOS token ID:':<15} {model.config.bos_token_id}")
print(f"{'EOS token ID:':<15} {model.config.eos_token_id}")
print(f"{'PAD token ID:':<15} {model.config.pad_token_id}")
print("-" * 40)

# 3. Generation Config
print("--- 3. Generation Config (`model.generation_config`) ---")
print(f"{'BOS token ID:':<15} {model.generation_config.bos_token_id}")
print(f"{'EOS token ID:':<15} {model.generation_config.eos_token_id}")
print(f"{'PAD token ID:':<15} {model.generation_config.pad_token_id}")
print("="*40)

# --- Final Check for Alignment ---
if (tokenizer.pad_token_id == model.config.pad_token_id and
    tokenizer.eos_token_id == model.config.eos_token_id and
    tokenizer.bos_token_id == model.config.bos_token_id):
    print("\n✅ All configurations are aligned.")
else:
    print("\n⚠️ Warning: Mismatch detected between tokenizer and model configs.")


         TOKEN CONFIGURATION OVERVIEW

--- 1. Tokenizer ---
BOS token:      '<bos>' (ID: 2)
EOS token:      '<eos>' (ID: 1)
PAD token:      '<pad>' (ID: 0)
UNK token:      '<unk>' (ID: 3)
----------------------------------------
--- 2. Model Config (`model.config`) ---
BOS token ID:   2
EOS token ID:   1
PAD token ID:   0
----------------------------------------
--- 3. Generation Config (`model.generation_config`) ---
BOS token ID:   2
EOS token ID:   [1, 106]
PAD token ID:   0

✅ All configurations are aligned.


In [12]:
# Assuming 'tokenizer' is your loaded tokenizer object
secondary_stop_token_id = 106
decoded_token = tokenizer.decode([secondary_stop_token_id])

print(f"The token with ID {secondary_stop_token_id} is: '{decoded_token}'")

The token with ID 106 is: '<end_of_turn>'
