# **Training Gemma 3 Model for Chukchi to Russian Translation**

## **Imports**

In [1]:
!pip install bitsandbytes accelerate transformers peft datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
# !pip install -U flash-attn --no-build-isolation

In [2]:
import os
import shutil
import torch
import gc
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForSeq2Seq, TrainerCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import notebook_login, create_repo, upload_folder
import pandas as pd
import json

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## **Connecting to Google Drive**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
DRIVE_PATH = "/content/drive/MyDrive/gemma3_lora_ckt_ru"
os.makedirs(DRIVE_PATH, exist_ok=True)

## **GPU Configuration**

In [6]:
print(f"CUDA доступна: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA доступна: True
GPU: NVIDIA L4


In [7]:
device_map = "auto"
device_capability = torch.cuda.get_device_capability()[0]

## **Data Preparation**

In [8]:
# dataset = load_dataset("csv", data_files="ckt-ru_filtered.csv", sep=";")

df = pd.read_csv("ckt-ru_filtered.csv", sep=";")

# Converting to Dataset
dataset = Dataset.from_pandas(df)

In [9]:
dataset = dataset.rename_columns({
    "ckt": "ckt",
    "ru": "rus"
})

In [10]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

### **Tokenizer Preparation**

In [11]:
model_name = 'google/gemma-3-1b-pt'
source_lang = 'ckt'
target_lang = 'ru'

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [13]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=quant_config,
    # attn_implementation="flash_attention_2" if device_capability >= 8 else "sdpa",
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [15]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

## **LoRA Adapter Configuration**

In [16]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 1,490,944 || all params: 1,001,376,896 || trainable%: 0.1489


## **Preprocessing**

In [17]:
def generate_prompt(example):
    return f"Переведи с чукотского на русский: {example['ckt']}"

In [20]:
def tokenize(example):
    # Create full text: prompt + translation
    prompt = generate_prompt(example)
    full_text = f"{prompt}\n{example['rus']}"

    # Tokenize full text
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Calculate prompt length in tokens
    with tokenizer.as_target_tokenizer():
        prompt_ids = tokenizer(prompt)["input_ids"]

    input_length = len(prompt_ids)

    # Create loss mask (ignore prompt)
    labels = tokenized["input_ids"].clone()
    labels[:, :input_length] = -100

    return {
        "input_ids": tokenized["input_ids"][0],
        "attention_mask": tokenized["attention_mask"][0],
        "labels": labels[0]
    }

In [21]:
tokenized_train_dataset = train_dataset.map(tokenize, remove_columns=train_dataset.column_names)
tokenized_eval_dataset = eval_dataset.map(tokenize, remove_columns=eval_dataset.column_names)

Map:   0%|          | 0/53806 [00:00<?, ? examples/s]



Map:   0%|          | 0/13452 [00:00<?, ? examples/s]

In [22]:
tokenized_train_dataset.save_to_disk(f"{DRIVE_PATH}/train_dataset")
tokenized_eval_dataset.save_to_disk(f"{DRIVE_PATH}/eval_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/53806 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13452 [00:00<?, ? examples/s]

In [23]:
# Filtering out overly long examples
max_length = 512
tokenized_train_dataset = tokenized_train_dataset.filter(
    lambda x: len(x["input_ids"]) <= max_length
)
tokenized_eval_dataset = tokenized_eval_dataset.filter(
    lambda x: len(x["input_ids"]) <= max_length
)

Filter:   0%|          | 0/53806 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13452 [00:00<?, ? examples/s]

## **Model Training**

Custom class for saving checkpoints to Google Drive

In [24]:
class DriveCheckpointCallback(TrainerCallback):
    def __init__(self, max_to_keep=2):
        self.max_to_keep = max_to_keep

    def on_save(self, args, state, control, **kwargs):
        if state.is_world_process_zero:
            checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
            drive_checkpoint_path = os.path.join(DRIVE_PATH, f"checkpoint-{state.global_step}")

            try:
                if not os.path.exists(drive_checkpoint_path):
                    shutil.copytree(checkpoint_path, drive_checkpoint_path)
                    print(f"Checkpoint saved to Google Drive: {drive_checkpoint_path}")
                else:
                    print(f"Skipped copying: {drive_checkpoint_path} already exists.")
            except Exception as e:
                print(f"Copy error: {str(e)}")

            # Cleaning up old checkpoints
            checkpoints = [d for d in os.listdir(DRIVE_PATH) if d.startswith("checkpoint-")]
            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))

            if len(checkpoints) > self.max_to_keep:
                for ckpt in checkpoints[:-self.max_to_keep]:
                    path_to_remove = os.path.join(DRIVE_PATH, ckpt)
                    try:
                        shutil.rmtree(path_to_remove)
                        print(f"Removed old checkpoint: {path_to_remove}")
                    except Exception as e:
                        print(f"Deletion error {path_to_remove}: {e}")

Custom class for saving logs to Google Drive (for later visualization)

In [25]:
class LossHistoryCallback(TrainerCallback):
    def __init__(self, save_path):
        self.save_path = save_path
        self.loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return
        record = {"Step": state.global_step}
        if "loss" in logs:
            record["Training_Loss"] = logs["loss"]
        if "eval_loss" in logs:
            record["Validation_Loss"] = logs["eval_loss"]
        self.loss_history.append(record)

        # Saving to file
        with open(self.save_path, "w") as f:
            json.dump(self.loss_history, f, indent=2)

In [26]:
os.makedirs("/content/drive/MyDrive/gemma3_logs", exist_ok=True)

In [27]:
loss_json_path = "/content/drive/MyDrive/gemma3_logs/losses_gemma3_ckt_rus_b1.json"
loss_callback = LossHistoryCallback(save_path=loss_json_path)

In [28]:
# Dynamic padding collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

In [29]:
training_args = TrainingArguments(
    output_dir=DRIVE_PATH,
    logging_dir=os.path.join(DRIVE_PATH, "logs"),
    logging_steps=50,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    max_steps=1000,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    gradient_checkpointing=True,
    bf16=device_capability >= 8,   # Modern GPUs
    fp16=device_capability < 8,    # Older GPUs
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="none",
    group_by_length=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [30]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[loss_callback, DriveCheckpointCallback(max_to_keep=2)]
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [31]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
200,3.2021,3.242111
400,3.161,3.156963
600,3.0806,3.106785
800,3.0362,3.078249
1000,2.9733,3.071605


Skipped copying: /content/drive/MyDrive/gemma3_lora_ckt_ru/checkpoint-200 already exists.
Skipped copying: /content/drive/MyDrive/gemma3_lora_ckt_ru/checkpoint-400 already exists.




Skipped copying: /content/drive/MyDrive/gemma3_lora_ckt_ru/checkpoint-600 already exists.




Skipped copying: /content/drive/MyDrive/gemma3_lora_ckt_ru/checkpoint-800 already exists.
Skipped copying: /content/drive/MyDrive/gemma3_lora_ckt_ru/checkpoint-1000 already exists.


TrainOutput(global_step=1000, training_loss=3.147610626220703, metrics={'train_runtime': 7034.6786, 'train_samples_per_second': 1.137, 'train_steps_per_second': 0.142, 'total_flos': 1381899206270976.0, 'train_loss': 3.147610626220703, 'epoch': 0.14868230308887484})

In [None]:
# Memory cleanup
del trainer
torch.cuda.empty_cache()
gc.collect()

In [32]:
final_model_path = os.path.join(DRIVE_PATH, "final_model")
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

('/content/drive/MyDrive/gemma3_lora_ckt_ru/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/gemma3_lora_ckt_ru/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/gemma3_lora_ckt_ru/final_model/tokenizer.model',
 '/content/drive/MyDrive/gemma3_lora_ckt_ru/final_model/added_tokens.json',
 '/content/drive/MyDrive/gemma3_lora_ckt_ru/final_model/tokenizer.json')

In [33]:
# Weight merging for full model save
merged_model = model.merge_and_unload()
merged_model.save_pretrained(os.path.join(DRIVE_PATH, "merged_model"))



## **Model Testing**

In [36]:
# Load best model
model_path = os.path.join(DRIVE_PATH, "merged_model")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [34]:
# Translation generation
test_prompt = 'Переведи с чукотского на русский: Аройыръыка қынур арака.'
inputs = tokenizer(test_prompt, return_tensors="pt", padding=True).to(model.device)

In [37]:
model.eval()
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        num_beams=5,
        early_stopping=True,
        temperature=0.7,
        do_sample=True
    )

In [38]:
translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Translation result:", translation)

Translation result: Переведи с чукотского на русский: Аройыръыка қынур арака.
Они вышли из дома.
Они вышли из дома. Они вышли из дома. Они вышли из дома. Они вышли из дома. Они вышли из дома. Они вышли из дома. Они вышли из


## **Uploading Model to Hugging Face**

In [39]:
from huggingface_hub import HfApi

# Repository configuration
repo_name = "gemma3-1b-ckt-rus"
organization = "HSE-Chukchi-NLP"

# Creating repository
create_repo(f"{organization}/{repo_name}", repo_type="model")

# Uploading model
api = HfApi()
api.upload_folder(
    folder_path=os.path.join(DRIVE_PATH, "merged_model"),
    repo_id=f"{organization}/{repo_name}",
    repo_type="model"
)

print(f"Model uploaded: https://huggingface.co/{organization}/{repo_name}")

adapter_model.safetensors:   0%|          | 0.00/5.99M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

Model uploaded: https://huggingface.co/HSE-Chukchi-NLP/gemma3-1b-ckt-rus
