To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!
<div class="align-center">
<a href="https://unsloth.ai/"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐
</div>

To install Unsloth on your own computer, follow the installation instructions on our Github page [here](https://docs.unsloth.ai/get-started/installing-+-updating).

You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)


In [None]:
# Импорт необходимых библиотек
import json  # для работы с JSON
import pandas as pd  # для работы с DataFrame
from typing import List, Dict  # для аннотации типов

def extract_json_objects(text):
    """Return every brace-balanced ``{...}`` substring found in ``text``.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Candidates are located by counting braces; braces that appear
    between double quotes are ignored, and a backslash escapes the character
    that follows it. The substrings are returned in order of appearance and
    are NOT validated as JSON here.

    When an opening brace is never closed, scanning resumes one character
    after it, so a balanced object nested inside an unbalanced one is still
    found.
    """
    text = str(text)
    length = len(text)
    found = []
    pos = 0

    while True:
        # Jump straight to the next opening brace (or stop if none is left).
        pos = text.find('{', pos)
        if pos == -1:
            break

        depth = 1              # the brace at `pos` is already open
        inside_quotes = False  # currently between double quotes
        escaped = False        # previous character was an unescaped backslash
        end = None             # index just past the matching closing brace

        for idx in range(pos + 1, length):
            ch = text[idx]

            if ch == '"' and not escaped:
                inside_quotes = not inside_quotes
            elif not inside_quotes:
                if ch == '{':
                    depth += 1
                elif ch == '}':
                    depth -= 1
                    if depth == 0:
                        end = idx + 1
                        break

            # A backslash escapes the next character unless it is itself escaped.
            escaped = ch == '\\' and not escaped

        if end is None:
            # Unbalanced: retry from the character after this brace.
            pos += 1
        else:
            found.append(text[pos:end])
            pos = end

    return found

# Code points removed by repair_json: every C0 control character except
# tab (9) and line feed (10), plus DEL (127).
_JSON_STRIP_TABLE = {code: None for code in (*range(0, 9), *range(11, 32), 127)}


def repair_json(json_str):
    """Strip control characters that commonly break JSON parsing.

    Removes the code points 0-8, 11-31 and 127 from ``json_str`` and returns
    the remaining characters unchanged. Tab and line feed are kept.

    Carriage return (13) is removed, as it always has been by this function.
    Code points 16-31 are now removed as well: JSON does not allow them
    unescaped inside strings, and the previous condition let them through.

    Args:
        json_str: The candidate JSON text.

    Returns:
        The cleaned string.
    """
    return json_str.translate(_JSON_STRIP_TABLE)

def parse_lora_dataframe(df: pd.DataFrame) -> List[Dict]:
    """Turn the JSON records embedded in the ``Summary`` column into SFT rows.

    Each non-null ``Summary`` cell is scanned with ``extract_json_objects``;
    every candidate is cleaned with ``repair_json`` and parsed with
    ``json.loads``. A record is kept only when it

    * parses as JSON,
    * contains ``source_lang``, ``source_text``, ``target_lang``,
      ``target_text`` and ``needs_correction``,
    * has non-blank source and target texts, and
    * has a usable answer: either ``needs_correction`` stringifies to
      ``"false"`` (case-insensitive), or ``validation_notes`` is present and
      non-blank.

    Records that fail any of these checks are skipped silently.

    Args:
        df: DataFrame that must contain a ``Summary`` column.

    Returns:
        A list of dicts with the string keys ``instruction``, ``input`` and
        ``output``.

    Raises:
        KeyError: If ``df`` has no ``Summary`` column.
    """
    if 'Summary' not in df.columns:
        raise KeyError("Колонка 'Summary' не найдена в DataFrame")

    required = ('source_lang', 'source_text', 'target_lang', 'target_text', 'needs_correction')
    results = []

    for summary in df['Summary'].dropna():
        for json_str in extract_json_objects(summary):
            # Only the parse step can fail, so only it is guarded.
            # json.JSONDecodeError is a subclass of ValueError.
            try:
                data = json.loads(repair_json(json_str))
            except (ValueError, RecursionError):
                continue

            if not all(key in data for key in required):
                continue

            source_text = str(data['source_text'])
            target_text = str(data['target_text'])
            if not source_text.strip() or not target_text.strip():
                continue

            if str(data['needs_correction']).lower() == 'false':
                output = "Translation is correct"
            else:
                # Coerce the notes to str like the other text fields. A record
                # that needs correction but has no notes would yield an empty
                # training target, so it is dropped.
                notes = data.get('validation_notes')
                output = '' if notes is None else str(notes)
                if not output.strip():
                    continue

            results.append({
                "instruction": "Find mistakes in translated text.",
                "input": f"{data['source_lang']}: {source_text}\n{data['target_lang']}: {target_text}",
                "output": output,
            })

    return results



In [None]:
# Main script: build class-balanced train/eval splits and save them as JSON
if __name__ == "__main__":
    # Paths to the input workbooks and the output files
    file_name_masterfile = "/content/Датасет для обучения Мастерфайл.xlsx"
    file_name_masterquest = "/content/Датасет для обучения Вопросники.xlsx"
    output_json = "/content/formatted_data.json"
    output_eval_json = "/content/eval_data.json"

    # Read both workbooks
    df_masterfile = pd.read_excel(file_name_masterfile)
    df_masterquest = pd.read_excel(file_name_masterquest)

    # Normalize column names (strip surrounding whitespace)
    df_masterfile.columns = df_masterfile.columns.str.strip()
    df_masterquest.columns = df_masterquest.columns.str.strip()

    combined_df = pd.concat([df_masterfile, df_masterquest], ignore_index=True)
    combined_df['Result_ai'] = combined_df['Result_ai'].str.strip()

    # Split the frame into the 'Fixed' and 'No fix' classes
    fixed_df = combined_df[combined_df['Result_ai'] == 'Fixed']
    no_fix_df = combined_df[combined_df['Result_ai'] == 'No fix']

    # Hold out 10% of each class for eval (keeps the class proportions)
    eval_size = 0.1
    fixed_eval = fixed_df.sample(frac=eval_size, random_state=42)
    no_fix_eval = no_fix_df.sample(frac=eval_size, random_state=42)

    # The remaining rows go to train; eval rows are dropped by index before
    # any duplication below, so duplicated rows come only from the train part
    fixed_train = fixed_df.drop(fixed_eval.index)
    no_fix_train = no_fix_df.drop(no_fix_eval.index)

    # Double the number of 'Fixed' rows
    fixed_train = pd.concat([fixed_train] * 2, ignore_index=True)

    # How many 'No fix' rows are needed ('Fixed' should outnumber them 1.2x)
    desired_no_fix_count = int(len(fixed_train) / 1.2)

    # If there are more 'No fix' rows than needed, take a subset
    if len(no_fix_train) > desired_no_fix_count:
        no_fix_train = no_fix_train.sample(n=desired_no_fix_count, random_state=42)
    else:
        # Otherwise duplicate the existing rows, then trim to the exact count
        # NOTE: divides by len(no_fix_train); raises ZeroDivisionError if there are no 'No fix' train rows
        multiplier = desired_no_fix_count // len(no_fix_train) + 1
        no_fix_train = pd.concat([no_fix_train] * multiplier, ignore_index=True)
        no_fix_train = no_fix_train.sample(n=desired_no_fix_count, random_state=42)

    # Assemble the final train and eval sets; sample(frac=1) shuffles the rows
    train_df = pd.concat([fixed_train, no_fix_train], ignore_index=True).sample(frac=1, random_state=42)
    eval_df = pd.concat([fixed_eval, no_fix_eval], ignore_index=True).sample(frac=1, random_state=42)

    print(f"Исходные данные: Fixed {len(fixed_df)}, No fix {len(no_fix_df)}")
    print(f"Train dataset: Fixed {len(fixed_train)} (x2), No fix {len(no_fix_train)}")
    print(f"Eval dataset: Fixed {len(fixed_eval)}, No fix {len(no_fix_eval)}")
    print(f"Пропорция Fixed/No fix в трейне: {len(fixed_train)/len(no_fix_train):.2f}:1")

    # Parse the Summary column of each split into instruction/input/output records
    train_parsed = parse_lora_dataframe(train_df)
    eval_parsed = parse_lora_dataframe(eval_df)

    # Save the results
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(train_parsed, f, ensure_ascii=False, indent=2)

    with open(output_eval_json, 'w', encoding='utf-8') as f:
        json.dump(eval_parsed, f, ensure_ascii=False, indent=2)

    print(f"Успешно обработано {len(train_parsed)} train и {len(eval_parsed)} eval записей.")

Исходные данные: Fixed 3427, No fix 1551
Train dataset: Fixed 6168 (x2), No fix 5140
Eval dataset: Fixed 343, No fix 155
Пропорция Fixed/No fix в трейне: 1.20:1
Успешно обработано 11005 train и 484 eval записей.


### News

Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).

Read our **[Qwen3 Guide](https://docs.unsloth.ai/basics/qwen3-how-to-run-and-fine-tune)** and check out our new **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants which outperforms other quantization methods!

Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).


### Installation

In [None]:
%%capture
import os
# Colab is detected by looking for "COLAB_" in the joined environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs use --no-deps; xformers is pinned to 0.0.29.post3.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

`FastModel` supports loading nearly any model now! This includes Vision and Text models!

In [None]:
from unsloth import FastModel
import torch

# Reference list of checkpoint names. The from_pretrained call below names its
# model directly and does not index this list.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Gemma-3 12B instruct checkpoint together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Gemma3 patching. Transformers: 4.53.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update a small amount of parameters!

In [None]:
# Attach LoRA adapters to the loaded model; the result is rebound to `model`.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


<a name="Data"></a>
### Data Prep
We now use the `Gemma-3` format for conversation style finetunes. We use [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. Gemma-3 renders multi turn conversations like below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3, phi4, qwen2.5, gemma3` and more.

In [None]:
from unsloth.chat_templates import get_chat_template
# Rebind `tokenizer` to the one returned for the "gemma-3" chat template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

def format_to_gemma(example):
    """Render one record as a single Gemma-3 style two-turn chat string.

    Args:
        example: Mapping with the keys ``instruction``, ``input`` and
            ``output``.

    Returns:
        A dict with one key, ``text``: a user turn holding the instruction and
        the input on separate lines, followed by a model turn holding the
        output. No ``<bos>`` token is included.
    """
    user_turn = "<start_of_turn>user\n{}\n{}<end_of_turn>\n".format(
        example['instruction'], example['input']
    )
    model_turn = "<start_of_turn>model\n{}<end_of_turn>".format(example['output'])
    return {"text": user_turn + model_turn}

In [None]:

from datasets import Dataset
# Build datasets from the parsed records and map format_to_gemma over them;
# it returns a dict with a "text" key for each row.
train_dataset = Dataset.from_list(train_parsed).map(format_to_gemma)
eval_dataset = Dataset.from_list(eval_parsed).map(format_to_gemma)

Map:   0%|          | 0/11005 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!

In [None]:
from unsloth.chat_templates import standardize_data_formats
# NOTE(review): these datasets already carry a rendered "text" column; confirm
# what standardize_data_formats changes for data in this shape.
train_dataset = standardize_data_formats(train_dataset)
eval_dataset = standardize_data_formats(eval_dataset)

Let's see what row 100 looks like!

In [None]:
# Show the record at index 100 of each split.
print(train_dataset[100])
print(eval_dataset[100])


{'instruction': 'Find mistakes in translated text.', 'input': "EN: Good, the camp's intact! Now, how do we get there without raising suspicion...\n\nMr. Fox, where are we...\nFR: Bon, le camp est intact ! Reste à savoir comment s'y rendre sans faire de vagues...\n\nM. Fox, où nous rendons... ", 'output': "The phrase 'où nous rendons' repeats the verb from earlier ('s'y rendre'), which could be avoided for clarity and readability. A suggested alternative is 'où devons-nous'.", 'text': "<start_of_turn>user\nFind mistakes in translated text.\nEN: Good, the camp's intact! Now, how do we get there without raising suspicion...\n\nMr. Fox, where are we...\nFR: Bon, le camp est intact ! Reste à savoir comment s'y rendre sans faire de vagues...\n\nM. Fox, où nous rendons... <end_of_turn>\n<start_of_turn>model\nThe phrase 'où nous rendons' repeats the verb from earlier ('s'y rendre'), which could be avoided for clarity and readability. A suggested alternative is 'où devons-nous'.<end_of_turn>"}


We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`. We remove the `<bos>` token using removeprefix(`'<bos>'`) since we're finetuning. The Processor will add this token before training and the model expects only one.

Let's see how the chat template did! Notice there is no `<bos>` token as the processor tokenizer will be adding one.

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 100 steps to speed things up, but for a full run you can set `num_train_epochs=1` and turn off the step limit with `max_steps=None`.

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!

In [None]:
from trl import SFTTrainer, SFTConfig
# Build the SFT trainer over the rendered "text" column of both datasets.
# NOTE(review): no evaluation strategy is configured here, so eval_dataset is
# presumably not evaluated during training; confirm against the TRL defaults.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        learning_rate=2e-5,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        report_to="none",
        # Extra parameters for saving checkpoints (optional)
        save_steps=25,           # Save a checkpoint every 25 steps
        output_dir="checkpoints", # Checkpoint directory
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/11005 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/484 [00:00<?, ? examples/s]

In [None]:
# Set the logging level before training starts
import transformers
transformers.logging.set_verbosity_info()  # Switch transformers logging to the verbose "info" level

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Mark where user and model turns begin; these are the same turn headers that
# format_to_gemma writes into each example.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/11005 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/484 [00:00<?, ? examples/s]

Let's verify masking the instruction part is done! Let's print the 100th row again.  Notice how the sample only has a single `<bos>` as expected!

In [None]:
# Decode the token ids of training example 100 back to text.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

"<bos><start_of_turn>user\nFind mistakes in translated text.\nEN: Good, the camp's intact! Now, how do we get there without raising suspicion...\n\nMr. Fox, where are we...\nFR: Bon, le camp est intact ! Reste à savoir comment s'y rendre sans faire de vagues...\n\nM. Fox, où nous rendons... <end_of_turn>\n<start_of_turn>model\nThe phrase 'où nous rendons' repeats the verb from earlier ('s'y rendre'), which could be avoided for clarity and readability. A suggested alternative is 'où devons-nous'.<end_of_turn>"

Now let's print the masked out example - you should see only the answer is present:

In [None]:
# Labels equal to -100 are swapped for the pad token id before decoding, then
# the pad token text is replaced by spaces so only the unmasked part is readable.
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

"                                                                                  The phrase 'où nous rendons' repeats the verb from earlier ('s'y rendre'), which could be avoided for clarity and readability. A suggested alternative is 'où devons-nous'.<end_of_turn>"

In [None]:
# @title Show current memory stats
# Record the pre-training baseline; later cells subtract start_gpu_memory from
# the post-training figure. The divisions by 1024 three times convert bytes to GB.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
12.639 GB of memory reserved.


Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [None]:
from transformers import TrainerCallback
import os

class RobustSaveBestModelCallback(TrainerCallback):
    """Snapshot the model when the logged training loss improves.

    After ``start_saving_step`` the callback saves the model and tokenizer to
    ``<output_dir>/step_<N>`` whenever the most recent logged ``loss`` beats
    the best value seen so far, and also on every step that is a multiple of
    ``save_interval``. When training ends, the snapshot of the best step is
    copied to ``<output_dir>/final_best_model``.

    Every failure inside the callback is caught and printed, so a save
    problem never interrupts training.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        output_dir: Directory that receives the snapshots (created if missing).
        save_interval: Also save on every step divisible by this value.
        start_saving_step: Nothing is saved or tracked before this step.
    """

    def __init__(self, model, tokenizer, output_dir="best_model", save_interval=50, start_saving_step=50):
        self.model = model
        self.tokenizer = tokenizer
        self.best_loss = float("inf")
        self.output_dir = output_dir
        self.save_interval = save_interval
        self.start_saving_step = start_saving_step
        self.best_step = 0  # 0 means no best snapshot has been recorded yet
        os.makedirs(output_dir, exist_ok=True)

    def on_step_end(self, args, state, control, **kwargs):
        """Save a snapshot if the latest logged loss is a record or the step is periodic."""
        try:
            log_history = getattr(state, 'log_history', None)
            if not log_history:
                return

            # NOTE(review): the last log entry may belong to the previous step,
            # depending on when the Trainer appends logs relative to this hook.
            last_log = log_history[-1]
            if not isinstance(last_log, dict) or 'loss' not in last_log:
                return

            if state.global_step < self.start_saving_step:
                return

            current_loss = last_log['loss']
            is_best = current_loss < self.best_loss
            if is_best:
                self.best_loss = current_loss
                self.best_step = state.global_step

            if is_best or state.global_step % self.save_interval == 0:
                self._save_model(state.global_step, loss=current_loss, is_best=is_best)

        except Exception as e:
            print(f"⚠️ Ошибка в колбэке: {str(e)}")

    def _save_model(self, step, loss=None, is_best=True):
        """Save model and tokenizer to ``step_<step>`` and report what kind of save it was.

        ``loss`` defaults to the best loss so far. ``is_best`` selects between
        the "new record" message and the periodic-save message, so a periodic
        save is no longer announced as a record.
        """
        try:
            if loss is None:
                loss = self.best_loss
            output_path = os.path.join(self.output_dir, f"step_{step}")
            self.model.save_pretrained(output_path)
            self.tokenizer.save_pretrained(output_path)
            if is_best:
                print(f"🔥 [Step {step}] Новый рекорд! Loss: {loss:.4f}")
            else:
                print(f"💾 [Step {step}] Плановое сохранение. Loss: {loss:.4f}")
        except Exception as e:
            print(f"⚠️ Ошибка при сохранении модели: {str(e)}")

    def on_train_end(self, args, state, control, **kwargs):
        """Copy the best step's snapshot to ``final_best_model``.

        The weights held in memory at this point belong to the last training
        step, not necessarily the best one, so the snapshot already written
        for ``best_step`` is copied instead of saving the model again.
        """
        import shutil

        try:
            if self.best_step > 0:
                best_path = os.path.join(self.output_dir, f"step_{self.best_step}")
                final_path = os.path.join(self.output_dir, "final_best_model")
                if not os.path.isdir(best_path):
                    # The snapshot for the best step was never written (its save failed).
                    print(f"⚠️ Ошибка при финальном сохранении: {best_path} не найден")
                    return
                shutil.copytree(best_path, final_path, dirs_exist_ok=True)
                print(f"🏆 Лучшая модель сохранена (шаг {self.best_step}, loss: {self.best_loss:.4f})")
        except Exception as e:
            print(f"⚠️ Ошибка при финальном сохранении: {str(e)}")

# Register the callback on the trainer
trainer.add_callback(RobustSaveBestModelCallback(model, tokenizer, save_interval=50))


In [None]:
trainer_stats = trainer.train()
# Save the model and tokenizer
trainer.save_model("my_finetuned_model")  # Output directory

# Additionally: save the training logs (if needed)
trainer.save_state()  # Writes trainer_state.json

The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: attention_mask, input, instruction, output, text. If attention_mask, input, instruction, output, text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
skipped Embedding(4096, 1152): 4.5M params
skipped Gemma3TextScaledWordEmbedding(262208, 3840, padding_idx=0): 964.734375M params
skipped: 964.734375M params
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,005 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 32,735,232 of 12,220,060,272 (0.27% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,5.8766
2,5.9063
3,6.9885
4,8.6467
5,7.5261
6,5.8033
7,5.6698
8,5.615
9,5.498
10,5.7928


Saving model checkpoint to checkpoints/checkpoint-25
Image processor saved in checkpoints/checkpoint-25/preprocessor_config.json
chat template saved in checkpoints/checkpoint-25/chat_template.jinja
processor saved in checkpoints/checkpoint-25/processor_config.json
Image processor saved in best_model/step_50/preprocessor_config.json
chat template saved in best_model/step_50/chat_template.jinja
processor saved in best_model/step_50/processor_config.json
Saving model checkpoint to checkpoints/checkpoint-50


🔥 [Step 50] Новый рекорд! Loss: 1.9939


Image processor saved in checkpoints/checkpoint-50/preprocessor_config.json
chat template saved in checkpoints/checkpoint-50/chat_template.jinja
processor saved in checkpoints/checkpoint-50/processor_config.json
Image processor saved in best_model/step_53/preprocessor_config.json
chat template saved in best_model/step_53/chat_template.jinja
processor saved in best_model/step_53/processor_config.json


🔥 [Step 53] Новый рекорд! Loss: 1.9136


Image processor saved in best_model/step_55/preprocessor_config.json
chat template saved in best_model/step_55/chat_template.jinja
processor saved in best_model/step_55/processor_config.json


🔥 [Step 55] Новый рекорд! Loss: 1.7373


Image processor saved in best_model/step_59/preprocessor_config.json
chat template saved in best_model/step_59/chat_template.jinja
processor saved in best_model/step_59/processor_config.json


🔥 [Step 59] Новый рекорд! Loss: 1.2854


Saving model checkpoint to checkpoints/checkpoint-75
Image processor saved in checkpoints/checkpoint-75/preprocessor_config.json
chat template saved in checkpoints/checkpoint-75/chat_template.jinja
processor saved in checkpoints/checkpoint-75/processor_config.json
Image processor saved in best_model/step_100/preprocessor_config.json
chat template saved in best_model/step_100/chat_template.jinja
processor saved in best_model/step_100/processor_config.json
Saving model checkpoint to checkpoints/checkpoint-100


🔥 [Step 100] Новый рекорд! Loss: 1.2854


Image processor saved in checkpoints/checkpoint-100/preprocessor_config.json
chat template saved in checkpoints/checkpoint-100/chat_template.jinja
processor saved in checkpoints/checkpoint-100/processor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Image processor saved in best_model/final_best_model/preprocessor_config.json
chat template saved in best_model/final_best_model/chat_template.jinja
processor saved in best_model/final_best_model/processor_config.json
Saving model checkpoint to my_finetuned_model


🏆 Лучшая модель сохранена (шаг 59, loss: 1.2854)


Image processor saved in my_finetuned_model/preprocessor_config.json
chat template saved in my_finetuned_model/chat_template.jinja
processor saved in my_finetuned_model/processor_config.json


In [None]:
# @title Show final memory and time stats
# The "for training" figures are the difference from start_gpu_memory, which
# was recorded before training; percentages are relative to max_memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

543.3535 seconds used for training.
9.06 minutes used for training.
Peak reserved memory = 12.928 GB.
Peak reserved memory for training = 0.289 GB.
Peak reserved memory % of max memory = 32.682 %.
Peak reserved memory for training % of max memory = 0.731 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
import gc
import torch

# Drop the reference first, then collect, then empty the CUDA cache:
# empty_cache() can only hand back memory that no live tensor still uses, so
# calling it before `del model` releases nothing that belongs to the model.
# NOTE(review): other objects that still reference the model (for example the
# trainer and its callback) will keep it alive; delete them too if the memory
# really has to be reclaimed.
del model  # or `del best_model` if an older one was loaded
gc.collect()
torch.cuda.empty_cache()


In [None]:
# @title Show final memory and time stats
# Same report as the earlier stats cell, re-run after the model was deleted.
# It reuses start_gpu_memory, max_memory and trainer_stats from previous cells.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

543.3535 seconds used for training.
9.06 minutes used for training.
Peak reserved memory = 12.928 GB.
Peak reserved memory for training = 0.289 GB.
Peak reserved memory % of max memory = 32.682 %.
Peak reserved memory for training % of max memory = 0.731 %.


Загружаем базовую модель на GPU (`device_map = "cuda"`) и применяем LoRA-адаптер

In [None]:
from unsloth import FastModel
import os
from datetime import datetime

# 1. Load the base model
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
    device_map = "cuda"
)

# 2. Load the LoRA weights separately
# NOTE(review): this loads the step_100 snapshot, while the recorded training
# output reports the best loss at step 59; confirm which snapshot is intended.
model.load_adapter("best_model/step_100")  # <-- this is the step that applies the LoRA

# 3. Save the combined model
# NOTE(review): no explicit merge call is made in this cell; confirm whether
# save_pretrained writes merged weights or only the adapter.
save_dir = f"gemma3-12b-finetuned-best-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(save_dir, exist_ok=True)

model.save_pretrained(save_dir, safe_serialization=True)
tokenizer.save_pretrained(save_dir)

print(f"Модель (с LoRA) сохранена в: {os.path.abspath(save_dir)}")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--gemma-3-12b-it-unsloth-bnb-4bit/snapshots/ae99a6b1e5587dce5f26e651ab40eca8cc997362/config.json
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
Model config Gemma3Config {
  "architectures": [
    "Gemma3ForConditionalGeneration"
  ],
  "boi_token_index": 255999,
  "bos_token_id": 2,
  "eoi_token_index": 256000,
  "eos_token_id": 106,
  "image_token_index": 262144,
  "initializer_range": 0.02,
  "mm_tokens_per_image": 256,
  "model_type": "gemma3",
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip

==((====))==  Unsloth 2025.7.3: Fast Gemma3 patching. Transformers: 4.53.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--vram-40/snapshots/e334ea2975f98e60c52847955fdf1587bacb0b65/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--gemma-3-12b-it-unsloth-bnb-4bit/snapshots/ae99a6b1e5587dce5f26e651ab40eca8cc997362/config.json
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
Model config Gemma3Config {
  "architectures": [
    "Gemma3ForConditionalGeneration"
  ],
  "boi_token_index": 255999,
  "bos_token_id": 2,
  "eoi_token_index":

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
	257160: AddedToken("<unused1258>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257161: AddedToken("<unused1259>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257162: AddedToken("<unused1260>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257163: AddedToken("<unused1261>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257164: AddedToken("<unused1262>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257165: AddedToken("<unused1263>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257166: AddedToken("<unused1264>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257167: AddedToken("<unused1265>", rstrip=False, lstrip=False, single_word=False, normalized=False

Модель (с LoRA) сохранена в: /content/gemma3-12b-finetuned-best-20250714_191755


In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the Gemma-3 chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template = "gemma-3")

# Prompt text is kept verbatim, including the surrounding newlines and the
# unmatched opening quote.
user_prompt = '''
"Source text (English [EN]):
Marked by death at birth, Sofia thought her visions a blessing until the day they led four dark figures to her family home.

Translation (German [DE]):
Vom Tod bei der Geburt gezeichnet.

Find mistakes in translations and explain them.
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

outputs = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # top_p / top_k follow the Gemma-3 settings quoted in the Inference section;
    # temperature is lowered to 0.1 here instead of the quoted 1.0.
    temperature = 0.1, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\n"Source text (English [EN]):\nMarked by death at birth, Sofia thought her visions a blessing until the day they led four dark figures to her family home.\n\nTranslation (German [DE]):\nVom Tod bei der Geburt gezeichnet.\n\nFind mistakes in translations and explain them.<end_of_turn>\n<start_of_turn>model\nThe translation is incomplete. It only translates the first part of the sentence.<end_of_turn>']

In [None]:
# Ask the model for an English -> Korean translation and decode the whole batch.
user_prompt = '''
Translate from English to Korean:
The wind has picked up, and we're a bit cold
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

outputs = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # top_p / top_k follow the Gemma-3 settings quoted in the Inference section;
    # temperature is lowered to 0.3 here instead of the quoted 1.0.
    temperature = 0.3, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nTranslate from English to Korean:\nThe wind has picked up, and we\'re a bit cold<end_of_turn>\n<start_of_turn>model\nHere are a few options for translating "The wind has picked up, and we\'re a bit cold" into Korean, with varying levels of formality:\n\n**1. Most Common & Natural (Casual):**\n\n*   **바람이 많이 불어서 좀 추워요.** (Barami mani bureose']

In [None]:
# Ask the model for a Russian -> English translation and decode the whole batch.
user_prompt = '''
Translate from Russian to English:
Обсоснодиджейск!
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

outputs = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # top_p / top_k follow the Gemma-3 settings quoted in the Inference section;
    # temperature is lowered to 0.3 here instead of the quoted 1.0.
    temperature = 0.3, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nTranslate from Russian to English:\nОбсоснодиджейск!<end_of_turn>\n<start_of_turn>model\nThis is a tricky one! "Обсоснодиджейск!" is a playful, made-up word. It\'s a portmanteau (a blend of words) combining:\n\n*   **Об** (Ob) - A prefix often used to indicate "about" or "around"\n']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Stream the model's review of an incomplete German translation token by token.
# Prompt text is kept verbatim, including the unmatched opening quote.
user_prompt = '''
"Source text (English [EN]):
Marked by death at birth, Sofia thought her visions a blessing until the day they led four dark figures to her family home.

Translation (German [DE]):
Vom Tod bei der Geburt gezeichnet.

Find mistakes in translations and explain them.
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# skip_prompt = True: only the newly generated text is printed.
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = streamer,
)

Okay, let's analyze the German translation provided and identify the errors.

**The Problem**

The provided translation is incomplete: *Vom Tod bei der Geburt gezeichnet.* which translates to *Marked by death at birth*.

**Explanation**

The translation captures the first part of the English sentence,


In [None]:
from transformers import TextStreamer

# Same translation-review prompt as before, streamed at a lower temperature.
# Prompt text is kept verbatim, including the unmatched opening quote.
user_prompt = '''
"Source text (English [EN]):
Marked by death at birth, Sofia thought her visions a blessing until the day they led four dark figures to her family home.

Translation (German [DE]):
Vom Tod bei der Geburt gezeichnet.

Find mistakes in translations and explain them.
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# skip_prompt = True: only the newly generated text is printed.
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # top_p / top_k follow the Gemma-3 settings quoted in the Inference section;
    # temperature is lowered to 0.3 here instead of the quoted 1.0.
    temperature = 0.3, top_p = 0.95, top_k = 64,
    streamer = streamer,
)

The translation is incomplete. It only translates the first part of the sentence. The rest of the sentence is missing.<end_of_turn>


In [None]:
from transformers import TextStreamer

# Stream the model's review of a Latin-American Spanish UI-string translation.
# Prompt text is kept verbatim, including the unmatched opening quote.
user_prompt = '''
"Source text (English [EN]):
Please construct your string sequence

Translation (Spanish (Latin America) [ES-LA]):
Selecciona las cadenas de apertura y cierre

Find mistakes in translations and explain them.
'''
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : user_prompt}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# skip_prompt = True: only the newly generated text is printed.
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # top_p / top_k follow the Gemma-3 settings quoted in the Inference section;
    # temperature is lowered to 0.3 here instead of the quoted 1.0.
    temperature = 0.3, top_p = 0.95, top_k = 64,
    streamer = streamer,
)

The translation "Selecciona las cadenas de apertura y cierre" is incorrect.

**Explanation of the Mistake:**

The original English phrase "Please construct your string sequence" is asking the user to *create* or *build* a sequence of strings.  The Spanish translation "Selecciona las cadenas de apertura y


In [None]:
from transformers import TextStreamer

# General-knowledge sanity check, streamed token by token.
question = "Why is the sky blue?"
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : question}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# skip_prompt = True: only the newly generated text is printed.
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = streamer,
)

The sky is blue due to a phenomenon called **Rayleigh scattering**. Here's a breakdown of what that means:

**1. Sunlight is a Rainbow (of Colors):**

* Sunlight appears white, but it's actually made up of all the colors of the rainbow: red, orange, yellow,


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the model and the tokenizer/processor files into the local "gemma-3"
# directory. Per the note above this cell, only the LoRA adapters are written,
# not the full model.
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")
# Optional: upload to the Hugging Face Hub instead (fill in account and token).
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

Configuration saved in gemma-3/generation_config.json
Detected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.
To match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`.
Model weights saved in gemma-3/adapter_model.safetensors
Image processor saved in gemma-3/preprocessor_config.json
chat template saved in gemma-3/chat_template.jinja
processor saved in gemma-3/processor_config.json


['gemma-3/processor_config.json']

Now, if you want to load the LoRA adapters we just saved for inference, set `if False` to `if True` in the cell below (it is already enabled here):


In [None]:
# Reload a saved finetune from disk and stream one test answer.
if True:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "/content/my_finetuned_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )

from transformers import TextStreamer

question = "What is Gemma-3?"
messages = [
    {"role": "user", "content": [{"type" : "text", "text" : question}]},
]

# Render the conversation; the generation prompt must be appended for inference.
text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# skip_prompt = True: only the newly generated text is printed.
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = streamer,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--gemma-3-12b-it-unsloth-bnb-4bit/snapshots/ae99a6b1e5587dce5f26e651ab40eca8cc997362/config.json
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
Model config Gemma3Config {
  "architectures": [
    "Gemma3ForConditionalGeneration"
  ],
  "boi_token_index": 255999,
  "bos_token_id": 2,
  "eoi_token_index": 256000,
  "eos_token_id": 106,
  "image_token_index": 262144,
  "initializer_range": 0.02,
  "mm_tokens_per_image": 256,
  "model_type": "gemma3",
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip

==((====))==  Unsloth 2025.7.3: Fast Gemma3 patching. Transformers: 4.53.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--vram-40/snapshots/e334ea2975f98e60c52847955fdf1587bacb0b65/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--gemma-3-12b-it-unsloth-bnb-4bit/snapshots/ae99a6b1e5587dce5f26e651ab40eca8cc997362/config.json
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
Model config Gemma3Config {
  "architectures": [
    "Gemma3ForConditionalGeneration"
  ],
  "boi_token_index": 255999,
  "bos_token_id": 2,
  "eoi_token_index":

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
	257153: AddedToken("<unused1251>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257154: AddedToken("<unused1252>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257155: AddedToken("<unused1253>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257156: AddedToken("<unused1254>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257157: AddedToken("<unused1255>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257158: AddedToken("<unused1256>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257159: AddedToken("<unused1257>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	257160: AddedToken("<unused1258>", rstrip=False, lstrip=False, single_word=False, normalized=False

Gemma-3 is a family of open-weight, state-of-the-art large language models created by the Gemma team at Google DeepMind. Here's a breakdown of what we know so far, as of late May 2024:

**Key Facts and Features:**

*   


In [None]:
# Export the finetuned model, with the LoRA weights merged in, to the
# "gemma-3-finetune" directory (tokenizer files are written alongside).
if True: # Already enabled; set to False to skip saving the merged finetune
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

Image processor saved in gemma-3-finetune/preprocessor_config.json
chat template saved in gemma-3-finetune/chat_template.jinja
processor saved in gemma-3-finetune/processor_config.json
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
text_config is None, using default Gemma3TextConfig text config.
vision_config is None, using default SiglipVisionConfig vision config.
Configuration saved in gemma-3-finetune/config.json
Configuration saved in gemma-3-finetune/generation_config.json
Model weights saved in gemma-3-finetune/model.safetensors


Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00005.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3-12b-it...


Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [03:27<00:00, 41.45s/it]


In [None]:
# Optional upload of the merged finetune to the Hugging Face Hub.
# Replace HF_ACCOUNT and the token placeholder before enabling.
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..."
    )

In [None]:
# Convert the finetune to GGUF. The first argument is used as the base name:
# the run logged below produced "gemma-3-finetune.Q8_0.gguf".
if True: # Already enabled; set to False to skip the GGUF export
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

Unsloth: Updating system package directories
Unsloth: Install GGUF and other packages
Unsloth GGUF:hf-to-gguf:Loading model: gemma-3-finetune
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3ForConditionalGeneration
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00001-of-00005.safetensors'
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> Q8_0, shape = {3840, 262208}
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00002-of-00005.safetensors'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00003-of-00005.safetensors'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00004-of-00005.safetensors'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00005-of-00005.safetensors'
Unsloth GGUF:hf-to-gguf:output_norm.weight,  

Unsloth: GGUF conversion:   0%|          | 0/100 [00:00<?, ?it/s]

Unsloth GGUF:hf-to-gguf:Model successfully exported to ./
Unsloth: Converted to gemma-3-finetune.Q8_0.gguf with size = 12.5G
Unsloth: Successfully saved GGUF to:
gemma-3-finetune.Q8_0.gguf


In [None]:
# Mount Google Drive, forcing a remount even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Look for the exported GGUF file on the mounted Drive (errors are discarded).
!find /content/drive/MyDrive -name "gemma-3-finetune.Q8_0.gguf" 2>/dev/null

Mounted at /content/drive
/content/drive/MyDrive/gemma-3-finetune.Q8_0.gguf


In [None]:
import shutil

from google.colab import drive

# Mount Google Drive.
drive.mount('/content/drive')

# Copy the exported GGUF file into Google Drive.
# shutil.copy raises if the source is missing or the destination cannot be
# written, so the confirmation below is printed only after a successful copy
# (a shell `cp` would let the cell continue and report success regardless).
shutil.copy("gemma-3-finetune.Q8_0.gguf", "/content/drive/MyDrive/")

print("Файл сохранён в Google Drive: /MyDrive/gemma-3-finetune.Q8_0.gguf")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Файл сохранён в Google Drive: /MyDrive/gemma-3-finetune.Q8_0.gguf


In [None]:
# Установка Ollama
!curl -fsSL https://ollama.com/install.sh | sh

# Запуск сервера Ollama в фоновом режиме
import subprocess
import time

# Запускаем сервер Ollama
ollama_process = subprocess.Popen(["ollama", "serve"])

# Даем серверу время на запуск
time.sleep(5)

# Создаем Modelfile
with open("Modelfile", "w") as f:
    f.write(f"""
FROM ./gemma-3-finetune.Q8_0.gguf
PARAMETER num_ctx 8192
""")

# Создаем модель в Ollama
!ollama create my-gemma -f Modelfile

# Теперь можно запустить модель
print("Запускаем модель...")
!ollama run my-gemma "Привет! Напиши короткое стихотворение о искусственном интеллекте"

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h

In [None]:
# Free memory before running the next step:
# collect unreachable Python objects first, then return PyTorch's unused
# cached GPU blocks.
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

Сохранение

Используем загруженную модель для перевода


In [None]:
# Load the previously saved model with plain transformers, in float16.
# `save_dir` is the timestamped directory name assigned in an earlier cell.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = f"/content/{save_dir}"  # Path to your model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)

In [None]:
# Debug aid: show the type and value of `text`
# (presumably the chat prompt rendered by an earlier cell — verify).
print(type(text), text)


Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# Optionally reload the finetune with Unsloth (disabled by default).
if False:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name="my_finetuned_model",
        max_seq_length=2048,
        load_in_4bit=True,
    )

import torch
from torch import autocast
from transformers import TextStreamer

question = "What is Gemma-3?"
messages = [
    {"role": "user", "content": [{"type": "text", "text": question}]},
]

# Build the prompt as a plain string (tokenize=False), generation prompt included.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Tokenize, then move every input tensor to CUDA without changing its dtype.
encoded = tokenizer(text, return_tensors="pt")
inputs = dict((k, v.to("cuda")) for k, v in encoded.items())

# Move the model to CUDA without changing its dtype.
model = model.to("cuda")

# Streamer that prints only the newly generated text.
streamer = TextStreamer(tokenizer, skip_prompt=True)

generation_kwargs = dict(
    max_new_tokens=64,
    temperature=1.0,
    top_p=0.95,
    top_k=64,
    streamer=streamer,
)

# Run generation under float16 autocast.
with autocast("cuda", dtype=torch.float16):
    _ = model.generate(**inputs, **generation_kwargs)


### Saving to float16 for VLLM

We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3-finetune`. Set `if False` to `if True` to let it run!

In [None]:
# Save the finetuned model (float16, per the section text above) into the
# "gemma-3-finetune" folder together with the tokenizer.
if True: # Already enabled; set to False to skip saving the finetune
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

In [None]:
# Optional upload of the merged finetune to the Hugging Face Hub.
# Replace HF_ACCOUNT and the token placeholder before enabling.
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..."
    )

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now for all models! For now, you can convert easily to `Q8_0, F16 or BF16` precision. `Q4_K_M` for 4bit will come later!

In [None]:
# Convert the finetune to GGUF at Q8_0 precision, using "gemma-3-finetune"
# as the model name/location.
if True: # Already enabled; set to False to skip the GGUF export
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

In [None]:
# Rebuild the finetuned model from the base weights plus a LoRA checkpoint
# and export it to GGUF.
from unsloth import FastModel
import os
from datetime import datetime

# 1. Load the base model
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
)

# 2. Apply the LoRA weights
# NOTE(review): this uses checkpoint step_75, while an earlier cell in this
# notebook loads best_model/step_100 — confirm which checkpoint is intended.
model.load_adapter("best_model/step_75")

# 3. Merge LoRA into the base model
# NOTE(review): FastModel.merge_lora is not used anywhere else in this
# notebook — confirm it exists in the installed Unsloth version.
model = FastModel.merge_lora(model)

# 4. Save as GGUF
# NOTE(review): an earlier run that passed "gemma-3-finetune" produced
# "gemma-3-finetune.Q8_0.gguf", so this argument is presumably a base name
# rather than the final file path — verify the resulting file name.
model.save_pretrained_gguf(
    "gemma3-12b-finetuned.gguf",  # intended as the path of the resulting .gguf file
    quantization_type = "Q8_0",   # also available: "F16", "BF16"
)


Likewise, if you want to instead push to GGUF to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

In [None]:
# Optional upload of the GGUF export to the Hugging Face Hub.
# Replace HF_ACCOUNT and the token placeholder before enabling.
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
        token = "hf_...",
    )

Now, use the exported `gemma-3-finetune.Q8_0.gguf` file (or the `F16` / `BF16` file, if you chose that precision) in llama.cpp or a UI-based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!

<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>

  Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
</div>
