<a href="https://colab.research.google.com/github/lennartvoelz/fine_tune_hf/blob/main/FunctionGemma_(270M)_finetune_close.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Unsloth and other dependencies

In [1]:
%%capture
# Install Unsloth and its dependencies; %%capture keeps the pip output out of the notebook.
import os, re
# Colab is detected by looking for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading "major.minor" part of the installed torch version.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    # Pick the xformers pin from the torch version.
    # NOTE(review): every torch version other than 2.9 / 2.8 falls through to
    # 0.0.29.post3 — confirm that pin suits the torch build actually installed.
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # These packages are installed with --no-deps, i.e. without their own dependency trees.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions installed in both branches (Colab and non-Colab).
!pip install transformers==4.57.3
!pip install --no-deps trl==0.22.2

In [3]:
!pip install modelscope
import json
# Set the flag before `unsloth` is imported in the following cell.
# NOTE(review): presumably this makes Unsloth fetch models through ModelScope —
# confirm it is intended, since the load cell also passes a Hugging Face token.
os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'

Collecting modelscope
  Downloading modelscope-1.34.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading modelscope-1.34.0-py3-none-any.whl (6.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: modelscope
Successfully installed modelscope-1.34.0


### Unsloth

In [4]:
from unsloth import FastLanguageModel
import torch
from google.colab import userdata
from datasets import load_dataset

# Fetch the value stored under 'HF_TOKEN' through Colab's `userdata` helper.
hf_token = userdata.get('HF_TOKEN')
# Sequence length handed to the model loader below.
max_seq_length = 2048

# Load the "google/functiongemma-270m-it" checkpoint together with its tokenizer.
# Only 4-bit loading is requested; 8-bit, 16-bit and full finetuning are switched off.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/functiongemma-270m-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    load_in_16bit = False, # [NEW!] Enables 16bit LoRA
    full_finetuning = False, # [NEW!] We have full finetuning now!
    token = hf_token, # HF Token for gated models
)

==((====))==  Unsloth 2026.2.1: Fast Gemma3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


We now add LoRA adapters so we only need to update a small amount of parameters!

In [5]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    # Non-zero dropout: Unsloth logs that it then skips fast patching of the
    # LoRA matrices, which costs performance.
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients


Formatting is very important for tool calling with `functiongemma`.

In [10]:
import json

from google.colab import drive
# Mount Google Drive; the dataset is read from SAVE_DIR.
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/data/"

# Parse examples.jsonl: one JSON object per non-blank line.
with open(SAVE_DIR + "examples.jsonl", "r", encoding="utf-8") as f:
    raw_data = [
        json.loads(stripped)
        for stripped in (row.strip() for row in f)
        if stripped
    ]

def prepare_example(example):
    """Convert one raw dataset record into chat messages plus tool definitions.

    Args:
        example: Dict parsed from the JSONL file.
            ``example["messages"][0]["content"]`` is used as the user prompt.
            Optional keys:
              * ``expected_calls`` - list of ``{"name": ..., "arguments": ...}``
                dicts describing the tool calls the assistant should make.
              * ``expected_response`` - plain-text answer, used only when no
                tool call is expected.
              * ``tools`` - list of ``{"name", "description", "parameters"}``
                dicts describing the available tools.
            Optional keys may be missing or null.

    Returns:
        Tuple ``(messages, adapted_tools)``. ``messages`` holds a system turn,
        the user turn and one assistant turn. ``adapted_tools`` is the tool
        list with each entry wrapped as ``{"type": "function", "function": ...}``.

    Raises:
        KeyError, IndexError: If the record has no first message with a
            ``content`` field, or an expected call has no ``name``.
    """
    user_msg = example["messages"][0]["content"]
    # `or []` also covers keys that are present but null in the record.
    expected_calls = example.get("expected_calls") or []
    tools_list = example.get("tools") or []

    # 1. System prompt followed by the user request.
    messages = [
        {"role": "system", "content": "You are a helpful assistant with access to tools."},
        {"role": "user", "content": user_msg},
    ]

    # 2. Assistant turn.
    if expected_calls:
        # Expected tool calls go into the structured `tool_calls` field rather
        # than being written as a string into `content`; the original author
        # notes that the Jinja chat template fails otherwise.
        tool_calls = [
            {
                "type": "function",
                "function": {
                    "name": call["name"],
                    # Keep the arguments as a dict (not a json.dumps string).
                    # A call without arguments becomes an empty dict.
                    "arguments": call.get("arguments") or {},
                },
            }
            for call in expected_calls
        ]
        messages.append({
            "role": "assistant",
            "content": "",  # Left empty when the turn consists only of tool calls
            "tool_calls": tool_calls,
        })
    else:
        # No tool call: an ordinary text answer (empty string if none is given).
        messages.append({
            "role": "assistant",
            "content": example.get("expected_response") or "",
        })

    # 3. Tool definitions, each wrapped in the nested
    #    {"type": "function", "function": {...}} layout.
    adapted_tools = [
        {
            "type": "function",
            "function": {
                "name": tool.get("name", ""),
                "description": tool.get("description", ""),
                # Tools without a schema get an empty object schema.
                "parameters": tool.get("parameters") or {
                    "type": "object",
                    "properties": {},
                },
            },
        }
        for tool in tools_list
    ]

    return messages, adapted_tools

def format_tools_for_prompt(example):
    """Render one raw example into a single training string.

    The messages and tool definitions come from ``prepare_example``. They are
    rendered with the tokenizer's chat template (no tokenization, no generation
    prompt) and a leading "<bos>" is stripped from the result.
    NOTE(review): presumably the marker is removed because tokenization adds
    its own later — confirm.

    Returns:
        Dict with a single "text" key holding the rendered conversation.
    """
    chat_messages, tool_specs = prepare_example(example)
    rendered = tokenizer.apply_chat_template(
        chat_messages,
        tools = tool_specs,
        add_generation_prompt = False,
        tokenize = False,
    )
    return {"text": rendered.removeprefix("<bos>")}

# Render every raw example into its {"text": ...} training record.
chat_data = [format_tools_for_prompt(example) for example in raw_data]

# Persist the records as JSON Lines (one object per line, non-ASCII kept as-is).
with open("chat_ready.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in chat_data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'name': 'weather_sf', 'difficulty': 'easy', 'messages': [{'role': 'user', 'content': 'What is the weather in San Francisco?'}], 'tools': [{'name': 'get_weather', 'description': 'Get current weather for a location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'City name'}}, 'required': ['location']}}], 'expected_calls': [{'name': 'get_weather', 'arguments': {'location': 'San Francisco'}}]}
{'name': 'alarm_10am', 'difficulty': 'easy', 'messages': [{'role': 'user', 'content': 'Set an alarm for 10 AM.'}], 'tools': [{'name': 'set_alarm', 'description': 'Set an alarm for a given time', 'parameters': {'type': 'object', 'properties': {'hour': {'type': 'integer', 'description': 'Hour to set the alarm for'}, 'minute': {'type': 'integer', 'description': 'Minute to set the alarm for'}}, 'required': ['hour', 'minute']}}],

In [11]:
# Load the rendered records from chat_ready.jsonl as the "train" split.
dataset = load_dataset("json", data_files="chat_ready.jsonl", split="train")

# Show the first record to check the rendered chat template by eye.
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'text': '<start_of_turn>developer\nYou are a helpful assistant with access to tools.<start_function_declaration>declaration:get_weather{description:<escape>Get current weather for a location<escape>,parameters:{properties:{location:{description:<escape>City name<escape>,type:<escape>STRING<escape>}},required:[<escape>location<escape>],type:<escape>OBJECT<escape>}}<end_function_declaration><end_of_turn>\n<start_of_turn>user\nWhat is the weather in San Francisco?<end_of_turn>\n<start_of_turn>model\n<start_function_call>call:get_weather{location:<escape>San Francisco<escape>}<end_function_call><start_function_response>'}


<a name="Train"></a>
### Train the model
Now let's train our model. We run only 10 steps (`max_steps = 10`) to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off the step limit with `max_steps=None`.

In [12]:
from trl import SFTTrainer, SFTConfig
# Build the supervised fine-tuning trainer over the "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        # 4 per device x 2 accumulation steps = total batch size of 8 on one GPU.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2, # Use GA to mimic batch size!
        # NOTE(review): warmup_steps equals max_steps, so warmup appears to span
        # the entire run — confirm this is intended.
        warmup_steps = 10,
        #num_train_epochs = 2, # Set this for 1 full training run.
        max_steps = 10,
        learning_rate = 8e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.007,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!

In [13]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that the loss is restricted to the model's responses.
# The two markers are the turn headers that appear in the rendered chat text.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

Filter (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

Let's verify that masking of the instruction part works! First, let's print the last row of the training set (the 100th of 100 examples).

In [14]:
# Decode the token ids of the last training row back into text.
tokenizer.decode(trainer.train_dataset[-1]["input_ids"])

"<bos><start_of_turn>developer\nYou are a helpful assistant with access to tools.<start_function_declaration>declaration:search_contacts{description:<escape>Search for a contact by name<escape>,parameters:{properties:{query:{description:<escape>Name to search for<escape>,type:<escape>STRING<escape>}},required:[<escape>query<escape>],type:<escape>OBJECT<escape>}}<end_function_declaration><start_function_declaration>declaration:send_message{description:<escape>Send a message to a contact<escape>,parameters:{properties:{message:{description:<escape>The message content to send<escape>,type:<escape>STRING<escape>},recipient:{description:<escape>Name of the person to send the message to<escape>,type:<escape>STRING<escape>}},required:[<escape>recipient<escape>,<escape>message<escape>],type:<escape>OBJECT<escape>}}<end_function_declaration><start_function_declaration>declaration:create_reminder{description:<escape>Create a reminder with a title and time<escape>,parameters:{properties:{time:{de

Now let's print the labels of another example (row 25) with the masked tokens shown as `-` - you should see that only the answer is present:

In [15]:
# Labels equal to -100 are swapped for the pad token id before decoding, and the
# decoded pad token is then shown as "-", so masked positions appear as dashes.
[tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[25]["labels"]]).replace(tokenizer.pad_token, "-")]

['-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------<start_function_call>call:set_alarm{hour:6,minute:45}<end_function_call><start_function_call>call:create_reminder{time:<escape>7:00 AM<escape>,title:<escape>take medicine<escape>}<end_function_call><start_function_response>']

In [None]:
# @title Show current memory stats
# Report the GPU name, its total memory and the maximum memory reserved so far,
# both converted from bytes and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
bytes_per_gib = 1024 ** 3
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [16]:
# Run the training loop and keep its return value for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 1 | Total steps = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)


Step,Training Loss
1,1.161
2,1.1787
3,1.381
4,0.69
5,0.4053
6,0.1154
7,0.3909
8,0.2339
9,0.2254
10,0.2499


Unsloth: Will smartly offload gradients to save VRAM!


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference!

We will take only the first two `messages`, which are the `system` and `user` turns, while also passing the `tools` to the prompt.

In [21]:
# Prepare the model for generation through Unsloth's `for_inference`.
FastLanguageModel.for_inference(model)

# Rebuild the messages and tool definitions of the first raw example.
messages, tools = prepare_example(raw_data[0])

# Keep only the system and user turns; the assistant turn is what the model
# has to produce, so the generation prompt is appended instead.
text = tokenizer.apply_chat_template(
    messages[:2],
    tools = tools,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
).removeprefix('<bos>')

from transformers import TextStreamer
# Stream tokens to the cell output as they are generated, prompt included.
streamer = TextStreamer(tokenizer, skip_prompt = False)

_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256,
    streamer = streamer,
    # Greedy decoding. `temperature` is a sampling-only option, so it is not
    # passed together with do_sample = False.
    do_sample = False,
)

<bos><start_of_turn>developer
You are a helpful assistant with access to tools.<start_function_declaration>declaration:get_weather{description:<escape>Get current weather for a location<escape>,parameters:{properties:{location:{description:<escape>City name<escape>,type:<escape>STRING<escape>}},required:[<escape>location<escape>],type:<escape>OBJECT<escape>}}<end_function_declaration><end_of_turn>
<start_of_turn>user
What is the weather in San Francisco?<end_of_turn>
<start_of_turn>model
<start_function_call>call:get_weather{location:<escape>San Francisco<escape>}<end_function_call><start_function_response>


In [22]:
# Save the fine-tuned model and the tokenizer side by side in one folder under SAVE_DIR.
lora_dir = SAVE_DIR + "functiongemma_lora"
model.save_pretrained(lora_dir)  # Local saving
tokenizer.save_pretrained(lora_dir)

('/content/drive/MyDrive/data/functiongemma_lora/tokenizer_config.json',
 '/content/drive/MyDrive/data/functiongemma_lora/special_tokens_map.json',
 '/content/drive/MyDrive/data/functiongemma_lora/chat_template.jinja',
 '/content/drive/MyDrive/data/functiongemma_lora/tokenizer.model',
 '/content/drive/MyDrive/data/functiongemma_lora/added_tokens.json',
 '/content/drive/MyDrive/data/functiongemma_lora/tokenizer.json')