<a href="https://colab.research.google.com/github/lennartvoelz/fine_tune_hf/blob/main/FunctionGemma_(270M)_finetune_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Unsloth and other dependencies

In [None]:
%%capture
# Install Unsloth and its dependencies (the %%capture magic above hides this cell's output).
import os, re
# Colab is detected by looking for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v holds the "major.minor" prefix of the installed torch version; it selects the xformers pin below.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # NOTE(review): --no-deps presumably keeps pip from replacing the preinstalled torch stack — confirm.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Version pins applied in both branches (Colab and non-Colab).
!pip install transformers==4.57.3
!pip install --no-deps trl==0.22.2

In [None]:
!pip install modelscope
import json
# NOTE(review): `os` is not imported in this cell; it relies on the import in the install cell.
# NOTE(review): presumably switches Unsloth's model downloads to ModelScope — confirm against the
# Unsloth docs, since the loading cell also passes a Hugging Face token.
os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'

### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
from google.colab import userdata
from datasets import load_dataset

# NOTE(review): presumably reads the Colab secret named HF_TOKEN — confirm it is configured.
hf_token = userdata.get('HF_TOKEN')
# NOTE(review): the training samples embed the full tool declarations in the developer turn;
# verify they fit in 256 tokens, otherwise the model turn may be truncated away.
max_seq_length = 256

# Load the base model and its tokenizer; 8-bit is the only loading mode enabled below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/functiongemma-270m-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = False,  # 4-bit quantization (lower memory) is disabled here
    load_in_8bit = True, # 8-bit loading is the mode used in this notebook
    load_in_16bit = False, # 16-bit LoRA is disabled here
    full_finetuning = False, # Full finetuning is disabled; LoRA adapters are added in the next cell
    token = hf_token, # HF Token for gated models
)

We now add LoRA adapters so that we only need to update a small number of parameters!

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj"], # only these three projection modules are targeted
    lora_alpha = 16, # NOTE(review): alpha is half of r here — confirm this scaling is intended
    lora_dropout = 0.0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Formatting is very important for tool calling with `functiongemma`.

In [None]:
import json

from google.colab import drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/data/"

# Parse every non-blank line of the JSONL file into one raw example dict.
with open(SAVE_DIR+"examples.jsonl", "r", encoding="utf-8") as f:
    raw_data = [
        json.loads(stripped)
        for stripped in (raw_line.strip() for raw_line in f)
        if stripped
    ]

def _format_cactus_value(value):
    """Render one argument value for the function-call string."""
    if isinstance(value, str):
        # Strings are wrapped in <escape> markers.
        return f"<escape>{value}<escape>"
    # The data comes from JSON, so booleans and null must be written as JSON
    # literals; Python's str() would leak "True"/"False"/"None" into the
    # training target. bool is checked explicitly because it subclasses int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if value is None:
        return "null"
    return f"{value}"


def format_cactus_call(expected_calls):
    """Replicates the Cactus C++ expected output format.

    Args:
        expected_calls: Sequence of dicts, each with a "name" key and an
            "arguments" dict. Only the first entry is rendered; any further
            calls are ignored.

    Returns:
        "<start_function_call>call:NAME{key:value,...}<end_function_call>",
        or "" when ``expected_calls`` is empty.

    Raises:
        KeyError: If the first call lacks "name" or "arguments".
    """
    if not expected_calls:
        return ""

    call = expected_calls[0]
    rendered_args = ",".join(
        f"{key}:{_format_cactus_value(value)}"
        for key, value in call["arguments"].items()
    )
    return f"<start_function_call>call:{call['name']}{{{rendered_args}}}<end_function_call>"

def format_cactus_tools(tools_list):
    """Replicates the Cactus format_tools() C++ function.

    Each tool dict becomes one
    "<start_function_declaration>...<end_function_declaration>" segment with
    its name and description. When the tool has a "parameters" dict that
    contains "properties", the segment also lists each property (description
    and upper-cased type), the optional "required" names, and the upper-cased
    parameters type. Segments are concatenated without a separator; an empty
    list yields "".
    """
    pieces = []
    for tool in tools_list:
        pieces.append("<start_function_declaration>")
        pieces.append(f"declaration:{tool['name']}{{")

        tool_description = tool.get('description', '')
        pieces.append(f"description:<escape>{tool_description}<escape>")

        if "parameters" in tool and "properties" in tool["parameters"]:
            parameters = tool["parameters"]
            pieces.append(",parameters:{properties:{")

            rendered_props = []
            for prop_name, prop_spec in parameters["properties"].items():
                prop_description = prop_spec.get('description', '')
                prop_type = prop_spec.get('type', 'string').upper()
                rendered_props.append(
                    f"{prop_name}:{{description:<escape>{prop_description}<escape>,type:<escape>{prop_type}<escape>}}"
                )
            pieces.append(",".join(rendered_props) + "}")

            if "required" in parameters:
                required_names = ",".join(
                    f"<escape>{required_name}<escape>"
                    for required_name in parameters["required"]
                )
                pieces.append(f",required:[{required_names}]")

            parameters_type = parameters.get('type', 'object').upper()
            pieces.append(f",type:<escape>{parameters_type}<escape>}}")

        pieces.append("}<end_function_declaration>")
    return "".join(pieces)

def format_tools_for_prompt(example):
    """Turn one raw example into a single training string.

    Args:
        example: Dict with a "messages" list whose first entry's "content" is
            used as the user message, plus optional "tools", "expected_calls"
            and "expected_response" keys.

    Returns:
        {"text": chat_str}, where chat_str holds a developer turn, a user turn
        and an open model turn written by hand (no apply_chat_template).
    """
    user_msg = example["messages"][0]["content"]
    expected_calls = example.get("expected_calls", [])
    tools_list = example.get("tools", [])

    # Developer turn: fixed preamble, plus tool declarations and the output
    # format reminder when the example carries tools.
    system_parts = ["You are a helpful assistant that can use tools.\n"]
    if tools_list:
        system_parts.extend([
            "You are a model that can do function calling with the following functions.",
            format_cactus_tools(tools_list),
            "\n\nWhen you decide to call a function, output it in this exact format:\n",
            "<start_function_call>call:function_name{arg1:<escape>value1<escape>,arg2:<escape>value2<escape>}<end_function_call>",
        ])
    system_content = "".join(system_parts)

    # Model turn: the serialized call when one is expected, else the plain reply.
    if expected_calls:
        assistant_content = format_cactus_call(expected_calls)
    else:
        assistant_content = example.get("expected_response", "")

    # The sample ends right after the assistant content: no closing
    # <end_of_turn> is appended, so that token is never part of the target.
    turns = (
        f"<start_of_turn>developer\n{system_content}<end_of_turn>\n",
        f"<start_of_turn>user\n{user_msg}<end_of_turn>\n",
        f"<start_of_turn>model\n{assistant_content}",
    )
    return {"text": "".join(turns)}

# Format every raw example, then persist the results as one JSON object per line.
chat_data = [format_tools_for_prompt(example) for example in raw_data]

with open("chat_ready.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in chat_data)

In [None]:
# Load the chat_ready.jsonl file written by the formatting cell, requesting the "train" split.
dataset = load_dataset("json", data_files="chat_ready.jsonl", split="train")

# Print the first formatted sample as a quick sanity check.
print(dataset[0])

<a name="Train"></a>
### Train the model
Now let's train our model. We run only 15 steps to speed things up, but for a full run you can set `num_train_epochs=1` and remove the `max_steps` setting.

In [None]:
from trl import SFTTrainer, SFTConfig
# Build the SFT trainer over the "text" field of the formatted dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text", # the key written by format_tools_for_prompt
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2, # Use GA to mimic batch size! (4 x 2 = 8 samples per optimizer step per device)
        warmup_steps = 2,
        #num_train_epochs = 2, # Enable this (and drop max_steps) to train by epochs instead of steps.
        max_steps = 15, # Short demo run: training stops after 15 steps
        learning_rate = 8e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.007,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

We also use Unsloth's `train_on_responses_only` method to train only on the assistant outputs and ignore the loss on the user's inputs. This helps increase the accuracy of finetunes!

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Restrict the loss to the assistant outputs. The two markers below are the user and
# model turn headers exactly as format_tools_for_prompt writes them.
# NOTE(review): the developer turn precedes the first user marker — use the label-printing
# cell below to confirm it is masked as well.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Let's verify that the instruction part is masked! First, let's print the last row of the training set.

In [None]:
tokenizer.decode(trainer.train_dataset[-1]["input_ids"])

Now let's print a masked-out example - you should see that only the answer is present:

In [None]:
# Show the labels of the last row, the same row whose input_ids are decoded in the
# cell above, so the two outputs can be compared directly. A hard-coded row 25 would
# show a different sample and fail on datasets with fewer than 26 rows.
# Masked positions (label == -100) are swapped for the pad token and shown as "-".
row_labels = trainer.train_dataset[-1]["labels"]
visible_ids = [tokenizer.pad_token_id if label == -100 else label for label in row_labels]
[tokenizer.decode(visible_ids).replace(tokenizer.pad_token, "-")]

Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [None]:
trainer_stats = trainer.train()

In [None]:
# Sanity-check the fine-tuned model with one greedy generation on the first raw example.

# 1. Grab the raw example from your data
example = raw_data[0]
user_msg = example["messages"][0]["content"]
tools_list = example.get("tools", [])
eos_token_id = tokenizer.eos_token_id
end_call_token_id = tokenizer.convert_tokens_to_ids("<end_function_call>")

# 2. Rebuild the same system prompt that format_tools_for_prompt writes into the training text
system_content = "You are a helpful assistant that can use tools.\n"
if tools_list:
    tools_json = format_cactus_tools(tools_list) # Make sure this function is defined!
    system_content += "You are a model that can do function calling with the following functions."
    system_content += tools_json
    system_content += "\n\nWhen you decide to call a function, output it in this exact format:\n"
    system_content += "<start_function_call>call:function_name{arg1:<escape>value1<escape>,arg2:<escape>value2<escape>}<end_function_call>"

# 3. Manually construct the prompt string for generation
# The first turn must use the "developer" role, exactly like the training samples;
# prompting with a different role name would not match what the model was tuned on.
# The trailing "<start_of_turn>model\n" acts as the "add_generation_prompt=True".
text = (
    f"<bos><start_of_turn>developer\n"
    f"{system_content}<end_of_turn>\n"
    f"<start_of_turn>user\n"
    f"{user_msg}<end_of_turn>\n"
    f"<start_of_turn>model\n"
)

# 4. Tokenize and prepare for Unsloth / HF Generation
# `text` already starts with <bos>, so special tokens are not added again; this
# guarantees exactly one BOS regardless of the tokenizer's add_bos_token setting.
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to("cuda")

from transformers import TextStreamer
# skip_prompt=True keeps the output clean so you only see the generated function call
streamer = TextStreamer(tokenizer, skip_prompt=True)

# 5. Generate greedily so the output is deterministic.
# No temperature is passed: it only applies when sampling, and combining it with
# do_sample=False just triggers a generation-config warning.
# Generation stops at either the EOS token or <end_function_call>.
_ = model.generate(
    **inputs,
    max_new_tokens=256,
    streamer=streamer,
    do_sample=False,
    eos_token_id=[eos_token_id, end_call_token_id],
)

In [None]:
print(text)

In [None]:
# Save the model and tokenizer into SAVE_DIR, which is defined earlier as a path on the
# mounted Google Drive, so the files are written to Drive rather than the Colab disk.
# NOTE(review): presumably only the LoRA adapter weights are written here — confirm.
model.save_pretrained(SAVE_DIR+"functiongemma_lora")  # Saved under SAVE_DIR (Drive)
tokenizer.save_pretrained(SAVE_DIR+"functiongemma_lora")