<a href="https://colab.research.google.com/github/lennartvoelz/fine_tune_hf/blob/main/generate_function_calling_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q google-genai datasets pandas nest_asyncio

from google import genai
import pandas as pd
from google.colab import userdata
from datasets import Dataset
import json
import re

# Read the Gemini API key from the Colab secret named 'Gemini_API_KEY'
# (google.colab.userdata) and pass it to the client explicitly.
API_KEY = userdata.get('Gemini_API_KEY')
# Shared google-genai client; generate_gemini_batch() sends its requests through it.
client = genai.Client(api_key=API_KEY)

In [None]:
def _tool(name, description, **params):
    """Build one function-calling tool schema.

    Each keyword argument is a parameter name mapped to a
    ``(json_type, description)`` pair. Every parameter is marked required,
    in declaration order.
    """
    properties = {
        key: {"type": json_type, "description": text}
        for key, (json_type, text) in params.items()
    }
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": list(properties),
        },
    }


TOOL_GET_WEATHER = _tool(
    "get_weather",
    "Get current weather for a location",
    location=("string", "City name"),
)

TOOL_SET_ALARM = _tool(
    "set_alarm",
    "Set an alarm for a given time",
    hour=("integer", "Hour to set the alarm for"),
    minute=("integer", "Minute to set the alarm for"),
)

TOOL_SEND_MESSAGE = _tool(
    "send_message",
    "Send a message to a contact",
    recipient=("string", "Name of the person to send the message to"),
    message=("string", "The message content to send"),
)

TOOL_CREATE_REMINDER = _tool(
    "create_reminder",
    "Create a reminder with a title and time",
    title=("string", "Reminder title"),
    time=("string", "Time for the reminder (e.g. 3:00 PM)"),
)

TOOL_SEARCH_CONTACTS = _tool(
    "search_contacts",
    "Search for a contact by name",
    query=("string", "Name to search for"),
)

TOOL_PLAY_MUSIC = _tool(
    "play_music",
    "Play a song or playlist",
    song=("string", "Song or playlist name"),
)

TOOL_SET_TIMER = _tool(
    "set_timer",
    "Set a countdown timer",
    minutes=("integer", "Number of minutes"),
)


# Every tool the assistant can call, in the order they are shown to the model.
ALL_TOOLS = [
    TOOL_GET_WEATHER,
    TOOL_SET_ALARM,
    TOOL_SEND_MESSAGE,
    TOOL_CREATE_REMINDER,
    TOOL_SEARCH_CONTACTS,
    TOOL_PLAY_MUSIC,
    TOOL_SET_TIMER,
]

In [None]:
def _call(tool_name, /, **arguments):
    """Describe one expected tool invocation: the tool's name plus its keyword arguments."""
    return {"name": tool_name, "arguments": arguments}


def _seed(name, difficulty, content, tools, *calls):
    """Assemble one seed record.

    ``content`` becomes the single user message, ``tools`` is the list of tool
    schemas offered for this request, and ``calls`` are the expected tool
    invocations in order.
    """
    return {
        "name": name,
        "difficulty": difficulty,
        "messages": [{"role": "user", "content": content}],
        "tools": list(tools),
        "expected_calls": list(calls),
    }


# Hand-written seed set: "easy" rows offer exactly the one tool that is needed,
# "medium" rows hide the needed tool among others, "hard" rows expect 2-3 calls.
SEED_EXAMPLES = [
    # --- easy: single tool offered, single call ---
    _seed(
        "weather_sf", "easy",
        "What is the weather in San Francisco?",
        [TOOL_GET_WEATHER],
        _call("get_weather", location="San Francisco"),
    ),
    _seed(
        "alarm_10am", "easy",
        "Set an alarm for 10 AM.",
        [TOOL_SET_ALARM],
        _call("set_alarm", hour=10, minute=0),
    ),
    _seed(
        "message_alice", "easy",
        "Send a message to Alice saying good morning.",
        [TOOL_SEND_MESSAGE],
        _call("send_message", recipient="Alice", message="good morning"),
    ),
    _seed(
        "weather_london", "easy",
        "What's the weather like in London?",
        [TOOL_GET_WEATHER],
        _call("get_weather", location="London"),
    ),
    _seed(
        "alarm_6am", "easy",
        "Wake me up at 6 AM.",
        [TOOL_SET_ALARM],
        _call("set_alarm", hour=6, minute=0),
    ),
    _seed(
        "play_bohemian", "easy",
        "Play Bohemian Rhapsody.",
        [TOOL_PLAY_MUSIC],
        _call("play_music", song="Bohemian Rhapsody"),
    ),
    _seed(
        "timer_5min", "easy",
        "Set a timer for 5 minutes.",
        [TOOL_SET_TIMER],
        _call("set_timer", minutes=5),
    ),
    _seed(
        "reminder_meeting", "easy",
        "Remind me about the meeting at 3:00 PM.",
        [TOOL_CREATE_REMINDER],
        _call("create_reminder", title="meeting", time="3:00 PM"),
    ),
    _seed(
        "search_bob", "easy",
        "Find Bob in my contacts.",
        [TOOL_SEARCH_CONTACTS],
        _call("search_contacts", query="Bob"),
    ),
    _seed(
        "weather_paris", "easy",
        "How's the weather in Paris?",
        [TOOL_GET_WEATHER],
        _call("get_weather", location="Paris"),
    ),
    # --- medium: several tools offered, single call ---
    _seed(
        "message_among_three", "medium",
        "Send a message to John saying hello.",
        [TOOL_GET_WEATHER, TOOL_SEND_MESSAGE, TOOL_SET_ALARM],
        _call("send_message", recipient="John", message="hello"),
    ),
    _seed(
        "weather_among_two", "medium",
        "What's the weather in Tokyo?",
        [TOOL_GET_WEATHER, TOOL_SEND_MESSAGE],
        _call("get_weather", location="Tokyo"),
    ),
    _seed(
        "alarm_among_three", "medium",
        "Set an alarm for 8:15 AM.",
        [TOOL_SEND_MESSAGE, TOOL_SET_ALARM, TOOL_GET_WEATHER],
        _call("set_alarm", hour=8, minute=15),
    ),
    _seed(
        "music_among_three", "medium",
        "Play some jazz music.",
        [TOOL_SET_ALARM, TOOL_PLAY_MUSIC, TOOL_GET_WEATHER],
        _call("play_music", song="jazz"),
    ),
    _seed(
        "reminder_among_four", "medium",
        "Remind me to call the dentist at 2:00 PM.",
        [TOOL_GET_WEATHER, TOOL_SEND_MESSAGE, TOOL_CREATE_REMINDER, TOOL_SET_ALARM],
        _call("create_reminder", title="call the dentist", time="2:00 PM"),
    ),
    _seed(
        "timer_among_three", "medium",
        "Set a timer for 10 minutes.",
        [TOOL_SET_ALARM, TOOL_SET_TIMER, TOOL_PLAY_MUSIC],
        _call("set_timer", minutes=10),
    ),
    _seed(
        "search_among_four", "medium",
        "Look up Sarah in my contacts.",
        [TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_SEARCH_CONTACTS, TOOL_SET_ALARM],
        _call("search_contacts", query="Sarah"),
    ),
    _seed(
        "weather_among_four", "medium",
        "What's the weather in Berlin?",
        [TOOL_SEND_MESSAGE, TOOL_SET_ALARM, TOOL_PLAY_MUSIC, TOOL_GET_WEATHER],
        _call("get_weather", location="Berlin"),
    ),
    _seed(
        "message_among_four", "medium",
        "Text Dave saying I'll be late.",
        [TOOL_GET_WEATHER, TOOL_SET_TIMER, TOOL_SEND_MESSAGE, TOOL_PLAY_MUSIC],
        _call("send_message", recipient="Dave", message="I'll be late"),
    ),
    _seed(
        "alarm_among_five", "medium",
        "Set an alarm for 9 AM.",
        [TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_PLAY_MUSIC, TOOL_SET_TIMER, TOOL_SET_ALARM],
        _call("set_alarm", hour=9, minute=0),
    ),
    # --- hard: several tools offered, two or three calls ---
    _seed(
        "message_and_weather", "hard",
        "Send a message to Bob saying hi and get the weather in London.",
        [TOOL_GET_WEATHER, TOOL_SEND_MESSAGE, TOOL_SET_ALARM],
        _call("send_message", recipient="Bob", message="hi"),
        _call("get_weather", location="London"),
    ),
    _seed(
        "alarm_and_weather", "hard",
        "Set an alarm for 7:30 AM and check the weather in New York.",
        [TOOL_GET_WEATHER, TOOL_SET_ALARM, TOOL_SEND_MESSAGE],
        _call("set_alarm", hour=7, minute=30),
        _call("get_weather", location="New York"),
    ),
    _seed(
        "timer_and_music", "hard",
        "Set a timer for 20 minutes and play lo-fi beats.",
        [TOOL_SET_TIMER, TOOL_PLAY_MUSIC, TOOL_GET_WEATHER, TOOL_SET_ALARM],
        _call("set_timer", minutes=20),
        _call("play_music", song="lo-fi beats"),
    ),
    _seed(
        "reminder_and_message", "hard",
        "Remind me about groceries at 5:00 PM and text Lisa saying see you tonight.",
        [TOOL_CREATE_REMINDER, TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_SET_ALARM],
        _call("create_reminder", title="groceries", time="5:00 PM"),
        _call("send_message", recipient="Lisa", message="see you tonight"),
    ),
    _seed(
        "search_and_message", "hard",
        "Find Tom in my contacts and send him a message saying happy birthday.",
        [TOOL_SEARCH_CONTACTS, TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_PLAY_MUSIC],
        _call("search_contacts", query="Tom"),
        _call("send_message", recipient="Tom", message="happy birthday"),
    ),
    _seed(
        "alarm_and_reminder", "hard",
        "Set an alarm for 6:45 AM and remind me to take medicine at 7:00 AM.",
        [TOOL_SET_ALARM, TOOL_CREATE_REMINDER, TOOL_SEND_MESSAGE, TOOL_PLAY_MUSIC],
        _call("set_alarm", hour=6, minute=45),
        _call("create_reminder", title="take medicine", time="7:00 AM"),
    ),
    _seed(
        "weather_and_music", "hard",
        "Check the weather in Miami and play summer hits.",
        [TOOL_GET_WEATHER, TOOL_PLAY_MUSIC, TOOL_SET_TIMER, TOOL_SEND_MESSAGE],
        _call("get_weather", location="Miami"),
        _call("play_music", song="summer hits"),
    ),
    _seed(
        "message_weather_alarm", "hard",
        "Text Emma saying good night, check the weather in Chicago, and set an alarm for 5 AM.",
        [TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_SET_ALARM, TOOL_PLAY_MUSIC, TOOL_SET_TIMER],
        _call("send_message", recipient="Emma", message="good night"),
        _call("get_weather", location="Chicago"),
        _call("set_alarm", hour=5, minute=0),
    ),
    _seed(
        "timer_music_reminder", "hard",
        "Set a 15 minute timer, play classical music, and remind me to stretch at 4:00 PM.",
        [TOOL_SET_TIMER, TOOL_PLAY_MUSIC, TOOL_CREATE_REMINDER, TOOL_GET_WEATHER, TOOL_SEND_MESSAGE],
        _call("set_timer", minutes=15),
        _call("play_music", song="classical music"),
        _call("create_reminder", title="stretch", time="4:00 PM"),
    ),
    _seed(
        "search_message_weather", "hard",
        "Look up Jake in my contacts, send him a message saying let's meet, and check the weather in Seattle.",
        [TOOL_SEARCH_CONTACTS, TOOL_SEND_MESSAGE, TOOL_GET_WEATHER, TOOL_SET_ALARM, TOOL_PLAY_MUSIC],
        _call("search_contacts", query="Jake"),
        _call("send_message", recipient="Jake", message="let's meet"),
        _call("get_weather", location="Seattle"),
    ),
]

# Notebook preview of the first few seed rows.
pd.DataFrame(SEED_EXAMPLES).head()

In [None]:
# JSON-schema description of a single example: a "user" string, a "difficulty"
# restricted to easy/medium/hard, and an "expected_calls" array of
# {name, arguments} objects.
# NOTE(review): nothing in the visible notebook references EXAMPLE_SCHEMA, and it
# names the utterance field "user" whereas PROMPT_TEMPLATE asks for "messages" --
# confirm which shape is intended before using it as a response schema.
EXAMPLE_SCHEMA = {
    "type": "object",
    "properties": {
        "user": {"type": "string"},
        "difficulty": {"type": "string", "enum": ["easy", "medium", "hard"]},
        "expected_calls": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "arguments": {"type": "object"},
                },
                "required": ["name", "arguments"],
            },
        },
    },
    "required": ["user", "difficulty", "expected_calls"],
}

# Instruction text describing the rules for generated examples.
# NOTE(review): the doubled braces ({{...}}) only collapse to single braces if this
# string goes through str.format(); no such call -- nor any other use of
# SYSTEM_INSTRUCTIONS -- is visible in this notebook. Confirm before passing it
# to the model as-is.
SYSTEM_INSTRUCTIONS = """You generate supervised fine-tuning data for a tool-calling assistant.
Return ONLY raw JSON (no markdown fences, no explanation).
Follow the provided tool schemas strictly — only use argument keys defined in the schema.

Rules:
- expected_calls: [] when no tool is needed (general knowledge, math, chit-chat).
- expected_calls: [{{...}}] with exactly 1 call for single-action requests.
- expected_calls: [{{...}}, {{...}}] with 2 calls only when the user explicitly asks for two independent actions.
- Argument values must be clean strings or integers — no punctuation artifacts.
- Generate diverse, realistic user phrasing. Avoid repeating the tool name verbatim in the user message."""

In [None]:
from google.genai import types

# Prompt sent for every generation batch. generate_gemini_batch() fills it in with
# str.format(), so {num_examples}, {tools_json} and {seed_example} are placeholders,
# while the doubled braces in the "messages" rule render as literal { }.
PROMPT_TEMPLATE = """
Generate exactly {num_examples} NEW tool-calling examples in EXACT JSON format.

You MUST output a JSON array of objects. Output ONLY JSON array. No markdown.

Difficulty distribution REQUIREMENT for this batch:
- 30% easy
- 30% medium
- 40% hard

Rules:
- Each object must have keys: name, difficulty, messages, expected_calls
- difficulty must be exactly one of: easy, medium, hard
- messages is a list with exactly one item: {{"role":"user","content":"..."}}
- expected_calls is [] if no tool is needed
- Otherwise expected_calls is a list of 1 call (or 2 only if the user clearly asked for two actions)
- Every call must match one of the tool names below and use only valid argument keys.

TOOLS (JSON):
{tools_json}

SEED EXAMPLE:
{seed_example}

Output ONLY JSON array
"""


import json
from typing import List, Dict

# Keys every generated example must carry. Used during recovery to tell real
# examples apart from nested fragments (a "messages" or "expected_calls" list).
_EXAMPLE_KEYS = ('name', 'difficulty', 'messages', 'expected_calls')


def _is_example(obj) -> bool:
    """Return True when *obj* is a dict carrying every key of a generated example."""
    return isinstance(obj, dict) and all(k in obj for k in _EXAMPLE_KEYS)


def _salvage_leading_objects(text: str, start: int) -> List[Dict]:
    """Decode the complete JSON objects at the front of an array body.

    Scanning begins at ``start`` (just past the array's opening bracket) and
    stops at the first element that is not a fully decodable object, so a
    response cut off mid-array still yields every object before the cut.
    """
    decoder = json.JSONDecoder()
    salvaged: List[Dict] = []
    idx = start
    while True:
        # Step over whitespace and the commas separating array elements.
        while idx < len(text) and text[idx] in " \t\r\n,":
            idx += 1
        if idx >= len(text) or text[idx] != "{":
            break
        try:
            obj, idx = decoder.raw_decode(text, idx)
        except json.JSONDecodeError:
            break
        salvaged.append(obj)
    return salvaged


def robust_json_parse(text: str, min_expected: int = 1) -> List[Dict]:
    """Extract a list of example objects from a model response.

    Strategies, in order:

    1. Strip markdown code fences and parse the whole text as JSON.
    2. Find a JSON array embedded in surrounding prose. If an array is cut
       off, keep the complete examples at its front.
    3. Parse one JSON object per line.

    Args:
        text: Raw response text. ``None`` or an empty string is treated as an
            unparseable response.
        min_expected: Number of examples requested. Recovered partial results
            are only returned when they reach at least 20% of this number.

    Returns:
        The parsed list, or ``[]`` (after printing the start of the raw text)
        when nothing usable could be recovered.
    """
    # Clean markdown
    text = re.sub(r'^```(?:json)?\s*|\s*```$', '', text or '', flags=re.MULTILINE).strip()

    # Try direct parse
    try:
        parsed = json.loads(text)
        if isinstance(parsed, list) and len(parsed) > 0:
            return parsed
    except json.JSONDecodeError:
        pass

    # Look for an embedded array. raw_decode() consumes a whole balanced JSON
    # value, so the outer array is found even though every example contains
    # nested arrays. Nested fragments that decode on their own are rejected by
    # the example-key check instead of being mistaken for examples.
    decoder = json.JSONDecoder()
    objects: List[Dict] = []
    pos = text.find('[')
    while pos != -1 and not objects:
        try:
            candidate, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Possibly a truncated array: keep the complete examples at its front.
            objects = [
                obj for obj in _salvage_leading_objects(text, pos + 1)
                if _is_example(obj)
            ]
            pos = text.find('[', pos + 1)
            continue
        if isinstance(candidate, list) and candidate and _is_example(candidate[0]):
            return candidate
        pos = text.find('[', end)

    # Line-by-line object extraction (fallback)
    if not objects:
        for line in text.split('\n'):
            line = line.strip()
            if line.startswith('{') and line.endswith('}'):
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if _is_example(obj):
                    objects.append(obj)

    # Return if reasonable yield. Requiring a non-empty result keeps the failure
    # message from being skipped when min_expected is small.
    if objects and len(objects) >= min_expected // 5:  # At least 20% expected
        return objects

    print(" Parse failed. Raw:", text[:300] + "..." if len(text) > 300 else text)
    return []


def generate_gemini_batch(num_examples: int) -> list:
    """Request one batch of synthetic examples from Gemini (synchronous call).

    Args:
        num_examples: Number of examples to ask the model for.

    Returns:
        The examples parsed from the response by ``robust_json_parse``, or an
        empty list when ``num_examples`` is not positive or the response could
        not be parsed.
    """
    if num_examples <= 0:
        return []

    # str.format() inserts argument values verbatim -- it does not re-scan them
    # for replacement fields -- so the JSON must be passed unescaped. Doubling
    # the braces here would send "{{ ... }}" to the model as broken JSON.
    seed_str = json.dumps(SEED_EXAMPLES[0], ensure_ascii=False)
    tools_str = json.dumps(ALL_TOOLS, ensure_ascii=False)

    prompt = PROMPT_TEMPLATE.format(
        num_examples=num_examples,
        seed_example=seed_str,
        tools_json=tools_str,
    )

    # response_mime_type="application/json" nudges the model to emit raw JSON.
    response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=0.6,
            max_output_tokens=8192,
            response_mime_type="application/json",
        )
    )

    # response.text may be None when the response carries no text part; treat
    # that as an empty (unparseable) response rather than crashing in the parser.
    return robust_json_parse(response.text or "", num_examples)

# batch = generate_gemini_batch(10)
# print(json.dumps(batch[0], indent=2) if batch else "Empty batch")


In [None]:
BATCH_SIZE = 20        # examples requested per API call
TOTAL_EXAMPLES = 180   # target dataset size, seed examples included
MAX_ATTEMPTS = 3       # tries per batch before giving up

# Shallow copy: the seed list itself is left untouched while we append to this one.
all_examples = SEED_EXAMPLES.copy()
print(f"Starting generation of {TOTAL_EXAMPLES} total examples...")

# Keep requesting until the target is reached. Sizing each request from the
# current total means short batches get topped up and the final request never
# asks for zero or a negative number of examples.
while len(all_examples) < TOTAL_EXAMPLES:
    target_size = min(BATCH_SIZE, TOTAL_EXAMPLES - len(all_examples))
    print(f"Generating batch of {target_size} (Current total: {len(all_examples)})...")

    # Retry logic per batch
    batch = []
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            batch = generate_gemini_batch(target_size)
        except Exception as exc:
            # Top-level boundary: report the failed request and retry instead
            # of losing everything generated so far.
            print(f"  Request failed: {exc}")
            batch = []
        if batch:
            break
        if attempt < MAX_ATTEMPTS:
            print(f"  Retrying batch (Attempt {attempt + 1}/{MAX_ATTEMPTS})...")

    if not batch:
        # Every attempt came back empty; stop rather than loop forever.
        print(f"  Giving up after {MAX_ATTEMPTS} attempts; stopping early.")
        break

    # The model may return more items than requested; never overshoot the target.
    batch = batch[:target_size]
    all_examples.extend(batch)
    print(f"  Success! Got {len(batch)} items.")

print(f"Generation complete! Generated a total of {len(all_examples)} examples.")


In [None]:
# One row per example (seed and generated alike).
df = pd.DataFrame(data=all_examples)

# Tool schemas indexed by name, used to validate each call against its schema.
TOOL_BY_NAME = dict(zip((spec["name"] for spec in ALL_TOOLS), ALL_TOOLS))

# JSON-schema type name -> Python type(s) accepted for an argument value.
_JSON_TYPE_TO_PYTHON = {
    "string": str,
    "integer": int,
    "number": (int, float),
    "boolean": bool,
    "array": list,
    "object": dict,
}


def _matches_schema_type(value, json_type):
    """Return True when *value* is acceptable for the JSON-schema type *json_type*."""
    python_type = _JSON_TYPE_TO_PYTHON.get(json_type)
    if python_type is None:
        return True  # missing or unknown type: nothing to enforce
    # bool is a subclass of int, so True/False would otherwise pass as integers.
    if isinstance(value, bool) and json_type != "boolean":
        return False
    return isinstance(value, python_type)


def validate_format(row):
    """Check that one example row is well-formed and consistent with the tool schemas.

    A row is valid when:
      * ``messages`` is a list holding exactly one non-blank user message;
      * ``difficulty`` is ``easy``, ``medium`` or ``hard``;
      * ``expected_calls`` is a list (an empty list means no tool is needed);
      * every call names a known tool, supplies all required arguments, uses
        only argument keys defined in the tool's schema, has non-empty values
        of the declared type, and is not an exact duplicate of another call
        in the same row.

    Args:
        row: Any object with a dict-style ``.get`` (a pandas row or a dict).

    Returns:
        True when the row passes every check, otherwise False.
    """
    try:
        # messages must be a list with exactly one user message
        messages = row.get("messages", [])
        if not isinstance(messages, list) or len(messages) != 1:
            return False
        message = messages[0]
        if not isinstance(message, dict) or message.get("role") != "user":
            return False
        content = message.get("content")
        if not isinstance(content, str) or not content.strip():
            return False

        # difficulty must be one of the expected values
        if row.get("difficulty") not in ("easy", "medium", "hard"):
            return False

        # expected_calls must be a list; an empty list (no tool call) is valid
        calls = row.get("expected_calls")
        if not isinstance(calls, list):
            return False

        # Validate each call
        seen_calls = set()
        for call in calls:
            if not isinstance(call, dict):
                return False
            name = call.get("name")
            args = call.get("arguments")

            # name must exist and be a known tool
            if not isinstance(name, str) or name not in TOOL_BY_NAME:
                return False

            # arguments must be a dict
            if not isinstance(args, dict):
                return False

            parameters = TOOL_BY_NAME[name]["parameters"]
            properties = parameters["properties"]
            required_keys = set(parameters.get("required", []))

            # all required args must be present
            if not required_keys.issubset(args):
                return False

            # no hallucinated arg keys
            if not set(args).issubset(properties):
                return False

            for key, value in args.items():
                # arg values must be non-empty
                if value is None or (isinstance(value, str) and not value.strip()):
                    return False
                # arg values must have the type the schema declares, so
                # {"hour": "ten"} or {"minutes": True} are rejected
                if not _matches_schema_type(value, properties[key].get("type")):
                    return False

            # no duplicate tool calls within one example
            call_key = (name, json.dumps(args, sort_keys=True))
            if call_key in seen_calls:
                return False
            seen_calls.add(call_key)

        return True

    except (AttributeError, KeyError, TypeError, ValueError):
        return False  # malformed row or schema shape → mark invalid

# Flag each row as valid/invalid; only valid rows are exported below.
df['valid'] = df.apply(validate_format, axis=1)

valid_count = df['valid'].sum()
print(f"Validation passed for {valid_count}/{len(df)} examples ({df['valid'].mean():.1%})")

print("\n Sample of Valid Data:")
pd.set_option('display.max_colwidth', 120)
display(df[df['valid']][['messages', 'expected_calls', 'difficulty']].head(5))

# Mount Drive and export clean dataset
from google.colab import drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/tool_sft_data"
import os
os.makedirs(SAVE_DIR, exist_ok=True)

# .copy() so the column assignments below modify an independent frame.
valid_df = df[df['valid']].drop(columns=['valid']).copy()

# Generated rows have no "tools" value (the prompt does not ask for one), which
# pandas stores as NaN. They were generated against ALL_TOOLS, so record that
# instead of exporting null tool lists.
valid_df['tools'] = valid_df['tools'].apply(
    lambda tools: tools if isinstance(tools, list) else ALL_TOOLS
)

valid_df.to_json(f'{SAVE_DIR}/tool_sft_clean.jsonl', orient='records', lines=True)

# CSV cells are flat strings: encode nested lists/dicts as JSON so they can be
# read back with json.loads instead of being written as Python repr().
csv_df = valid_df.copy()
for column in csv_df.columns:
    csv_df[column] = csv_df[column].apply(
        lambda value: json.dumps(value, ensure_ascii=False)
        if isinstance(value, (list, dict)) else value
    )
csv_df.to_csv(f'{SAVE_DIR}/tool_sft_clean.csv', index=False)

print(f"\n Saved {len(valid_df)} clean examples to '{SAVE_DIR}'")
