In [1]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OpenAI')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
model_name = "Llama-3.2-3B-Instruct-FuncCalling6"
LORA_DIR = f"/content/drive/MyDrive/models/lora/{model_name}"
BASE_MODEL_ID = "unsloth/Llama-3.2-3B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
from datasets import load_dataset

TEST_PATH = "/content/drive/MyDrive/models/training_data/testset_50_unseen.json"

raw_test = load_dataset("json", data_files=TEST_PATH)["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
print(raw_test)

Dataset({
    features: ['messages'],
    num_rows: 50
})


In [6]:
raw_test[0]

{'messages': [{'content': 'You are an expert in composing functions. ... (full system)\n\nHere is a list of functions:\n{\n "functions": [\n   {"name": "get_current_wind", "parameters": {"city": "string"}},\n   {"name": "get_current_weather", "parameters": {"city": "string"}},\n   {"name": "get_city_temperature", "parameters": {"city": "string"}}\n ]\n}\n',
   'role': 'system'},
  {'content': 'Before I enter the international bathtub sailing championship, can you check wind, weather, and temperature in Copenhagen?',
   'role': 'user'},
  {'content': '[get_current_weather(city="Copenhagen"), get_current_wind(city="Copenhagen"), get_city_temperature(city="Copenhagen")]',
   'role': 'assistant'}]}

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [6]:
from peft import PeftModel
from tqdm import tqdm

def generate_all_answers(model, tokenizer, eval_subset, max_new_tokens=256):
    answers = []
    for ex in tqdm(eval_subset, desc="Answers"):
        prompt_str = tokenizer.apply_chat_template(
            ex["messages"],
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            prompt_str,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        ).to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
        answer = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
        answers.append(answer)
    return answers

In [9]:
"""Original base model"""
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
  BASE_MODEL_ID,
  torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
  device_map="auto" if DEVICE == "cuda" else None,
)
model.eval()
print("Loaded base model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Loaded base model


In [10]:
base_answers = generate_all_answers(model, tokenizer, raw_test)

Answers:   0%|          | 0/50 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Answers: 100%|██████████| 50/50 [02:49<00:00,  3.38s/it]


In [11]:
import json

with open("/content/drive/MyDrive/models/ans/base_func_calling_v4.json", "w", encoding="utf-8") as f:
    json.dump(base_answers, f, indent=2, ensure_ascii=False)

In [7]:
import json
with open("/content/drive/MyDrive/models/ans/base_func_calling_v4.json", "r") as f:
    base_answers = json.load(f)


In [None]:
# Reset GPU VRAM
model.to("cpu")
del model
torch.cuda.empty_cache()

In [8]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    device_map="auto" if DEVICE == "cuda" else None,
)
ft_model = PeftModel.from_pretrained(base_model, LORA_DIR)
ft_model.eval()
print("Loaded finetuned model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Loaded finetuned model


In [9]:
ft_answers = generate_all_answers(ft_model, tokenizer, raw_test)

Answers:   0%|          | 0/50 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Answers: 100%|██████████| 50/50 [02:20<00:00,  2.81s/it]


In [10]:
import json

with open("/content/drive/MyDrive/models/ans/ft_func_calling_v6.json", "w", encoding="utf-8") as f:
    json.dump(base_answers, f, indent=2, ensure_ascii=False)

In [11]:
def save_prompt_and_reference(example):
    """
    Extract:
    - first 'human' turn as the user prompt
    - first 'gpt' turn *after* that human turn as the reference answer
    """
    user = None
    assistant = None

    for turn in example["messages"]:
        if turn["role"] == "user" and user is None:
            user = turn["content"]
        elif turn["role"] == "assistant" and user is not None:
            assistant = turn["content"]
            break

    return {"prompt": user, "reference": assistant}

prompt_list = []
for ex in raw_test:
    prompt_list.append(save_prompt_and_reference(ex))

In [12]:
def pretty_print_comparison(eval_subset, base_answers, ft_answers, n=None):
    """
    Print prompt, reference, and both model answers in a clean format.
    n = how many samples to print (default = all)
    """
    if n is None:
        n = len(eval_subset)

    for i in range(n):
        ex = eval_subset[i]
        base = base_answers[i]
        ft = ft_answers[i]

        print("=" * 80)
        print(f"🧩 Sample {i}")
        print("=" * 80)

        print("\n🎯 PROMPT:")
        print(ex["prompt"])

        print("\n📘 REFERENCE ANSWER:")
        print(ex["reference"])

        print("\n🤖 BASE MODEL ANSWER:")
        print(base)

        print("\n🚀 FINETUNED MODEL ANSWER:")
        print(ft)

        print("\n" + "-" * 80 + "\n")


In [13]:
pretty_print_comparison(prompt_list, base_answers, ft_answers, n=50)


🧩 Sample 0

🎯 PROMPT:
Before I enter the international bathtub sailing championship, can you check wind, weather, and temperature in Copenhagen?

📘 REFERENCE ANSWER:
[get_current_weather(city="Copenhagen"), get_current_wind(city="Copenhagen"), get_city_temperature(city="Copenhagen")]

🤖 BASE MODEL ANSWER:
I'll compose the functions to get the current weather, wind, and temperature in Copenhagen.

Here are the results:

```javascript
// get_current_weather function
function get_current_weather(city) {
  // Assuming a mock API to get the current weather
  const weather = {
    "Copenhagen": {
      "temperature": 12,
      "humidity": 60,
      "windSpeed": 15,
      "weatherDescription": "Partly Cloudy"
    }
  };
  return weather[city];
}

// get_current_wind function
function get_current_wind(city) {
  // Assuming a mock API to get the current wind speed
  const wind = {
    "Copenhagen": 15
  };
  return wind[city];
}

// get_city_temperature function
function get_city_temperature(ci

In [None]:
# model.to("cpu")
# del model
# torch.cuda.empty_cache()

In [14]:
!pip install --upgrade openai


Collecting openai
  Downloading openai-2.9.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-2.9.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 2.8.1
    Uninstalling openai-2.8.1:
      Successfully uninstalled openai-2.8.1
Successfully installed openai-2.9.0


In [15]:
from openai import OpenAI
client = OpenAI()  # uses OPENAI_API_KEY env var


In [16]:
def judge_pair_openai(prompt, base_answer, ft_answer, reference, model_name="gpt-4.1-mini"):
    """
    Judge which candidate answer (base_answer or ft_answer) better follows
    the tool-calling / system-message behavior defined in the synthetic dataset.
    """
    JUDGE_SYSTEM_PROMPT = """
You are an impartial evaluator of assistant responses in a tool-using agent framework.

You will be given:
- The user message.
- A reference assistant answer (ground truth from a curated dataset).
- Two candidate answers: A and B.

Your job:
- Decide which candidate that follows/mimics the reference best.
- If both candidates are roughly equally good, you may answer "tie".
"""

    user_prompt = f"""
USER MESSAGE:
{prompt}

REFERENCE ASSISTANT ANSWER (ground truth behavior, do NOT copy it literally):
{reference}

CANDIDATE A:
{base_answer}

CANDIDATE B:
{ft_answer}

Question: Which candidate is better considering the referrence?

Reply with exactly one word:
A
B
tie
"""

    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=4,
        temperature=0.0,
    )

    raw = response.choices[0].message.content.strip()
    choice = raw.split()[0].upper()
    if choice not in ["A", "B", "TIE"]:
        choice = "TIE"
    return choice


In [17]:
from collections import Counter
from tqdm import tqdm

def evaluate_with_judge(list_p, base_answers, ft_answers, model_name="gpt-4.1-mini", max_samples=None):
    """
    Run LLM-as-a-judge pairwise comparison.
    Also store all non-tie samples for later inspection.
    """

    assert len(list_p) == len(base_answers) == len(ft_answers), "Length mismatch!"

    if max_samples is None:
        max_samples = len(list)

    decisions = []
    detailed = [] # store all samples + decision
    non_ties = [] # store only A/B

    for i in tqdm(range(max_samples), desc="Judging"):
        prompt = list_p[i]["prompt"]
        reference = list_p[i]["reference"]
        base = base_answers[i]
        ft = ft_answers[i]

        decision = judge_pair_openai(prompt, base, ft, reference, model_name=model_name)
        decisions.append(decision)

        record = {
            "idx": i,
            "prompt": prompt,
            "base_answer": base,
            "ft_answer": ft,
            "decision": decision,
            "reference": reference,
        }
        detailed.append(record)

        if decision in ["A", "B"]:
            non_ties.append(record)

    # summary
    counts = Counter(decisions)
    total = len(decisions)

    print("=== Judge results ===")
    print(f"Total comparisons: {total}")
    print(f"Base wins (A):     {counts.get('A', 0)}  ({counts.get('A', 0)/total:.1%})")
    print(f"FT wins   (B):     {counts.get('B', 0)}  ({counts.get('B', 0)/total:.1%})")
    print(f"Ties:             {counts.get('TIE', 0)} ({counts.get('TIE', 0)/total:.1%})")

    return decisions, counts, detailed, non_ties


In [18]:
decisions, counts, detailed, non_ties = evaluate_with_judge(
    prompt_list,
    base_answers=base_answers,
    ft_answers=ft_answers,
    model_name="gpt-4.1-mini",
    max_samples=50,
)

Judging: 100%|██████████| 50/50 [00:21<00:00,  2.29it/s]

=== Judge results ===
Total comparisons: 50
Base wins (A):     10  (20.0%)
FT wins   (B):     24  (48.0%)
Ties:             16 (32.0%)





This cell runs the eval providing the answers of the model in the other order.

In [19]:
def pretty_print_comparison_of_ties(record):
    print("="*80)
    print(f"Sample index: {record['idx']}  (decision={record['decision']})")
    print("="*80)

    print("\n🎯 PROMPT:")
    print(record["prompt"])

    print("\n🤖 BASE ANSWER:")
    print(record["base_answer"])

    print("\n🚀 FT ANSWER:")
    print(record["ft_answer"])

    print("\n📘 REFERENCE ANSWER:")
    print(record["reference"])

    print("\n" + "-"*80 + "\n")

See how the judge does if prompts are exactly the same

Print all the non-ties to double check that the LLM is correct.

In [20]:
for rec in non_ties:
    pretty_print_comparison_of_ties(rec)


Sample index: 0  (decision=B)

🎯 PROMPT:
Before I enter the international bathtub sailing championship, can you check wind, weather, and temperature in Copenhagen?

🤖 BASE ANSWER:
I'll compose the functions to get the current weather, wind, and temperature in Copenhagen.

Here are the results:

```javascript
// get_current_weather function
function get_current_weather(city) {
  // Assuming a mock API to get the current weather
  const weather = {
    "Copenhagen": {
      "temperature": 12,
      "humidity": 60,
      "windSpeed": 15,
      "weatherDescription": "Partly Cloudy"
    }
  };
  return weather[city];
}

// get_current_wind function
function get_current_wind(city) {
  // Assuming a mock API to get the current wind speed
  const wind = {
    "Copenhagen": 15
  };
  return wind[city];
}

// get_city_temperature function
function get_city_temperature(city) {
  // Assuming a mock API to get the current temperature
  const temperature = {
    "Copenhagen": 12
  };
  return temper