# 📦 Export FunctionGemma to ONNX for Transformers.js

This notebook exports a fine-tuned FunctionGemma model to ONNX format with proper quantization.

## Expected Output Sizes
| Variant | Size |
|---------|------|
| `model.onnx` (fp32) | ~1.1 GB |
| `model_fp16.onnx` | ~570 MB |
| `model_q8.onnx` | ~280 MB |

## 1. Install Dependencies

In [None]:
# Install the packages the later cells import (transformers, huggingface_hub, onnx,
# onnxruntime); onnxslim and optimum are presumably needed by the conversion script
# run in step 5 - TODO confirm against the transformers.js scripts requirements.
!pip install -q transformers huggingface_hub onnx onnxslim
!pip install -q "optimum[onnx]" onnxruntime

## 2. Clone Transformers.js Repository

In [None]:
# Shallow-clone (--depth 1) the transformers.js repository and make it the working
# directory, because later cells use paths relative to the repo root
# (scripts/quantize.py, `python -m scripts.convert`, ./models).
# NOTE(review): run this cell once per session - running it again from inside the
# clone would presumably create a nested checkout and change the working directory again.
!git clone --depth 1 https://github.com/huggingface/transformers.js.git
%cd transformers.js

## 3. Login to Hugging Face Hub

In [None]:
# Authenticate this session with the Hugging Face Hub. The final cell creates and
# uploads to HUB_REPO, so the token presumably needs write access - confirm.
from huggingface_hub import login
login()

## 4. Configuration

⚠️ **Edit these values for your model:**

In [None]:
# Fine-tuned source checkpoint on the Hub that will be exported.
MODEL_ID = "harlley/functiongemma-square-color"
# Parent directory for the export; later cells read from OUTPUT_DIR/MODEL_ID.
OUTPUT_DIR = "./models"
# Task name passed to the conversion script via --task.
TASK = "text-generation-with-past"
# Destination repository for the converted model.
HUB_REPO = "harlley/functiongemma-square-color-ONNX"

# Echo the settings so the run log records what was exported and where it went.
print(
    f"Source Model: {MODEL_ID}",
    f"Output: {OUTPUT_DIR}",
    f"Hub Repo: {HUB_REPO}",
    sep="\n",
)

## 5. Export to ONNX

In [None]:
# Patch quantize.py to skip Q4 if not available
import os

# Make the 4-bit quantizer import in scripts/quantize.py optional, so the script
# still loads when the installed onnxruntime does not provide that module.
# Only the import statement is rewritten; HAS_4BIT is defined for the script to
# consult, and no call sites are changed by this cell.
quantize_file = "scripts/quantize.py"

original_import = "from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer"
guarded_import = """try:
    from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer
    HAS_4BIT = True
except ImportError:
    HAS_4BIT = False
    print("Warning: 4-bit quantization not available")"""

with open(quantize_file, 'r', encoding="utf-8") as f:
    content = f.read()

# The guarded form contains the original import line, so it must be checked first:
# patching an already-patched file would re-match the line nested inside the
# `try:` and produce code with broken indentation.
if guarded_import in content:
    print("quantize.py is already patched - nothing to do")
elif original_import in content:
    patched = content.replace(original_import, guarded_import)
    with open(quantize_file, 'w', encoding="utf-8") as f:
        f.write(patched)
    print("‚úÖ Patched quantize.py")
else:
    # Report the miss instead of claiming success when nothing was replaced.
    print("4-bit quantizer import not found in quantize.py - file left unchanged")

In [None]:
# Run the repository's conversion script as a module from the repo root. IPython
# substitutes {MODEL_ID}, {TASK} and {OUTPUT_DIR} with the Python variables set in
# the configuration cell. Later cells expect the result under
# OUTPUT_DIR/MODEL_ID (ONNX graphs in its "onnx" sub-folder).
!python -m scripts.convert \
    --model_id {MODEL_ID} \
    --task {TASK} \
    --output_parent_dir {OUTPUT_DIR}

## 6. FP16 Quantization

In [None]:
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16
import os

# Convert the exported FP32 graph to FP16, save it next to the original as
# model_fp16.onnx, and report the size change.
onnx_folder = os.path.join(OUTPUT_DIR, MODEL_ID, "onnx")
model_path = os.path.join(onnx_folder, "model.onnx")
model_fp16_path = os.path.join(onnx_folder, "model_fp16.onnx")

if not os.path.exists(model_path):
    # Say so explicitly: a missing FP32 model means the export step did not
    # produce its output, and finishing silently would hide that.
    print(f"WARNING: {model_path} not found - skipping FP16 conversion. "
          "Check that the export step completed.")
else:
    print(f"Loading {model_path}...")
    model = onnx.load(model_path)

    print("Converting to FP16...")
    model_fp16 = convert_float_to_float16(
        model,
        keep_io_types=True,
        disable_shape_infer=True,
        min_positive_val=1e-7,
        max_finite_val=65504.0
    )

    onnx.save(model_fp16, model_fp16_path)

    # Sizes in MiB, read from the files on disk.
    orig = os.path.getsize(model_path) / (1024**2)
    fp16 = os.path.getsize(model_fp16_path) / (1024**2)
    print(f"\n‚úÖ FP32: {orig:.0f} MB ‚Üí FP16: {fp16:.0f} MB ({(1-fp16/orig)*100:.0f}% reduction)")

## 7. Q8 Quantization

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
import os

# Dynamically quantize the exported FP32 graph to 8-bit integer weights, write
# the result as model_q8.onnx, and report the size change.
onnx_folder = os.path.join(OUTPUT_DIR, MODEL_ID, "onnx")
model_path = os.path.join(onnx_folder, "model.onnx")
model_q8_path = os.path.join(onnx_folder, "model_q8.onnx")

if not os.path.exists(model_path):
    # Say so explicitly: a missing FP32 model means the export step did not
    # produce its output, and finishing silently would hide that.
    print(f"WARNING: {model_path} not found - skipping Q8 quantization. "
          "Check that the export step completed.")
else:
    print("Quantizing to Q8...")
    quantize_dynamic(
        model_input=model_path,
        model_output=model_q8_path,
        weight_type=QuantType.QInt8,
        per_channel=False,
        reduce_range=False
    )

    # Sizes in MiB, read from the files on disk.
    orig = os.path.getsize(model_path) / (1024**2)
    q8 = os.path.getsize(model_q8_path) / (1024**2)
    print(f"\n‚úÖ FP32: {orig:.0f} MB ‚Üí Q8: {q8:.0f} MB ({(1-q8/orig)*100:.0f}% reduction)")

## 8. Add Chat Template

In [None]:
import json
import os
from huggingface_hub import hf_hub_download

# Copy the `chat_template` entry from a reference model's tokenizer_config.json
# into the exported model's tokenizer_config.json (overwriting any existing entry).
REFERENCE_MODEL = "onnx-community/functiongemma-270m-it-ONNX"

print(f"Downloading chat_template from {REFERENCE_MODEL}...")
ref_path = hf_hub_download(REFERENCE_MODEL, "tokenizer_config.json")

# JSON files are read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(ref_path, 'r', encoding="utf-8") as f:
    ref_config = json.load(f)

model_folder = os.path.join(OUTPUT_DIR, MODEL_ID)
output_path = os.path.join(model_folder, "tokenizer_config.json")

with open(output_path, 'r', encoding="utf-8") as f:
    your_config = json.load(f)

if 'chat_template' in ref_config:
    your_config['chat_template'] = ref_config['chat_template']
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(your_config, f, indent=2)
    print("‚úÖ chat_template added!")
else:
    # Report the miss: otherwise the cell finishes silently and the exported
    # tokenizer config is left without the template this step was meant to add.
    print(f"WARNING: no 'chat_template' key in {REFERENCE_MODEL}/tokenizer_config.json - "
          f"{output_path} left unchanged")

## 9. Verify Structure

In [None]:
import os

# Root of the exported model: OUTPUT_DIR joined with MODEL_ID.
model_folder = os.path.join(OUTPUT_DIR, MODEL_ID)

def list_files(folder, prefix=""):
    """Print a sorted, indented tree of *folder* and return its total file size.

    Args:
        folder: Directory to list.
        prefix: String printed before each entry; grows by two spaces per
            nesting level.

    Returns:
        Sum in bytes of the sizes of all regular files found under *folder*,
        including those in sub-directories.

    Entries that are neither a regular file nor a directory (for example a
    dangling symlink) are reported and skipped instead of being descended into,
    which would make ``os.listdir`` raise.
    """
    total = 0
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            size = os.path.getsize(path)
            total += size
            print(f"{prefix}üìÑ {name}: {size/(1024**2):.1f} MB")
        elif os.path.isdir(path):
            print(f"{prefix}üìÇ {name}/")
            total += list_files(path, prefix + "  ")
        else:
            print(f"{prefix}?  {name} (skipped: not a regular file or directory)")
    return total

# Framed report: header, rule, the recursive listing (printed by list_files as
# it walks the tree), rule, then the grand total converted from bytes to MiB.
rule = "=" * 50
print(f"üìÅ {model_folder}:")
print(rule)
total = list_files(model_folder)
print(rule)
total_mb = total / (1024**2)
print(f"üìä Total: {total_mb:.0f} MB")

## 10. Test Model

⚠️ **Skip this step if testing fails - just proceed to upload.**

In [None]:
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import os

# Smoke test of the export: load the tokenizer, open the FP32 graph with
# onnxruntime on CPU, and render one tool-calling prompt through the chat
# template. No tokens are generated here.
model_folder = os.path.join(OUTPUT_DIR, MODEL_ID)
onnx_folder = os.path.join(model_folder, "onnx")

tokenizer = AutoTokenizer.from_pretrained(model_folder)

# Creating the session is the check: it fails if model.onnx cannot be loaded.
model_path = os.path.join(onnx_folder, "model.onnx")
session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
print(f"‚úÖ Loaded: model.onnx")


def _function_tool(name, description, properties, required):
    """Build one function-tool entry with an object-typed parameter schema."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool definitions handed to the chat template.
tools = [
    _function_tool(
        "set_square_color",
        "Sets the color of the square.",
        {"color": {"type": "string"}},
        ["color"],
    ),
    _function_tool("get_square_color", "Gets the color.", {}, []),
]

# Render (without tokenizing) a single user turn plus the generation prompt.
msgs = [{"role": "user", "content": "Change the color to blue"}]
fmt = tokenizer.apply_chat_template(
    msgs,
    tools=tools,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"\nüìù Formatted input length: {len(fmt)} chars")
print("\n‚úÖ Tokenizer and chat template working!")
print("\n‚ö†Ô∏è Full generation test skipped - will test in browser after upload.")

## 11. Upload to Hub

In [None]:
from huggingface_hub import HfApi
import os

# Publish the whole exported model folder to HUB_REPO in a single commit.
model_folder = os.path.join(OUTPUT_DIR, MODEL_ID)
api = HfApi()

# exist_ok=True lets the cell be re-run against a repository that already exists.
api.create_repo(repo_id=HUB_REPO, exist_ok=True)
api.upload_folder(
    repo_id=HUB_REPO,
    folder_path=model_folder,
    commit_message="Upload ONNX model with FP16, Q8 and chat_template",
)

print(
    f"\n‚úÖ Done!",
    f"üîó https://huggingface.co/{HUB_REPO}",
    sep="\n",
)

## 🎉 Usage

```javascript
import { AutoModelForCausalLM, AutoTokenizer } from '@huggingface/transformers';

const model = await AutoModelForCausalLM.from_pretrained(
    'harlley/functiongemma-square-color-ONNX',
    { dtype: 'q8', device: 'webgpu' }  // or 'fp16'
);
```