In [10]:
import os

import numpy as np
import torch
import xgrammar as xgr
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM


* Reference: XGrammar how-to guide for JSON generation — https://xgrammar.mlc.ai/docs/how_to/json_generation.html#how-to-json-generation

In [4]:
# Path (or hub id) of the checkpoint to load. Override it with the
# XGR_DEMO_MODEL_ID environment variable so the notebook is not tied to one
# machine's directory layout; the default keeps the original local path.
model_id = os.environ.get(
    "XGR_DEMO_MODEL_ID", "/Users/id4thomas/models/Llama-3.2-1B-Instruct"
)

In [5]:
# Load the model config and the tokenizer from the same checkpoint.
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Take the vocab size from the model config rather than the tokenizer:
# it can exceed tokenizer.vocab_size because of padding.
full_vocab_size = config.vocab_size
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
    tokenizer,
    vocab_size=full_vocab_size,
)

# Grammar compiler bound to this tokenizer info (at most 8 threads).
compiler = xgr.GrammarCompiler(
    tokenizer_info,
    max_threads=8,
)

In [6]:
from pydantic import BaseModel

# Output schema for constrained generation: an object with a string `name`
# and an integer `age`. Documented with comments rather than a class
# docstring so the model definition handed to the compiler stays untouched.
class Person(BaseModel):
    name: str
    age: int

# Compile the Person schema into a grammar with the compiler built above;
# the result is what the matcher and the logits processor are built from.
compiled_grammar = compiler.compile_json_schema(Person)

In [7]:
# Sanity check: display the compiled grammar object (shows only its repr).
compiled_grammar

<xgrammar.compiler.CompiledGrammar at 0x31a7d2fe0>

In [8]:
# Allocate a token bitmask sized to the tokenizer info's vocab size
# (the leading 1 is presumably the batch size — TODO confirm), and build a
# grammar matcher over the compiled Person grammar.
# NOTE(review): neither name is referenced by the other visible cells;
# generation goes through the Hugging Face logits processor instead —
# confirm whether this cell is still needed.
token_bitmask = xgr.allocate_token_bitmask(
    1,
    tokenizer_info.vocab_size,
)
matcher = xgr.GrammarMatcher(compiled_grammar)

In [9]:
# Hugging Face-compatible logits processor built from the compiled grammar.
# NOTE(review): XGrammar's docs appear to describe this processor as
# single-use per generate() call — confirm, and re-run this cell before
# re-running generation.
xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)

In [16]:
# Pick the best available device instead of assuming Apple-Silicon MPS, so
# the notebook also runs on CUDA or CPU-only machines. MPS is still preferred
# when present, which preserves the original behavior on a Mac.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the causal LM in bfloat16 and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
).to(device)

# Chat turns: a generic system prompt plus the user request for JSON output.
system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {
    "role": "user",
    "content": "Introduce yourself in JSON with two fields: name and age.",
}
messages = [system_message, user_message]

# Render the chat template to plain text (with the generation prompt
# appended), then tokenize to PyTorch tensors on the model's device.
texts = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer(texts, return_tensors="pt").to(model.device)

In [17]:
# Build a fresh grammar-constrained logits processor for this generate() call.
# The processor carries matcher state across decoding steps, so reusing one
# left over from an earlier run (e.g. when re-running only this cell) is not
# supported.
xgr_logits_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)

# Number of prompt tokens; generate() returns the prompt followed by the
# newly generated tokens.
prompt_length = len(model_inputs.input_ids[0])

output_ids = model.generate(
    **model_inputs, max_new_tokens=512, logits_processor=[xgr_logits_processor]
)

# Keep only the newly generated tokens of the single sequence in the batch.
generated_ids = output_ids[0][prompt_length:]
print(tokenizer.decode(generated_ids, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{"name": "Assistant", "age": 0}
