## llm output json with lm-format-enforcer

### Dependencies

In [None]:
#!pip install torch --index-url https://download.pytorch.org/whl/cu118
#!pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

In [1]:
import torch
print(torch.cuda.is_available())

True


#### Download LLM Model GGUF File

In [2]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
downloaded_model_path = hf_hub_download(repo_id="TheBloke/Llama-2-7b-Chat-GGUF", filename="llama-2-7b-chat.Q5_K_M.gguf")
llm = Llama(model_path=downloaded_model_path)
#downloaded_model_path = hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
#llm = Llama(model_path=downloaded_model_path, n_gpu_layers=48,n_ctx=512,n_threads=16,main_gpu=0,seed=123,verbose=False)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /mnt/datadisk/cache/huggingface/hub/models--TheBloke--Llama-2-7b-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  40

Setting up the prompt for the specific language model. We simplify the implementation a bit as we don't need chat history for this demo.

### Standard Query

In [3]:
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
"""
def get_prompt(message: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'

### LM Format Enforcer Logits Processor

In [None]:
#!pip install lm-format-enforcer huggingface-hub

In [4]:
from typing import Optional
from llama_cpp import LogitsProcessorList
from lmformatenforcer import CharacterLevelParser
from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor
from lmformatenforcer import JsonSchemaParser
from pydantic import BaseModel
from typing import List
from IPython.display import display, Markdown

def display_header(text):
    display(Markdown(f'**{text}**'))

def display_content(text):
    display(Markdown(f'```\n{text}\n```'))

def llamacpp_with_character_level_parser(llm: Llama, prompt: str, character_level_parser: Optional[CharacterLevelParser]) -> str:
    logits_processors: Optional[LogitsProcessorList] = None
    if character_level_parser:
        logits_processors = LogitsProcessorList([build_llamacpp_logits_processor(llm, character_level_parser)])
    
    output = llm(prompt, logits_processor=logits_processors)
    text: str = output['choices'][0]['text']
    return text

#### Example-1  Output a single JSON object

In [5]:
class PlayerSchema(BaseModel):
    first_name: str
    last_name: str
    year_of_birth: int
    num_seasons_in_nba: int

question = 'Please give me information about Michael Jordan. You MUST answer using the following json schema: '
question_with_schema = f'{question}{PlayerSchema.schema_json()}'
prompt = get_prompt(question_with_schema)

/tmp/ipykernel_139569/1468049766.py:8: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  question_with_schema = f'{question}{PlayerSchema.schema_json()}'


In [6]:
display_header("Standard LLM Output:")
result = llamacpp_with_character_level_parser(llm, prompt, None)
display_content(result)

**Standard LLM Output:**


llama_print_timings:        load time = 14317.16 ms
llama_print_timings:      sample time =    43.86 ms /    92 runs   (    0.48 ms per token,  2097.39 tokens per second)
llama_print_timings: prompt eval time = 14317.06 ms /   294 tokens (   48.70 ms per token,    20.53 tokens per second)
llama_print_timings:        eval time = 20549.76 ms /    91 runs   (  225.82 ms per token,     4.43 tokens per second)
llama_print_timings:       total time = 35189.19 ms


```
  Of course! I'd be happy to provide information about Michael Jordan using the provided JSON schema.
{
"first_name": "Michael",
"last_name": "Jordan",
"year_of_birth": 1963,
"num_seasons_in_nba": 15
}
I hope this helps! Let me know if you have any other questions.
```

output from llama-2 7b chat:  
 Of course! I'd be happy to provide information about Michael Jordan using the provided JSON schema.
{
"first_name": "Michael",
"last_name": "Jordan",
"year_of_birth": 1963,
"num_seasons_in_nba": 15
}
I hope this helps! Let me know if you have any other questions.

output from mistral 7b instruct:  
{
"first\_name": "Michael",
"last\_name": "Jordan",
"year\_of\_birth": 1963,
"num\_seasons\_in\_nba": 15
}

In [8]:
display_header("LLM Output with json schema enforcing:")
result = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(PlayerSchema.schema()))
display_content(result)

**LLM Output with json schema enforcing:**

/tmp/ipykernel_139569/975306117.py:2: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  result = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(PlayerSchema.schema()))
Llama.generate: prefix-match hit

llama_print_timings:        load time = 14317.16 ms
llama_print_timings:      sample time =    25.68 ms /    53 runs   (    0.48 ms per token,  2063.62 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 11898.52 ms /    53 runs   (  224.50 ms per token,     4.45 tokens per second)
llama_print_timings:       total time = 12248.30 ms


```
  { "first_name": "Michael", "last_name": "Jordan", "year_of_birth": 1963, "num_seasons_in_nba": 15 }



```

#### Example-2  Output List of JSON Object

In [22]:
message="Q:please give me a list of planets in the solar system? A: "
prompt=get_prompt(message,DEFAULT_SYSTEM_PROMPT)
output = llm(prompt,max_tokens=512,stop=["Q:"])
text: str = output['choices'][0]['text']
display_header("LLM standard output")
print(text)

Llama.generate: prefix-match hit

llama_print_timings:        load time = 14317.16 ms
llama_print_timings:      sample time =    35.35 ms /    83 runs   (    0.43 ms per token,  2347.75 tokens per second)
llama_print_timings: prompt eval time =  1258.37 ms /    23 tokens (   54.71 ms per token,    18.28 tokens per second)
llama_print_timings:        eval time = 18237.59 ms /    82 runs   (  222.41 ms per token,     4.50 tokens per second)
llama_print_timings:       total time = 19745.09 ms


**LLM standard output**

  Of course! I'd be happy to help you with that. The eight planets in our solar system are:
1. Mercury
2. Venus
3. Earth
4. Mars
5. Jupiter
6. Saturn
7. Uranus
8. Neptune

I hope that helps! Let me know if you have any other questions.


In [26]:
## llm
llm = Llama(model_path=downloaded_model_path, n_ctx=4096,n_threads=16,verbose=False)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /mnt/datadisk/cache/huggingface/hub/models--TheBloke--Llama-2-7b-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  40

In [29]:
from typing import List
from pydantic import BaseModel

## schema
class PlanetSchema(BaseModel):
    planet_name: str

class PlanetList(BaseModel):
    planets: List[PlanetSchema]

## question
question = 'please give me a list of planets in the solar system?. You MUST answer using the following json schema: '
question_with_schema = f'{question}{PlanetList.schema_json()}'
prompt = get_prompt(question_with_schema)
#display_content(prompt)

## response
display_header("LLM Output with json schema enforcing:")
result = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(PlanetList.schema()))
display_content(result)

/tmp/ipykernel_139569/2718732823.py:13: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  question_with_schema = f'{question}{PlanetList.schema_json()}'


**LLM Output with json schema enforcing:**

/tmp/ipykernel_139569/2718732823.py:19: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  result = llamacpp_with_character_level_parser(llm, prompt, JsonSchemaParser(PlanetList.schema()))


```
  { "planets": [ { "planet_name": "Mercury" }, { "planet_name": "Venus" }, { "planet_name": "Earth" }, { "planet_name": "Mars" }, { "planet_name": "Jupiter" }, { "planet_name": "Saturn" }, { "planet_name": "Uranus" }, { "planet_name": "Neptune" } ] }






```