# Demonstration of the Granite answer relevancy intrisic

This notebook shows the usage of the IO processor for the Granite answer relevancy intrisic, 
also known as the [LoRA Adapter for Answer Relevancy Classification](
    https://huggingface.co/ibm-granite/granite-3.3-8b-lora-rag-answer-relevance-classifier
)

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_base_model_name` and `openai_lora_model_name`.

In [None]:
# Imports go here
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io import UserMessage, make_backend
from granite_io.io.granite_3_3.input_processors.granite_3_3_input_processor import (
    Granite3Point3Inputs,
)
from granite_io.io.answer_relevance import AnswerRelevanceIOProcessor

In [None]:
# Constants go here
base_model_name = "ibm-granite/granite-3.3-8b-instruct"
lora_model_name = (
    "/proj/dmfexp/8cc/huaiyu/data/answer_relevance/"
    "granite-3.3/lora/250719/output/checkpoint-7300/"
)
run_server = False

In [8]:
if run_server:
    # Start by firing up a local vLLM server and connecting a backend instance to it.
    server = LocalVLLMServer(
        base_model_name, lora_adapters=[(lora_model_name, lora_model_name)]
    )
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()
else:  # if not run_server
    # Use an existing server.
    # Modify the constants here as needed.
    openai_base_url = "http://p5-r10-n3.bluevela.rmf.ibm.com:36101/v1"
    # openai_base_url = "http://localhost:55555/v1"
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name
    backend = make_backend(
        "openai",
        {
            "model_name": openai_base_model_name,
            "openai_base_url": openai_base_url,
            "openai_api_key": openai_api_key,
        },
    )
    lora_backend = make_backend(
        "openai",
        {
            "model_name": openai_lora_model_name,
            "openai_base_url": openai_base_url,
            "openai_api_key": openai_api_key,
        },
    )

In [15]:
# Create an example chat completion with a user question and two documents.
chat_input0 = Granite3Point3Inputs.model_validate(
    {
        "messages": [
            {"role": "user", "content": "Who was in the meeting?"},
        ],
        "generate_inputs": {"temperature": 0.0},
    }
)

variations = [
    ("Alice and Jeff.", "Pertinent"),
    ("I don't know.", "Relevant but not complete"),
    ("The document does not contain this information", "Relevant but not complete"),
    ("The meeting was attended by many people.", "Too vague or generic"),
    (
        "The product planning meeting?  That had a good attendence.",
        "Too vague or generic",
    ),
    ("There will be a follow up meeting.", "No attempt"),
    (
        "Alice and Bob attended.  The meeting lasted 2 hours.",
        "Excessive irrelevant information",
    ),
    (
        "Alice and Jeff attended. \nQuestion: When is the meeting?  10:30am.",
        "Excessive irrelevant information",
    ),
]

chat_input = chat_input0.__deepcopy__()
chat_input.messages.append(
    {
        "role": "assistant",
        "content": variations[0][0],
    }
)

chat_input

Granite3Point3Inputs(messages=[UserMessage(content='Who was in the meeting?', role='user'), {'role': 'assistant', 'content': 'Alice and Jeff.'}], tools=[], generate_inputs=GenerateInputs(prompt=None, model=None, best_of=None, echo=None, frequency_penalty=None, logit_bias=None, logprobs=None, max_tokens=None, n=None, presence_penalty=None, stop=None, stream=None, stream_options=None, suffix=None, temperature=0.0, top_p=None, user=None, extra_headers=None, extra_body={}), documents=[], controls=None, thinking=False, sanitize=None)

In [16]:
# Instantiate the I/O processor for the answer relevancy LoRA adapter
io_proc = AnswerRelevanceIOProcessor(lora_backend)

# Pass our example input thorugh the I/O processor and retrieve the result
chat_result = await io_proc.acreate_chat_completion(chat_input)
chat_result.results[0].next_message.content

  PydanticSerializationUnexpectedValue(Expected `UserMessage` - serialized value may not be as expected [input_value={'role': 'assistant', 'co...ent': 'Alice and Jeff.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `AssistantMessage` - serialized value may not be as expected [input_value={'role': 'assistant', 'co...ent': 'Alice and Jeff.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `ToolResultMessage` - serialized value may not be as expected [input_value={'role': 'assistant', 'co...ent': 'Alice and Jeff.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `SystemMessage` - serialized value may not be as expected [input_value={'role': 'assistant', 'co...ent': 'Alice and Jeff.'}, input_type=dict])
  serialized_value = nxt(self)


<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.
Today's Date: July 22, 2025.
You are Granite, developed by IBM. You are a helpful AI assistant.<|end_of_text|>
<|start_of_role|>user<|end_of_role|>Who was in the meeting?<|end_of_text|>
<|start_of_role|>assistant<|end_of_role|>Alice and Jeff.<|end_of_text|>
<|start_of_role|>user<|end_of_role|>answer_relevance<|end_of_text|>


APIConnectionError: Connection error.

In [None]:
# Try some variations on the answers


for variation in variations:
    updated_messages = chat_input.messages.copy()
    updated_messages[-1] = UserMessage(content=variation)
    chat_result = await io_proc.acreate_chat_completion(
        chat_input.model_copy(update={"messages": updated_messages})
    )
    print(f"'{variation}' => {chat_result.results[0].next_message.content}")

In [None]:
# Free up GPU resources
if "server" in locals():
    server.shutdown()