# Introduction to granite-common and the Granite RAG Intrinsics Library

This notebook provides a high-level introduction to the `granite-common` library and to the [Granite RAG Intrinsics Library](https://huggingface.co/generative-computing/rag-intrinsics-lib).

You will need a hosted vLLM server to perform inference. See the library above for scripts to host the models on your own server.

Other notebooks in this directory provide a more in-depth treatment of concepts covered
in this notebook:

* Intro to `granite-common` and simple interface to call each intrinsic: [intrinsics_openai.ipynb](./intrinsics_openai.ipynb) and [intrinsics_transformers.ipynb](./intrinsics_transformers.ipynb) 
* Advanced end-to-end Retrieval Augmented Generation flows: [rag.ipynb](./rag.ipynb)

In [1]:
# Imports go in this cell
import pathlib
import os
import json
import openai

from IPython.display import display, Markdown

import granite_common
from granite_common.base.types import (
    ChatCompletion,
    VLLMExtraBody,
)

from granite_common.retrievers.util import download_mtrag_embeddings
from granite_common.retrievers import (
    ElasticsearchRetriever,
    InMemoryRetriever,
    Retriever,
)

In [2]:
# Constants go here
CORPUS_NAMES_MAPPINGS = {
    "banking": "mt-rag-banking-elser-512-100-20250205",
    "clapnq": "mt-rag-clapnq-elser-512-100-20240503",
    "fiqa": "mt-rag-fiqa-beir-elser-512-100-20240501",
    "govt": "mt-rag-govt-elser-512-100-20240611",
    "ibmcloud": "mt-rag-ibmcloud-elser-512-100-20240502",
    "scifact": "mt-rag-scifact-beir-elser-512-100-20240501",
    "telco": "mt-rag-telco-elser-512-100-20241210",
}

DEFAULT_CANNED_RESPONSE = (
    "Sorry, but I am unable to answer this question from the documents retrieved."
)

target_model_name = "granite-3.3-8b-instruct"
base_model_name = f"ibm-granite/{target_model_name}"

# OpenAI compatible server - e.g. vLLM
openai_base_url = "http://localhost:55555/v1"
openai_base_url = "http://p6-r19-n3.bluevela.rmf.ibm.com:55555/v1"
openai_api_key = "rag_intrinsics_1234"

intrinsic_names = [
    "citations",
    "query_rewrite",
    "answerability",
    "hallucination_detection",
    "uncertainty",
]

# retriever_name = "elasticsearch"
retriever_name = "embeddings"
corpus_name = "govt"

if retriever_name == "elasticsearch":
    # Elasticsearch retriever
    elasticsearch_host = "https://localhost:32765"
    elasticsearch_host = "https://ibm_cloud_4ae4bca5_f6aa_43b6_93a3_befbd8fcb0e7:7d325be7af5de8c018b3284d754eb264995a56e4dfeba63fdeb6db1ff37dbd19@dbcc936c-8274-450e-9cb1-44a30ec26d88.c13paqsd05a0ept695ng.databases.appdomain.cloud:32765"
elif retriever_name == "embeddings":
    # Embeddings retriever
    temp_data_dir = "../data/test_retrieval_temp"
    embeddings_data_file = pathlib.Path(temp_data_dir) / f"{corpus_name}_embeds.parquet"
    embedding_model_name = "multi-qa-mpnet-base-dot-v1"

    # Download the indexed corpus if it hasn't already been downloaded.
    # This notebook uses a subset of the government corpus from the MTRAG benchmark.
    embeddings_location = f"{temp_data_dir}/{corpus_name}_embeds.parquet"
    if not os.path.exists(embeddings_location):
        download_mtrag_embeddings(
            embedding_model_name, corpus_name, embeddings_location
        )

In [3]:
# Intrinsics
# Load config files and create objects

intrinsic_rewriters = {}
intrinsic_result_processors = {}
for intrinsic_name in intrinsic_names:
    io_yaml_file = granite_common.intrinsics.util.obtain_io_yaml(
        intrinsic_name, target_model_name
    )

    intrinsic_rewriter = granite_common.IntrinsicsRewriter(config_file=io_yaml_file)
    intrinsic_result_processor = granite_common.IntrinsicsResultProcessor(
        config_file=io_yaml_file
    )

    intrinsic_rewriters[intrinsic_name] = intrinsic_rewriter
    intrinsic_result_processors[intrinsic_name] = intrinsic_result_processor

# Connect to the inference server
client = openai.OpenAI(base_url=openai_base_url, api_key=openai_api_key)

In [4]:
# Retriever

if retriever_name == "elasticsearch":
    # Connect to the Elasticsearch server.
    # Due to the setup, we have to open a retriever connection for each corpus.
    retrievers = {}
    for corpus_name, actual_corpus_name in CORPUS_NAMES_MAPPINGS.items():
        retriever = ElasticsearchRetriever(
            corpus_name=actual_corpus_name,
            host=elasticsearch_host,
            verify_certs=False,
            ssl_show_warn=False,
        )
        retrievers[corpus_name] = retriever
elif retriever_name == "embeddings":
    pass

In [5]:
# Functions


def call_intrinsic(
    intrinsic_name: str,
    chat_completion_request: dict,
    **kwargs,
) -> openai.types.chat.ChatCompletion:
    """
    Call an intrinsic with OpenAI Python API objects on input and output.

    :param intrinsic_name: Name of intrinsic to invoke
    :param chat_completion_request: Chat completion request to make; can be dict or
        OpenAI dataclass
    :param kwargs: Optional named argument(s) for intrinsic

    :returns: OpenAI Python API chat completion containing processed intrinsic outputs
    """
    # Some intrinsics modify the chat object.
    _chat_completion_request = chat_completion_request.model_copy(deep=True)

    rewriter = intrinsic_rewriters[intrinsic_name]
    result_processor = intrinsic_result_processors[intrinsic_name]
    rewritten_request = rewriter.transform(_chat_completion_request, **kwargs)

    # Set model name manually for now, because vLLM does not maintain any kind of
    # metadata that would allow us to determine the right model name.
    rewritten_request.model = intrinsic_name

    response = client.chat.completions.create(**rewritten_request.model_dump())
    # return response
    transformed_response = result_processor.transform(response, rewritten_request)

    # Convert to same type as OpenAI API
    return openai.types.chat.ChatCompletion.model_validate(
        transformed_response.model_dump()
    )


def retrieve_snippets(retriever: Retriever, query: str, top_k: int = 3):
    return retriever.retrieve(query, top_k=top_k)

## granite-common

TODO: Update description of `granite-common`

The `granite-common` library provides input and output processing for large language models.
In this context, *input and output processing* refers to the steps that happen 
immediately before and after low-level model inference. These steps include:

* **Input processing:** Translating application data structures such as messages and 
  documents into a string prompt for a particular model
* **Output processing:** Parsing the raw string output of a language model into 
  structured application data
* **Constrained decoding:** Constraining the raw string output of an LLM to ensure that
  the model's output will always parse into structured application data
* **Inference-time scaling:** Extracting a higher-quality answer from an LLM by 
  combining the results of multiple inference calls.


`granite-common` includes three main types of entry points:
* **Backend connectors** connect the `granite-io` library to different model inference 
  engines and vector databases.
  The other components of `granite-io` use these adapters to invoke model inference with
  exactly the right low-level parameters for each model and inference layer.
* **InputOutputProcessors** provide input and output processing for specific models.
  An InputOutputProcessor exposes a "chat completions" interface, where the input is the
  structured representation of a conversation and the output is the next turn of the
  conversation.
  For some models, such as [IBM Granite 3.3](https://huggingface.co/collections/ibm-granite/granite-33-language-models-67f65d0cca24bcbd1d3a08e3), we also provide
  separate APIs that only perform input processing or output processing.
* **RequestProcessors** rewrite chat completion requests in various ways, such as 
  rewording messages, attaching RAG documents, or filtering documents. You can chain
  one or more RequestProcessors with an InputOutputProcessor to implement a custom 
  inference workflow.

The chat completions API in `granite-common` runs low-level inference on the target
model, passing in raw string prompts and inference paramters and receiving back raw 
string results:

In [6]:
completion = client.completions.create(
    prompt="Complete this sequence: 2, 3, 5, 7, 11, 13, ",
    model=base_model_name,
    temperature=0.0,
    max_tokens=12,
)

print(completion.model_dump_json(indent=2))

{
  "id": "cmpl-2e156362b635482fb8fccfc9364b2c55",
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": "17, 19, 23, ",
      "stop_reason": null,
      "token_ids": null,
      "prompt_logprobs": null,
      "prompt_token_ids": null
    }
  ],
  "created": 1761946464,
  "model": "ibm-granite/granite-3.3-8b-instruct",
  "object": "text_completion",
  "system_fingerprint": null,
  "usage": {
    "completion_tokens": 12,
    "prompt_tokens": 25,
    "total_tokens": 37,
    "completion_tokens_details": null,
    "prompt_tokens_details": null
  },
  "service_tier": null,
  "kv_transfer_params": null
}


Most users don't interact with the low-level backend API directly. The recommended way
to use `granite-common` is via the InputOutputProcessor APIs, which convert high-level 
request into the specific combination of inference paramters that the model needs,
run inference, and then convert the model's raw output into something that an 
application can use directly.

Let's create an example chat completion request so we can show how the high-level 
InputOutputProcessor API works.

In [7]:
chat_input = ChatCompletion.model_validate(
    {
        "messages": [
            {
                "role": "assistant",
                "content": "Welcome to the City of Dublin, CA help desk.",
            },
            {
                "role": "user",
                "content": "Hi there. Can you answer questions about fences?",
            },
            {
                "role": "assistant",
                "content": "Absolutely, I can provide general information about "
                "fences in Dublin, CA.",
            },
            {
                "role": "user",
                "content": "Great. I want to add one in my front yard. Do I need a "
                "permit?",
            },
        ],
        "temperature": 0.0,
        "max_tokens": 4096,
    }
)


def print_chat(c):
    display(
        Markdown(
            "\n".join([f"**{m.role.capitalize()}:** {m.content}\n" for m in c.messages])
        )
    )


print_chat(chat_input)

**Assistant:** Welcome to the City of Dublin, CA help desk.

**User:** Hi there. Can you answer questions about fences?

**Assistant:** Absolutely, I can provide general information about fences in Dublin, CA.

**User:** Great. I want to add one in my front yard. Do I need a permit?


This chat completion request models a scenario where the user is talking to the 
automated help desk for the City of Dublin, CA and has just asked a question about 
permitting for installing fences. Running this chat completion request should produce
an assistant response to this question.

If we pass our chat completion (`chat_input`) to a `granite-common` InputOutputProcessor's 
`create_chat_completion()` method, the InputOutputProcessor will create a string prompt
for the model, set up model-specific generation parameters, invoke model inference, and
parse the model's raw output into a structured message.

Here we create an InputOutputProcessor for the [IBM Granite 3.3](
    https://huggingface.co/ibm-granite/granite-3.3-8b-instruct) model and point that InputOutputProcessor at the backend we used previously.

In [8]:
# Pass the example through Granite to get an answer.
chat_input.model = base_model_name
non_rag_completion = client.chat.completions.create(**chat_input.model_dump())

display(Markdown(non_rag_completion.choices[0].message.content))

Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. The setback is the area between your property line and the street. It's usually 20 feet from the property line for front yard fences. However, these rules can vary based on specific circumstances, so it's always best to check with the Dublin Planning Department or visit their official website for the most accurate and current information.

The model's response here is generic and vague, because the model's training data does 
not cover obscure zoning ordinances of small cities in northern California.

We can use the 
[Uncertainty LoRA](
    https://huggingface.co/generative-computing/core-intrinsics-lib/blob/main/uncertainty/README.md)
model to flag cases such as this one that are not covered by the base model's 
training data. 

This model comes packaged as a LoRA adapter on top of Granite 3.3. To run the model, we
create an instance of `CertaintyIOProcessor` -- the `granite-io` InputOutputProcessor
for this model -- and point this InputOutputProcessor at a Backend that we have
connected to the model's LoRA adapter. Then we can pass the same chat completion request
into the model to compute a certainty score from 0 to 1.0.

In [9]:
response = call_intrinsic("uncertainty", chat_input)
certainty_score = round(json.loads(response.choices[0].message.content)["certainty"], 2)

print(f"Certainty score is {certainty_score} out of 1.0")

Certainty score is 0.48 out of 1.0


The low certainty score indicates that the model's training data does not align closely
with this question.

To answer this question properly, we need to provide the model with domain-specific 
information. One of the most popular ways to add domain-specific information to an LLM
is to use the Retrieval-Augmented Generation (RAG) pattern. RAG involves retrieving
snippets of text from a collection of documents and adding those snippets to the model's
prompt.


In this case, the relevant information can be found in the Government 
corpus of the [MTRAG multi-turn RAG benchmark](https://github.com/IBM/mt-rag-benchmark).
Similar to its connectors for inference backends, `granite-io` has adapters for 
RAG retrieval backends.

Let's spin up a connection in-memory vector database, using embeddings that we've 
precomputed offline from the MTRAG Government corpus.

In [10]:
if retriever_name == "elasticsearch":
    retriever = retrievers[corpus_name]
elif retriever_name == "embeddings":
    retriever = InMemoryRetriever(embeddings_data_file, embedding_model_name)

`granite-io` also includes a RequestProcessor that performs the retrieval phase of
RAG. This class, called `RetrievalRequestProcessor`, takes as input a chat completion
request. The RequestProcessor uses the text of the last user turn to query a `Retriever`
instance and fetch document snippets.

In [11]:
# retrieval_request_proc = RetrievalRequestProcessor(retriever, top_k=3)
# chat_input_with_docs = retrieval_request_proc.process(chat_input)[0]
# chat_input_with_docs.model_dump()

# The database fetches document snippets that match a given query.
# For example, the user's question in the conversation above:
query = chat_input.messages[-1].content
print(f"Query is: '{query}'")
print("Matching document snippets:")
documents = retrieve_snippets(retriever, query, top_k=3)
documents

Query is: 'Great. I want to add one in my front yard. Do I need a permit?'
Matching document snippets:
{'id': '69bc80181880fd8b', 'url': 'https://www.parks.ca.gov/?page_id=29229', 'title': 'Unmanned Aircraft System (Drones) in State Parks', 'begin': 2873, 'end': 5475, 'text': 'Each park unit may have its own posted orders. Even absent a posted order on drones, it is within the discretion of park staff to contact drone operators when drones threaten visitors, property, wildlife, or privacy. If a drone operator continues to fly in a dangerous or reckless manner, they may be asked to stop flying and remove the drone from park boundaries.It is recommended that recreational drone operators consult the Federal Aviation Administration (FAA) rules and regulations on the proper use of recreational drones and use common sense when operating these devices around crowded public areas, wildlife, or historic resources.\nDPR requires compliance with the FAA guidelines for recreational Unmanned Aircra

Unfortunately, the last user turn in this conversation is:
> **User:** Great. I want to add one in my front yard. Do I need a permit?

This text is missing key details for retrieving relevant documents: What does the 
user want to add to their front yard, and what city's municipal code applies to this
yard? As a result, the retrieved documents aren't actually relevant to the user's 
question.

The [LoRA Adapter for Answerability Classification](
    https://huggingface.co/generative-computing/rag-intrinsics-lib/blob/main/answerability/README.md)
provides a robust way to detect this kind of problem. Here's what happens if we 
run the chat completion request with irrelevant document snippets through the 
answerability model, using the
`granite_common` processor for the model to handle input and output:

In [12]:
# Retrieval step from before...
chat_input_with_docs = chat_input.model_copy(deep=True)
chat_input_with_docs.extra_body = VLLMExtraBody(documents=documents)
chat_input_with_docs.model_dump()

# ...followed by an answerability check
response = call_intrinsic("answerability", chat_input_with_docs)
answerability_likelihood = json.loads(response.choices[0].message.content)[
    "answerability_likelihood"
]
answerability_likelihood

2.2696519511450435e-08

We can use use the [LoRA Adapter for Query Rewrite](
    https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib/blob/main/query_rewrite_lora/README.md) to rewrite
the last user turn into a string that is more useful for retrieving document snippets.
`granite-io` includes an InputOutputProcessor for running this model.
Here's how to use this InputOutputProcessor to apply this model to our example 
conversation:

In [13]:
response = call_intrinsic("query_rewrite", chat_input_with_docs)
rewritten_question = json.loads(response.choices[0].message.content)[
    "rewritten_question"
]
rewritten_question

'Do I need a permit to add a fence in my front yard in Dublin, CA?'

The query rewrite model turns the last user turn in this conversation from:
> **User:** Great. I want to add one in my front yard. Do I need a permit?

...to a version of the same question that includes vital additional context:
> **User:** Do I need a permit to add a fence in my front yard in Dublin, CA?

This more specific query should allow the retriever to fetch better document snippets.

The following code snippet uses `granite-io` APIs to rewrite the user query, then
fetch relevant document snippets.

In [14]:
# Redo initialization so this cell can run independently of previous cells

# Rewrite the last user turn into something more suitable for retrieval.
response = call_intrinsic("query_rewrite", chat_input)
rewritten_question = json.loads(response.choices[0].message.content)[
    "rewritten_question"
]

# Retrieve document snippets based on the rewritten turn and attach them to the chat
# completion request.
query = rewritten_question
documents = retrieve_snippets(retriever, query, top_k=3)

chat_input_with_docs = chat_input.model_copy(deep=True)
chat_input_with_docs.extra_body = VLLMExtraBody(documents=documents)

chat_input_with_docs.model_dump()

{'id': 'd1b1394a4f3c5a68', 'url': 'https://dublin.ca.gov/faq.aspx?TID=16', 'title': 'FAQs • Code Enforcement', 'begin': 0, 'end': 2672, 'text': '\n\nFAQs • Code Enforcement\n\n \n\nSkip to Main Content\n\nCreate a Website Account - Manage notification subscriptions, save form progress and more. \xa0\xa0\n\nWebsite Sign In\n\nGovernmentServicesBusinessI Want To...Explore\n\n \n\n \n\n \n\n \n\n \n\n \n\nSearch\n\n \n \n \n\n       \n\n \n \n \n\n \nHomeFAQs\n\nSearch\n\nAll categories\nAnimal Control\nBuilding & Safety \nBusiness Licenses\nCity Clerk\nCity Clerk - Elections\nCity Government\nCode Enforcement\nDisaster Preparedness\nDistrict-Based Elections\nDublin Poet Laureate\nFinance & Administrative Services\nFire Services\nHistory of Dublin\nHousing\nHuman Resources\nPassport Services\nPlanning\nPolice Services\nPublic Works\nSpecial Needs Registry\nWireless\n\nCategories\n\nAll Categories\n\nAnimal Control\n\nBuilding & Safety \n\nBusiness Licenses\n\nCity Clerk\n\nCity Clerk - El

{'messages': [{'content': 'Welcome to the City of Dublin, CA help desk.',
   'role': 'assistant'},
  {'content': 'Hi there. Can you answer questions about fences?',
   'role': 'user'},
  {'content': 'Absolutely, I can provide general information about fences in Dublin, CA.',
   'role': 'assistant'},
  {'content': 'Great. I want to add one in my front yard. Do I need a permit?',
   'role': 'user'}],
 'model': 'ibm-granite/granite-3.3-8b-instruct',
 'extra_body': {'documents': None},
 'temperature': 0.0,
 'max_tokens': 4096}

Attaching relevant information causes the model to respond with a more specific and 
detailed answer. Here's the result that we get when we pass the rewritten chat 
completion request to the InputOutputProcessor for Granite 3.2:

In [15]:
rag_completion = client.chat.completions.create(**chat_input_with_docs.model_dump())
display(Markdown(rag_completion.choices[0].message.content))

Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. The setback is the area between your property line and the street. It's usually 20 feet from the property line for front yard fences. However, these rules can vary based on specific circumstances, so it's always best to check with the Dublin Planning Department or visit their official website for the most accurate and current information.

The answer contains specific details about permits for building fences in Dublin, CA.
These facts should grounded in documents retrieved from the corpus. We would like
to be able to prove that the model used the data from the corpus and did not 
hallucinate a fictitious building code.

We can use the [LoRA Adapter for Citation Generation](
    https://huggingface.co/generative-computing/rag-intrinsics-lib/blob/main/citations/README.md
) to explain exactly how this response is grounded in the documents that the rewritten
user query retrieves. As with the other models we've shown so far, `granite-common` includes
an InputOutputProcessor for this model. We can use this InputOutputProcessor to add
citations to the assistant response from the previous cell:

In [16]:
# Generate a Granite response.
chat_input_citations = chat_input_with_docs.model_copy(deep=True)
chat_input_citations.messages.append(rag_completion.choices[0].message)

response = call_intrinsic("citations", chat_input_citations)
citations = json.loads(response.choices[0].message.content)

print("Assistant response:")
display(Markdown(chat_input_citations.messages[-1].content))
print("Citations:")
print(json.dumps(citations, indent=2))

  PydanticSerializationUnexpectedValue(Expected `UserMessage` - serialized value may not be as expected [field_name='messages', input_value=ChatCompletionMessage(con... reasoning_content=None), input_type=ChatCompletionMessage])
  PydanticSerializationUnexpectedValue(Expected `AssistantMessage` - serialized value may not be as expected [field_name='messages', input_value=ChatCompletionMessage(con... reasoning_content=None), input_type=ChatCompletionMessage])
  PydanticSerializationUnexpectedValue(Expected `ToolResultMessage` - serialized value may not be as expected [field_name='messages', input_value=ChatCompletionMessage(con... reasoning_content=None), input_type=ChatCompletionMessage])
  PydanticSerializationUnexpectedValue(Expected `SystemMessage` - serialized value may not be as expected [field_name='messages', input_value=ChatCompletionMessage(con... reasoning_content=None), input_type=ChatCompletionMessage])
  serialized_value = nxt(self)


Assistant response:


Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. The setback is the area between your property line and the street. It's usually 20 feet from the property line for front yard fences. However, these rules can vary based on specific circumstances, so it's always best to check with the Dublin Planning Department or visit their official website for the most accurate and current information.

Citations:
[]


TODO: Update CitationsWidget()

In [17]:
# citations_io_proc = CitationsIOProcessor(citations_lora_backend)

# # Add the assistant response to the original chat completion request
# input_with_next_message = input.with_next_message(rag_result.results[0].next_message)

# # Augment this response with citations to the RAG document snippets
# results_with_citations = citations_io_proc.create_chat_completion(
#     input_with_next_message
# )
# CitationsWidget().show(input_with_next_message, results_with_citations)

We can also use the [LoRA Adapter for Hallucination Detection in RAG outputs](
    https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib/blob/main/hallucination_detection_lora/README.md
) to check whether each sentence of the assistant response is consistent with the
information in the retrieved documents.

In [18]:
# Generate a Granite response.
chat_input_hallucinations = chat_input_with_docs.model_copy(deep=True)
chat_input_hallucinations.messages.append(rag_completion.choices[0].message)

response = call_intrinsic("hallucination_detection", chat_input_hallucinations)
hallucinations = json.loads(response.choices[0].message.content)

print("Assistant response:")
display(Markdown(chat_input_hallucinations.messages[-1].content))
print("Hallucination Checks:")
print(json.dumps(hallucinations, indent=2))

Assistant response:


Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. The setback is the area between your property line and the street. It's usually 20 feet from the property line for front yard fences. However, these rules can vary based on specific circumstances, so it's always best to check with the Dublin Planning Department or visit their official website for the most accurate and current information.

Hallucination Checks:
[
  {
    "response_begin": 0,
    "response_end": 154,
    "response_text": "Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. ",
    "faithfulness_likelihood": 0.1316583594433584,
    "explanation": "This sentence makes a factual claim about the need for a permit in Dublin, California for new fences. However, the provided context does not contain any information about fences or permits in Dublin, CA. Therefore, the faithfulness of this statement cannot be determined based on the provided context."
  },
  {
    "response_begin": 154,
    "response_end": 380,
    "response_text": "The setback is the area between your property line and the street. ",
    "faithfulness_likelihood": 0.0002695652632046661,
    "explanation": "This sentence makes a factual claim about the need for a permit in Dublin, California for new fences. However, the provided context does not contai

The `granite-common` library also allows developers to create their own custom 
InputOutputProcessors. For example, here's an InputOutputProcessor that rolls up the rewrite, retrieval, and citations processing steps from this notebook into a single custom `chat_completion()` call:

In [19]:
import json
import openai

from IPython.display import display, Markdown

from granite_common.base.types import (
    ChatCompletion,
    ChatCompletionResponse,
    VLLMExtraBody,
)
from granite_common.retrievers.elasticsearch import ElasticsearchRetriever


class MyRAGIOProcessor:
    def __init__(
        self,
        client: openai.OpenAI,
        retriever: dict[str, ElasticsearchRetriever],
    ):
        self.client = client
        self.retriever = retriever

    def call_intrinsic(
        self,
        intrinsic_name: str,
        chat_completion_request: dict,
        **kwargs,
    ) -> openai.types.chat.ChatCompletion:
        """
        Call an intrinsic with OpenAI Python API objects on input and output.

        :param intrinsic_name: Name of intrinsic to invoke
        :param chat_completion_request: Chat completion request to make;
            can be dict or OpenAI dataclass
        :param kwargs: Optional named argument(s) for intrinsic

        :returns: OpenAI Python API chat completion containing processed
            intrinsic outputs
        """
        # Some intrinsics modify the chat object.
        _chat_completion_request = chat_completion_request.model_copy(deep=True)

        rewriter = intrinsic_rewriters[intrinsic_name]
        result_processor = intrinsic_result_processors[intrinsic_name]
        rewritten_request = rewriter.transform(_chat_completion_request, **kwargs)

        # Set model name manually for now, because vLLM does not maintain any kind of
        # metadata that would allow us to determine the right model name.
        rewritten_request.model = intrinsic_name

        response = self.client.chat.completions.create(**rewritten_request.model_dump())
        # return response
        transformed_response = result_processor.transform(response, rewritten_request)

        # Convert to same type as OpenAI API
        return openai.types.chat.ChatCompletion.model_validate(
            transformed_response.model_dump()
        )

    def retrieve_snippets(
        self, retriever: Retriever, query: str, top_k: int = 3
    ) -> dict:
        return retriever.retrieve(query, top_k=top_k)

    def chat_completion(
        self,
        chat_input: ChatCompletion,
    ) -> ChatCompletionResponse:
        """Placeholder for a proper IO processor base class."""

        chat_input_with_docs = chat_input.model_copy(deep=True)

        # Rewrite the last user turn for retrieval
        response = self.call_intrinsic("query_rewrite", chat_input_with_docs)
        rewritten_question = json.loads(response.choices[0].message.content)[
            "rewritten_question"
        ]

        # Retrieve documents with the rewritten last turn
        query = rewritten_question
        documents = self.retrieve_snippets(self.retriever, query, top_k=3)
        chat_input_with_docs.extra_body = VLLMExtraBody(documents=documents)

        # Generate a response
        rag_completion = self.client.chat.completions.create(
            **chat_input_with_docs.model_dump()
        )
        chat_response = chat_input_with_docs.model_copy(deep=True)
        chat_response.messages.append(rag_completion.choices[0].message)

        # Generate citations
        chat_input_citations = chat_response.model_copy(deep=True)

        response = self.call_intrinsic("citations", chat_input_citations)
        citations = json.loads(response.choices[0].message.content)

        return chat_response, citations

We can wrap all of the functionality we've shown so far in a single class that 
inherits from the `InputOutputProcessor` interface in `granite-common`. Packaging things
this way lets applications treat this multi-step flow as if it was a single chat 
completion request to a base model.

In [20]:
chat_input = ChatCompletion.model_validate(
    {
        "messages": [
            {
                "role": "assistant",
                "content": "Welcome to the City of Dublin, CA help desk.",
            },
            {
                "role": "user",
                "content": "Hi there. Can you answer questions about fences?",
            },
            {
                "role": "assistant",
                "content": "Absolutely, I can provide general information about "
                "fences in Dublin, CA.",
            },
            {
                "role": "user",
                "content": "Great. I want to add one in my front yard. Do I need a "
                "permit?",
            },
        ],
        "temperature": 0.0,
        "max_tokens": 4096,
        "model": base_model_name,
    }
)

rag_io_proc = MyRAGIOProcessor(
    client=client,
    retriever=retriever,
)

rag_completion, citations = rag_io_proc.chat_completion(chat_input)

print("Assistant response:")
display(Markdown(rag_completion.messages[-1].content))
print("Citations:")
print(json.dumps(citations, indent=2))
# CitationsWidget().show(input_with_next_message, rag_results)

{'id': 'd1b1394a4f3c5a68', 'url': 'https://dublin.ca.gov/faq.aspx?TID=16', 'title': 'FAQs • Code Enforcement', 'begin': 0, 'end': 2672, 'text': '\n\nFAQs • Code Enforcement\n\n \n\nSkip to Main Content\n\nCreate a Website Account - Manage notification subscriptions, save form progress and more. \xa0\xa0\n\nWebsite Sign In\n\nGovernmentServicesBusinessI Want To...Explore\n\n \n\n \n\n \n\n \n\n \n\n \n\nSearch\n\n \n \n \n\n       \n\n \n \n \n\n \nHomeFAQs\n\nSearch\n\nAll categories\nAnimal Control\nBuilding & Safety \nBusiness Licenses\nCity Clerk\nCity Clerk - Elections\nCity Government\nCode Enforcement\nDisaster Preparedness\nDistrict-Based Elections\nDublin Poet Laureate\nFinance & Administrative Services\nFire Services\nHistory of Dublin\nHousing\nHuman Resources\nPassport Services\nPlanning\nPolice Services\nPublic Works\nSpecial Needs Registry\nWireless\n\nCategories\n\nAll Categories\n\nAnimal Control\n\nBuilding & Safety \n\nBusiness Licenses\n\nCity Clerk\n\nCity Clerk - El

Yes, in Dublin, California, you typically need a permit for any new fence that exceeds 6 feet in height or is located within the front yard setback. The setback is the area between your property line and the street. It's usually 20 feet from the property line for front yard fences. However, these rules can vary based on specific circumstances, so it's always best to check with the Dublin Planning Department or visit their official website for the most accurate and current information.

Citations:
[]
