# (WIP) Evaluation for Citation (Unsupervised)

## 1. Format of citations

- Output parser
  - May need tuning for coherence
- Agent
  - https://python.langchain.com/v0.1/docs/modules/agents/how_to/agent_structured/
  - Benefit: parsing errors may be easier to handle

## 2. Evaluate citations

### M1: Sentence(s) to Sentence(s)

- Requires a regular expression to split the text from its citations
  - This should be trivial if the citation format from section 1 is strictly enforced
- Pros:
  - Accuracy will be higher (??)
  - Reasoning will be clearer
  - Validation is easier

### M2: LLM-based

- Framework: `DeepEval`
- Use another distilled model to evaluate the generated answer with citations

### References

- https://christophergs.com/blog/ai-engineering-evaluation-with-deepeval-and-open-source-models

## 3. Improve citations

### M1: NLI

### M2: Error handling

- Ask for re-generation


In [1]:
from pydantic import BaseModel, Field
from typing import List
from pydantic.dataclasses import dataclass


# One generated span of text together with the single context ID cited for it.
# Declared as a frozen pydantic dataclass, so instances cannot be mutated
# after creation.
# NOTE: documented with `#` comments instead of a docstring so that the JSON
# schema pydantic derives from this class is left exactly as it is today.
@dataclass(frozen=True)
class TextWithCitation:
    # The generated text that the citation below supports.
    text: str = Field(description="Generated Text")
    # ID of the cited context document. Declared as an int here, whereas
    # Context.id elsewhere in this file is a str.
    citation: int = Field(
        description="Context ID that is the in-text citation for the generated text"
    )
    # TODO: add a custom validator restricting `citation` to the IDs of the
    # documents that were actually supplied as context.
    # TODO: add a custom validator checking `text` against the cited context.


# Top-level answer schema handed to the output parser: a list of cited text
# spans under the single key `answer`.
# NOTE: documented with `#` comments instead of a docstring so that the JSON
# schema pydantic derives from this class keeps its current shape.
class AnswerWithCitation(BaseModel):
    answer: List[TextWithCitation] = Field(
        description="List of objects of type TextWithCitation",
        # `name` is not a Field parameter in pydantic v2; passing it as a bare
        # keyword goes through the deprecated "extra kwargs" path. Supplying
        # it via json_schema_extra keeps the same `"name": "answer"` entry in
        # the generated schema without the deprecation warning.
        json_schema_extra={"name": "answer"},
        # Alternative considered: describe the field with a literal example,
        # e.g. description='[{"text":"", "citation":""}]'.
    )

In [2]:
import os

from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Read the Hugging Face token; a missing variable raises KeyError right here.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Repo ID of the model that generates the cited answer.
generation_model = "mistralai/mixtral-8x7b-instruct-v0.1"
# Repo ID of the model used as the evaluator.
eval_model = "microsoft/Phi-3-mini-4k-instruct"
# Prompt text with three placeholders: {question}, {context} and
# {format_instructions}. The wording is sent to the model verbatim, so any
# edit to it changes generation behaviour.
template = """
You are expert in extracting key information from a given set of documents to answer a question.

I want you to only use the given documents given to answer the question given. For each sentence, include the document ID that you have used to generate the sentence. Do not use anything outside of the given documents.
Say "I don't know" if you cannot answer the question with the given documents.

# Question
{question}

# Documents
{context}

{format_instructions}

# Answer
"""

In [6]:
from pydantic import BaseModel


# A single context document that can be cited in an answer.
class Context(BaseModel):
    # Document identifier, kept as a string; it is converted with int() where
    # this file builds its ID -> text lookup.
    id: str
    # Text content of the document.
    doc: str


# One evaluation sample: a question, its reference answer and the documents
# that may be used to answer it.
class Dataset(BaseModel):
    # Question posed to the generation model.
    question: str
    # Reference answer; get_sample_data fills it from the file's
    # "answer_with_citation" entry.
    ground_truth: str
    # Context documents available for answering and citing.
    docs: List[Context]


def get_sample_data(path: str) -> Dataset:
    """Load a JSON sample file and convert it into a validated ``Dataset``.

    The file must contain the keys ``question``, ``answer_with_citation`` and
    ``docs``; every entry of ``docs`` must contain ``id`` and ``text``.

    Args:
        path: Location of the JSON sample file.

    Returns:
        A ``Dataset`` whose ``ground_truth`` is the file's
        ``answer_with_citation`` value and whose ``docs`` hold one ``Context``
        per entry of the file's ``docs`` list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the expected keys is missing.
        pydantic.ValidationError: If a value does not match the model types.
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)

    # Only fields declared on Dataset are passed. The previous extra
    # "answer_with_citation" key was not a Dataset field and was discarded
    # during validation, so dropping it does not change the result.
    return Dataset.model_validate(
        {
            "question": raw["question"],
            "ground_truth": raw["answer_with_citation"],
            "docs": [
                Context.model_validate({"id": entry["id"], "doc": entry["text"]})
                for entry in raw["docs"]
            ],
        }
    )


# Load the sample used by the following cells. The path is relative, so it is
# resolved against the current working directory.
data = get_sample_data("../sample/data/raw_q1.json")

In [7]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain.agents import AgentExecutor
from langchain_core.exceptions import OutputParserException
import logging

# Remote generation model served through the Hugging Face endpoint.
llm = HuggingFaceEndpoint(
    repo_id=generation_model,
    temperature=0.01,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

# Parses the raw model output into an AnswerWithCitation instance.
parser = PydanticOutputParser(pydantic_object=AnswerWithCitation)

# The parser-generated instructions are not used:
# format_instructions = parser.get_format_instructions()

# Hand-written format instructions: the JSON schema plus one well-formed and
# one malformed example object.
# NOTE(review): this string duplicates the AnswerWithCitation schema and must
# be updated by hand whenever that model changes. The ``` fence it opens is
# never closed, and there is no space after "below." - both reach the model
# as written.
format_instructions = """
The output should be formatted as a JSON instance that conforms to the JSON schema below.Here is the output schema:\n```\n{"$defs": {"TextWithCitation": {"properties": {"text": {"description": "Generated Text", "title": "Text", "type": "string"}, "citation": {"description": "Context ID that is the in-text citation for the generated text", "title": "Citation", "type": "integer"}}, "required": ["text", "citation"], "title": "TextWithCitation", "type": "object"}}, "properties": {"answer": {"description": "List of objects of type TextWithCitation", "items": {"$ref": "#/$defs/TextWithCitation"}, "name": "answer", "title": "Answer", "type": "array"}}, "required": ["answer"]}\nAs an example, the object {"answer": [{"text": "bar", "citation": 1}]} is a well-formatted instance of the schema. The object {[{"text": "bar", "citation": 1}]} is not well-formatted"""

# `question` and `context` are supplied per call; `format_instructions` is
# bound once here as a partial variable.
prompt = PromptTemplate(
    template=template,
    input_variables=["question", "context"],
    partial_variables={"format_instructions": format_instructions},
)

# Pipeline: fill the prompt, call the generation model, parse the output into
# an AnswerWithCitation.
chain = prompt | llm | parser
# The documents are passed to the prompt as a list of plain dicts.
context_json = [doc.model_dump() for doc in data.docs]

# Reset first so that a failed run cannot leave a stale `response` from an
# earlier execution of this cell for the following cells to pick up.
response = None
try:
    response = chain.invoke({"question": data.question, "context": context_json})
except OutputParserException as parser_exception:
    # OutputParserException has no `.message` attribute, and `llm_output` may
    # be None; lazy %s formatting handles both without raising inside the
    # handler.
    logging.error("OutputParserException: %s", parser_exception)
    logging.error("Error parsing message: %s", parser_exception.llm_output)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/qingru/.cache/huggingface/token
Login successful


In [5]:
# Render the parsed answer as one string: each span's text immediately
# followed by its citation as an inline "[id]" marker. str.join builds the
# result in one pass instead of repeated string concatenation.
concatenated_answer = "".join(
    f"{ans.text}[{ans.citation}]" for ans in response.answer
)

# Bare expression so the notebook displays the value.
concatenated_answer

'Maintaining a balanced weight is one of the key benefits of a healthy diet.[1]A diet rich in fruits, vegetables, and whole grains can significantly improve heart health by reducing cholesterol levels.[2]Boosting the immune system can be achieved by consuming vitamins and minerals found in a balanced diet.[3]Mental health benefits from a healthy diet include improved mood and cognitive function.[4]Chronic diseases such as diabetes and hypertension can be prevented with a balanced and nutritious diet.[5]Proper hydration is also a component of a healthy diet, which helps in maintaining bodily functions.[6]A diet that includes a variety of foods can ensure the body gets a range of essential nutrients.[7]Nutrient-dense foods like leafy greens, berries, and nuts can provide significant health benefits.[8]'

In [6]:
# Lookup table from integer document ID to document text. Context.id is a
# string, so it is converted to int to match TextWithCitation.citation.
context_dict = {int(doc.id): doc.doc for doc in data.docs}

In [24]:
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from transformers import AutoModelForCausalLM, AutoTokenizer

## NOTE: DeepEval metrics need a local model (weights downloaded directly).
# Requires PyTorch - install via conda (https://pytorch.org/get-started/locally/)
tokenizer = AutoTokenizer.from_pretrained(eval_model)
# NOTE(review): this rebinds `llm`, which earlier in the file names the
# HuggingFaceEndpoint generation model; from here on `llm` is the local
# evaluation model.
# NOTE(review): trust_remote_code=True executes model code fetched from the
# hub and no revision is pinned - consider pinning one.
llm = AutoModelForCausalLM.from_pretrained(eval_model, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|██████████| 2/2 [03:04<00:00, 92.49s/it] 
Loading checkpoint sha

In [25]:
from deepeval.models.base_model import DeepEvalBaseLLM


class CustomModel(DeepEvalBaseLLM):
    """Adapter exposing a local Hugging Face causal LM through DeepEval's LLM interface.

    Args:
        model: A loaded causal language model providing ``generate`` and ``to``.
        tokenizer: The tokenizer that belongs to ``model``.
        device: Device the inputs and the model are moved to before generating.
        max_new_tokens: Upper bound on the number of tokens generated per call.
    """

    def __init__(self, model, tokenizer, *, device: str = "cpu", max_new_tokens: int = 100):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_new_tokens = max_new_tokens

    def load_model(self):
        """Return the wrapped model instance."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Generate a completion for ``prompt`` and return only the new text.

        Args:
            prompt: The full prompt text to complete.

        Returns:
            The decoded completion, without the echoed prompt and without
            special tokens.
        """
        model = self.load_model()
        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        model.to(self.device)

        generated_ids = model.generate(
            **model_inputs, max_new_tokens=self.max_new_tokens, do_sample=True
        )
        # A causal LM returns the prompt tokens followed by the newly generated
        # ones. Drop the prompt part so the caller receives only the model's
        # answer rather than its own instructions echoed back.
        prompt_length = model_inputs["input_ids"].shape[1]
        completion_ids = generated_ids[:, prompt_length:]
        return self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``.

        The call is not offloaded, so it blocks the event loop while the model
        is generating.
        """
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Custom Model"

In [26]:
# Wrap the locally loaded evaluation model and tokenizer for use with DeepEval.
custom_eval_model = CustomModel(model=llm, tokenizer=tokenizer)

In [28]:
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Minimal hard-coded example used to smoke-test the faithfulness metric; it
# does not use the generated answer from the earlier cells.
actual_output = "We offer a 30-day full refund at no extra cost."

retrieval_context = [
    "All customers are eligible for a 30 day full refund at no extra cost."
]

# TODO: this model does not meet the metric's output requirements. This seems
# to be a common, still unsolved issue for Hugging Face models
# (https://github.com/confident-ai/deepeval/issues/743).
metric = FaithfulnessMetric(threshold=0.7, model=custom_eval_model, include_reason=True)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output,
    retrieval_context=retrieval_context,
)

# TODO: this call does not return; the event loop appears to keep running
# (the recorded run ended with KeyboardInterrupt).
# NOTE(review): CustomModel.a_generate delegates to the blocking generate();
# check whether running the metric synchronously avoids the hang - confirm
# against the DeepEval documentation.
metric.measure(test_case)
print(metric.score)
print(metric.reason)

# Alternatively, evaluate test cases in bulk.
evaluate([test_case], [metric])

KeyboardInterrupt: 