# Evaluate SR output - hallucinations and faithfulness

In [1]:
import os
import sys

# Make the in-repo `semantic_retrieval` package importable without hard-coding
# one developer's absolute path. This assumes the kernel's working directory is
# this notebook's directory (python/src/semantic_retrieval/examples/financial_report),
# so python/src is three levels up.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))

## Reproduce reports

In [2]:
# in semantic-retrieval/
# Reproduce net income report
!python python/src/semantic_retrieval/examples/financial_report/generate_report.py --retrieval-query="annual net income 2022" --data-extraction-prompt="annual net income in 2022" --overfetch-factor=20

python: can't open file '/Users/jonathan/Projects/semantic-retrieval/python/src/semantic_retrieval/examples/financial_report/python/src/semantic_retrieval/examples/financial_report/generate_report.py': [Errno 2] No such file or directory


In [3]:
# Reproduce covid report
# in semantic-retrieval/
!python python/src/semantic_retrieval/examples/financial_report/generate_report.py --retrieval-query="covid 19 impact" --data-extraction-prompt="covid 19 impact" --overfetch-factor=20 --log-level=DEBUG --client-name=client_a

python: can't open file '/Users/jonathan/Projects/semantic-retrieval/python/src/semantic_retrieval/examples/financial_report/python/src/semantic_retrieval/examples/financial_report/generate_report.py': [Errno 2] No such file or directory


## Example eval input: 
* Client portfolio database (local CSV)
* Intermediate SR results: raw retrieved document chunks
* SR final output saved in local files

In [4]:
import sys
import pandas as pd

from semantic_retrieval.retrieval.csv_retriever import CSVRetriever
from semantic_retrieval.common import types
from semantic_retrieval.evaluation.lib import (
    LocalFileSystemGenLLMEvalDataset,
    evaluate_llm_eval_dataset_local_filesystem,
    evaluate_sample_local_filesystem,
    local_filesystem_dataset_to_df,
    # LocalFileSystemNumericalEvalDatasetConfig,
    LocalFileSystemIDSetPairEvalDatasetConfig,
    file_contents,
    # IDSetPairEvalDataPathMuncher,
    IDSetPairEvalDataset,
)

import json

from semantic_retrieval.evaluation.metrics import jaccard_similarity

import glob
import os




# Root of the example financial-report data, relative to the notebook's directory.
ROOT_DATA_DIR = "../../../../../examples/example_data/financial_report/"

# Sub-directory holding the saved retrieval chunks and generated reports.
ARTIFACTS = os.path.join(ROOT_DATA_DIR, "artifacts")

print("Ready")

Ready


### Define evaluation test cases
* The first two test the final output against the raw retrieved data
  for faithfulness (LLM hallucination), and the second two test e2e
  for faithfulness against the input structured data (portfolio).
* In a production system, the equivalent of this
    would be written to an RDB system automatically
    during retrieval system operation.

## Configuration & Customization

In [5]:
# Test data: Define the input and output data to compare.

In [6]:
# Each test case is a (name, input_path, output_path) triple.
#
# Tests 1-2 check hallucination / faithfulness by comparing the LLM's data
# input (raw retrieved chunks) with its output (the generated report).
_RETRIEVED_CASES = [
    ("net_income_vs_retrieved", "raw_retrieved_chunks_10k_net_income.json", "portfolio_10k_net_income_report.txt"),
    ("covid_vs_retrieved", "raw_retrieved_chunks_10k_covid.json", "portfolio_10k_covid_report.txt"),
]

# Tests 3-4 check end-to-end correctness against the application's
# structured DB (client portfolio).
_E2E_CASES = [
    ("net_income_e2e", "portfolios/sarmad_portfolio.csv", "portfolio_10k_net_income_report.txt"),
    ("covid_e2e", "portfolios/sarmad_portfolio.csv", "portfolio_10k_covid_report.txt"),
]

# Inputs are resolved against a per-group base directory; outputs always live
# under ARTIFACTS.
TEST_CASES = [
    (case_name, os.path.join(input_base, input_rel), os.path.join(ARTIFACTS, output_rel))
    for input_base, case_group in ((ARTIFACTS, _RETRIEVED_CASES), (ROOT_DATA_DIR, _E2E_CASES))
    for case_name, input_rel, output_rel in case_group
]

In [7]:
# Define custom data munchers: 
# take the raw test input/output pairs,
# and return a standard data structure for that pair
# which can then be passed to standard (or custom) measurement functions.

import re

from typing import List, Tuple

def parse_raw_re(s_out: str) -> List[str]:
    """Extract one ticker-like token per line from a generated report.

    A token is a run of letters (case-insensitive) immediately followed by an
    apostrophe or a closing parenthesis, e.g. ``AAPL's`` or ``(AAPL)``. The
    match consumes the rest of the line, so at most the first such token on
    each line is returned.

    Args:
        s_out: Raw text of the generated report.

    Returns:
        The matched tokens in order of appearance. An empty list is returned
        when nothing matches or when matching fails (for example, ``s_out``
        is not a string).
    """
    # Local import: no logger is defined at notebook level.
    import logging

    try:
        return re.findall(
            # r"([A-Z]+)[.*]*([ \$])([\d,\.]+)( *)(million|billion)?",
            # `(?:\n|$)` instead of a bare `\n` so the last line is still
            # matched when the report does not end with a newline.
            r"([A-Z]+)['\)].*(?:\n|$)",
            s_out,
            flags=re.IGNORECASE,
        )
    except Exception as e:
        # Best-effort parsing: report the problem and fall back to "no tickers".
        logging.getLogger(__name__).warning(
            "parse output exn, returning empty list, exn=%s", e
        )
        return []


async def portfolio_data_muncher(
    path: str
):
    """Load the portfolio CSV at ``path`` and return its keys as a set.

    The data is fetched via ``CSVRetriever(path).retrieve_data(None)``; the
    keys of the returned mapping are presumably company tickers (verify
    against ``CSVRetriever``).
    """
    retriever = CSVRetriever(path)
    holdings = await retriever.retrieve_data(None)
    return {*holdings.keys()}
    
def parse_re_output_to_df(re_output: List[str]) -> pd.DataFrame:
    """Turn the matches returned by ``parse_raw_re`` into a one-column DataFrame.

    Args:
        re_output: Regex matches, one entry per detected ticker.

    Returns:
        A DataFrame with a single ``ticker`` column holding one row per entry
        of ``re_output``. The column is present even when ``re_output`` is
        empty, so callers can always access ``df.ticker``.
    """
    # Only the ticker is needed for the set-similarity metric; numeric values
    # and units are intentionally not parsed here.
    records = [{"ticker": ticker} for ticker in re_output]
    # Passing `columns` keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=["ticker"])


def gen_output_to_df(s_out: str) -> pd.DataFrame:
    """Parse generated report text into a DataFrame of detected tickers."""
    matches = parse_raw_re(s_out)
    return parse_re_output_to_df(matches)

def raw_retrieved_chunks_data_muncher(
    path: str
):
    """Return the set of ``company`` values found in a retrieved-chunks JSON file.

    The file at ``path`` is read with ``file_contents`` and parsed as JSON; it
    must decode to an iterable of objects that each have a ``company`` key.
    The path is printed before the file is read.
    """
    print(f"{path=}")
    records = json.loads(file_contents(path))

    companies = set()
    for record in records:
        companies.add(record['company'])
    return companies



def completion_model_output_portfolio_data_muncher(
    path: str
):
    """Return the set of tickers parsed from the generated report at ``path``."""
    report_text = file_contents(path)
    tickers = gen_output_to_df(report_text).ticker
    return set(tickers)

        
async def test_1_2_data_muncher(
    path_input: str,
    path_output: str
):
    """Build the ID-set pair for tests 1-2: retrieved chunks vs. generated report.

    Args:
        path_input: Path to the raw retrieved-chunks JSON file.
        path_output: Path to the generated report text file.

    Returns:
        An ``IDSetPairEvalDataset`` whose input set holds the companies in the
        retrieved chunks and whose output set holds the tickers in the report.
    """
    retrieved_companies = raw_retrieved_chunks_data_muncher(path_input)
    reported_tickers = completion_model_output_portfolio_data_muncher(path_output)
    return IDSetPairEvalDataset(
        input_set=retrieved_companies,
        output_set=reported_tickers,
    )

async def test_3_4_data_muncher(
    path_input: str,
    path_output: str
):
    """Build the ID-set pair for tests 3-4: client portfolio vs. generated report.

    Args:
        path_input: Path to the portfolio CSV file.
        path_output: Path to the generated report text file.

    Returns:
        An ``IDSetPairEvalDataset`` whose input set holds the portfolio keys
        and whose output set holds the tickers in the report.
    """
    portfolio_keys = await portfolio_data_muncher(path_input)
    reported_tickers = completion_model_output_portfolio_data_muncher(path_output)
    return IDSetPairEvalDataset(
        input_set=portfolio_keys,
        output_set=reported_tickers,
    )


## Evaluation and inspection using LM-SR library

In [8]:

print("Jaccard similarity, input portfolio vs. portfolio of LLM output.")
print("\n`name` is the test name, and value ranges from 0 (worst) to 1 (perfect).")

llm_eval_dataset = LocalFileSystemGenLLMEvalDataset.from_list(TEST_CASES)
data_munchers = [test_1_2_data_muncher] * 2 + [test_3_4_data_muncher] * 2
metrics = [jaccard_similarity] * 4

lfs_eval_configs = [
    LocalFileSystemIDSetPairEvalDatasetConfig(
        fn_path_muncher=data_muncher,
        metric=metric
    )
    for data_muncher, metric in zip(data_munchers, metrics)
]
    

await evaluate_llm_eval_dataset_local_filesystem(llm_eval_dataset, lfs_eval_configs)

Jaccard similarity, input portfolio vs. portfolio of LLM output.

`name` is the test name, and value ranges from 0 (worst) to 1 (perfect).
path='../../../../../examples/example_data/financial_report/artifacts/raw_retrieved_chunks_10k_net_income.json'
path='../../../../../examples/example_data/financial_report/artifacts/raw_retrieved_chunks_10k_covid.json'


Unnamed: 0,name,value
0,net_income_vs_retrieved,1.0
1,covid_vs_retrieved,1.0
2,net_income_e2e,1.0
3,covid_e2e,1.0


### Optional: peek into the contents manually for better visibility.

In [9]:
# Materialize the evaluation dataset (test name, input data, output data) as a DataFrame.
df = local_filesystem_dataset_to_df(llm_eval_dataset)

# Show up to 200 characters per cell so the long text fields stay readable.
pd.set_option("display.max_colwidth", 200)
display(df)

Unnamed: 0,name,data_input,data_output
0,net_income_vs_retrieved,"[\n {\n ""company"": ""AAPL"",\n ""details"": ""net (\""OI&E\"") for 2022, 2021 and 2020 was as follows (dollars in millions): | | | | | | | | | | | | | | | | | | | | | | | | | | | | | ---|---...","Report:\n\n1. AAPL's annual net income in 2022: $2,825.\n2. PEP's annual net income in 2022: $86,392.\n3. AMZN's annual net income in 2022: $1.3 billion.\n4. JNJ's annual net income in 2022: -$1,7..."
1,covid_vs_retrieved,"[\n {\n ""company"": ""NVDA"",\n ""details"": ""The COVID-19 pandemic has affected and could continue to have a material adverse impact on our financial condition and results of operations. ...","1. Apple (AAPL): The COVID-19 pandemic has had a significant impact on Apple's business, with restrictions on travel and business operations, temporary closures of businesses, and supply shortages..."
2,net_income_e2e,"Company,Shares,,,,,\nAAPL,20,,,,,\nMSFT,,,,,,\nAMZN,30,,,,,\nNVDA,100,,,,,\nTSLA,,,,,,\nGOOG,,,,,,\nBRK.B,,,,,,\nMETA,,,,,,\nUNH,30,,,,,\nXOM,,,,,,\nLLY,,,,,,\nJPM,,,,,,\nJNJ,100,,,,,\nV,,,,,,\nPG...","Report:\n\n1. AAPL's annual net income in 2022: $2,825.\n2. PEP's annual net income in 2022: $86,392.\n3. AMZN's annual net income in 2022: $1.3 billion.\n4. JNJ's annual net income in 2022: -$1,7..."
3,covid_e2e,"Company,Shares,,,,,\nAAPL,20,,,,,\nMSFT,,,,,,\nAMZN,30,,,,,\nNVDA,100,,,,,\nTSLA,,,,,,\nGOOG,,,,,,\nBRK.B,,,,,,\nMETA,,,,,,\nUNH,30,,,,,\nXOM,,,,,,\nLLY,,,,,,\nJPM,,,,,,\nJNJ,100,,,,,\nV,,,,,,\nPG...","1. Apple (AAPL): The COVID-19 pandemic has had a significant impact on Apple's business, with restrictions on travel and business operations, temporary closures of businesses, and supply shortages..."


# STOP HERE - FUTURE DEMO BELOW, WIP.

In [10]:
# NOTE(review): WIP cell. `completion_generator_input_data` and `df_report_parsed`
# are not defined anywhere in this notebook, so this raises NameError on a fresh kernel.
# Checks that the companies fed to the completion model match the tickers in the report.
input_companies = {input_record['company'] for input_record in completion_generator_input_data}
output_companies = set(df_report_parsed.ticker)

assert input_companies == output_companies

print("Pass")

NameError: name 'completion_generator_input_data' is not defined

## Check number faithfulness

In [None]:
# NOTE(review): WIP cell. `df_report_parsed` and `completion_generator_input_data`
# are not defined anywhere in this notebook, and the current report parser only
# produces a `ticker` column (no `value_raw`), so this cell cannot run as-is.
#
# Counts a "hallucination" for every reported value that does not appear
# verbatim in the retrieved text for the same company.
hallucinations = 0

for company, value_raw in df_report_parsed.set_index("ticker").value_raw.to_dict().items():
    # print(company, value_raw)
    # Take the first retrieved record for this company; raises IndexError if
    # the company has no record in the input data.
    input_data_for_company = [
        input_record["details"] for input_record in completion_generator_input_data
        if company == input_record["company"]
    ][0]

    # Remove the period that the LLM might have put on to end the sentence.
    if value_raw.rstrip(".") not in input_data_for_company:
        hallucinations += 1

    # print(value_raw)
    # print(input_data_for_company)

# Fraction of reported values not found in their source text, rounded to two
# decimals; raises ZeroDivisionError if `df_report_parsed` is empty.
hr = round(hallucinations / len(df_report_parsed), 2)
print(f"Hallucination rate: {100.0 * hr}%")

## E2e: Check final output's faithfulness to portfolio (structured table)

In [None]:
# NOTE(review): WIP cell. `df_report_parsed` and `portfolio` are not defined
# anywhere in this notebook, so this raises NameError on a fresh kernel.
# Checks that the tickers in the report are exactly the keys of the portfolio.
report_tickers = set(df_report_parsed.ticker)
assert report_tickers == portfolio.keys()

print("Pass")