In [1]:
import os
import sys
import pandas as pd


# Make the in-repo `semantic_retrieval` package importable from this notebook.
# The checkout location can be overridden with the SEMANTIC_RETRIEVAL_SRC
# environment variable; when it is unset the original hard-coded path is used.
_DEFAULT_SEMANTIC_RETRIEVAL_SRC = '/Users/jonathan/Projects/semantic-retrieval/python/src'
semantic_retrieval_src = os.environ.get("SEMANTIC_RETRIEVAL_SRC", _DEFAULT_SEMANTIC_RETRIEVAL_SRC)

# Put the source directory first, dropping any earlier copy so that re-running
# this cell does not keep growing sys.path with duplicate entries.
sys.path = [semantic_retrieval_src] + [
    entry for entry in sys.path if entry != semantic_retrieval_src
]

In [2]:
from semantic_retrieval.evaluation.metrics import accuracy_metric, SampleEvalDataset

# Smoke test: build a tiny dataset (two of the three outputs match the ground
# truth) and run it through accuracy_metric to check that the import works.
# The metric's return value is not shown, because the cell ends with print()
# rather than with the metric expression.
sample_dataset = SampleEvalDataset(output=[1, 2, 3], ground_truth=[1, 2, 4])
accuracy_metric(sample_dataset)

print("Hello, world")

Hello, world


# Hallucinations: compare the generative LLM's output to its input data

In [3]:
from semantic_retrieval.examples.financial_report.evaluate_report import file_contents

# Read the generated net-income report (plain text) and echo it so the lines
# that get parsed further down are visible in the notebook.
report_path = "../../../../../examples/example_data/financial_report/portfolio_10k_net_income_report.txt"
report = file_contents(report_path)
print(report)

Report:

1. AAPL's annual net income in 2022: $2,825.
2. PEP's annual net income in 2022: $86,392.
3. AMZN's annual net income in 2022: $1.3 billion.
4. JNJ's annual net income in 2022: -$1,796
5. UNH's annual net income in 2022: $20,639.
6. NVDA's annual net income in 2022: $4,368.
7. MRK's annual net income in 2022: $16,444.


In [7]:
# Simple regexp parsing from LLM's structured output

from semantic_retrieval.examples.financial_report.evaluate_report import gen_output_to_df
    
# Parse the report text into a DataFrame; the saved output below shows the
# columns ticker, value_raw and value_millions, one row per report line.
# NOTE(review): the report lists JNJ as "-$1,796" but the parsed row shows
# 1796 in both value columns, so the minus sign looks dropped. Confirm in
# gen_output_to_df (may also be an artifact of how the table was rendered).
df_report_parsed = gen_output_to_df(report)
df_report_parsed

Unnamed: 0,ticker,value_raw,value_millions
0,AAPL,2825.0,2825
1,PEP,86392.0,86392
2,AMZN,1.3,1300
3,JNJ,1796.0,1796
4,UNH,20639.0,20639
5,NVDA,4368.0,4368
6,MRK,16444.0,16444


In [10]:
import json

# Load the raw retrieved chunks that were given to the generation model.
# The first record (printed below) is a dict with 'company' and 'details' keys.
retrieved_chunks_path = "../../../../../examples/example_data/financial_report/artifacts/raw_retrieved_chunks_10k.json"
retrieved_chunks_text = file_contents(retrieved_chunks_path)
gen_model_input_data = json.loads(retrieved_chunks_text)

first_input_record = gen_model_input_data[0]
print(first_input_record)

{'company': 'AAPL', 'details': 'net ("OI&E") for 2022, 2021 and 2020 was as follows (dollars in millions): | | | | | | | | | | | | | | | | | | | | | | | | | | | | | ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- | 2022| | Change| | 2021| | Change| | 2020 Interest and dividend income | $| 2,825 | | | | | $| 2,843 | | | | | $| 3,763 | Interest expense | (2,931)| | | | | (2,645)| | | | | (2,873)| Other income/(expense), net| (228)| | | | | 60 | | | | | (87)|'}


## Check ticker agreement

In [None]:
# The companies present in the model's input data must be exactly the tickers
# that were parsed out of the generated report.
input_companies = {input_record['company'] for input_record in gen_model_input_data}
output_companies = set(df_report_parsed.ticker)

# On failure, say which tickers differ instead of raising a bare AssertionError.
assert input_companies == output_companies, (
    f"Ticker mismatch: in input data but not in report={input_companies - output_companies}, "
    f"in report but not in input data={output_companies - input_companies}"
)

print("Pass")

## Check number faithfulness

In [25]:
# Count report values that do not appear verbatim in the input text supplied
# for the same company, then print the share of such values as a percentage.
# This is a plain substring check, so a value counts as supported whenever its
# text occurs anywhere in that company's input details.

# Group the input text by company once, instead of re-scanning every input
# record for each ticker. All records for a company are kept, so a value is
# accepted if it occurs in any of that company's chunks.
details_by_company = {}
for input_record in gen_model_input_data:
    details_by_company.setdefault(input_record["company"], []).append(input_record["details"])

hallucinations = 0

# Iterate over the rows themselves so the count covers exactly the rows that
# len(df_report_parsed) counts in the denominator (a ticker-keyed dict would
# collapse repeated tickers).
for company, value_raw in zip(df_report_parsed.ticker, df_report_parsed.value_raw):
    # A ticker with no input record has no supporting text at all; it is
    # counted as a hallucination rather than raising IndexError.
    details_for_company = details_by_company.get(company, [])

    # Remove the period that the LLM might have put on to end the sentence.
    value_text = value_raw.rstrip(".")

    if not any(value_text in details for details in details_for_company):
        hallucinations += 1

# Guard the division so an empty report yields 0.0 instead of ZeroDivisionError.
total_values = len(df_report_parsed)
hr = round(hallucinations / total_values, 2) if total_values else 0.0

# Format with one decimal place: multiplying the rounded fraction by 100.0 can
# otherwise print float noise (0.57 * 100.0 displays as 56.99999999999999).
print(f"Hallucination rate: {100.0 * hr:.1f}%")

Hallucination rate: 0.0%


## End-to-end: check the final output's faithfulness to the portfolio (structured table)

In [37]:
from semantic_retrieval.retrieval.csv_retriever import CSVRetriever

portfolio = await CSVRetriever("../../../../../examples/example_data/financial_report/portfolios/client_a_portfolio.csv")\
    .retrieve_data(None)

portfolio

{'AAPL': 20.0,
 'AMZN': 30.0,
 'NVDA': 100.0,
 'UNH': 30.0,
 'JNJ': 100.0,
 'MRK': 40.0,
 'PEP': 200.0}

In [38]:
# The tickers parsed from the report must be exactly the tickers held in the
# portfolio (a set compares equal to a dict's key view with the same members).
report_tickers = set(df_report_parsed.ticker)
assert report_tickers == portfolio.keys()

print("Pass")

Pass
