Licensed under the MIT License.

Copyright (c) 2025-2035. All rights reserved by Hanhan Wu.

Permission is hereby granted to view this code for evaluation purposes only.
You may not reuse, copy, modify, merge, publish, distribute, sublicense,
or exploit this code without Hanhan Wu's EXPLICIT written permission.


# Auto Evaluation with Confidence Score

* Evaluate RAG retrieval and answer generaiton quality
* LLM-as-Judge, using OpenAI, Gemini and Mishal models
* Each evaluated record has a confidence score

In [12]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
import pandas as pd
import yaml
import nest_asyncio
nest_asyncio.apply()

from utils import *

import warnings
warnings.filterwarnings('ignore')


with open('prompt_versions.yaml', 'r') as file:
    prompt_versions = yaml.safe_load(file)


llm_model_str1 = 'gpt-5-nano'
llm_model_str2 = 'gemini-2.5-flash-lite'
llm_model_str3_lst = ['voxtral-small-2507', 'magistral-small-2507', 'open-mistral-nemo', 'mistral-small-2506']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data Input

In [2]:
fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")['baseline']

rag_lst = []
for record in fiqa_eval:
    rag_lst.append({
        'query': record['question'],
        'retrieved_content': record['contexts'],
        'referenced_answer': record['ground_truths'][0],
        'answer': record['answer'].strip()
    })

rag_df = pd.DataFrame(rag_lst)
print(rag_df.shape)
rag_df.head()

(30, 4)


Unnamed: 0,query,retrieved_content,referenced_answer,answer
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a..."
2,1 EIN doing business under multiple business n...,[You're confusing a lot of things here. Compan...,You're confusing a lot of things here. Company...,"Yes, it is possible to have one EIN doing busi..."
3,Applying for and receiving business credit,[Set up a meeting with the bank that handles y...,"""I'm afraid the great myth of limited liabilit...",Applying for and receiving business credit can...
4,401k Transfer After Business Closure,[The time horizon for your 401K/IRA is essenti...,You should probably consult an attorney. Howev...,If your employer has closed and you need to tr...


### Evaluate Retrieval Quality

In [4]:
rr_df1 = asyncio.run(get_retrieval_relevancy_output_async(rag_df, llm_model_str1,
                                                          prompt_versions['rr_prompt_template'],
                                                          model='openai'))
rr_df2 = asyncio.run(get_retrieval_relevancy_output_async(rag_df, llm_model_str2,
                                                          prompt_versions['rr_prompt_template'],
                                                          model='vertexai'))
for llm_model_str3 in llm_model_str3_lst:
    try:
        rr_df3 = asyncio.run(get_retrieval_relevancy_output_async(rag_df, llm_model_str3,
                                                        prompt_versions['rr_prompt_template'], 
                                                        model='mistral'))
        break
    except:
        continue

print(rr_df1.shape, rr_df2.shape, rr_df3.shape)
display(rr_df1.head(n=2))
display(rr_df2.head(n=2))
display(rr_df3.head(n=2))

(30, 6) (30, 6) (30, 6)


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The retrieved content directly addresses depos...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The retrieved content directly addresses sendi...


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The retrieved content directly addresses the u...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The retrieved content directly answers the use...


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The RETRIEVED CONTENT directly addresses the U...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The RETRIEVED CONTENT directly addresses the U...


In [18]:
print(rr_df1['rr_auto_score'].value_counts(normalize=True))   
print(rr_df2['rr_auto_score'].value_counts(normalize=True))
print(rr_df3['rr_auto_score'].value_counts(normalize=True))


rr_df = rr_df1[['query', 'rr_auto_score']]\
        .merge(rr_df2[['query', 'rr_auto_score']], on='query', suffixes=('_gpt5', '_gemini'))\
        .merge(rr_df3[['query', 'rr_auto_score']], on='query')
rr_df = rr_df.rename(columns={'rr_auto_score': 'rr_auto_score_mistral'})
rr_df['rr_confidence_score'] = rr_df.apply(calculate_confidence, axis=1)

display(rr_df.head())
print(rr_df['rr_confidence_score'].value_counts(normalize=True))

rr_auto_score
3    0.8
2    0.2
Name: proportion, dtype: float64
rr_auto_score
3    0.633333
2    0.333333
1    0.033333
Name: proportion, dtype: float64
rr_auto_score
3    0.533333
2    0.466667
Name: proportion, dtype: float64


Unnamed: 0,query,rr_auto_score_gpt5,rr_auto_score_gemini,rr_auto_score_mistral,rr_confidence_score
0,How to deposit a cheque issued to an associate...,3,3,3,1.0
1,Can I send a money order from USPS as a business?,3,3,3,1.0
2,1 EIN doing business under multiple business n...,3,2,2,0.666667
3,Applying for and receiving business credit,3,3,3,1.0
4,401k Transfer After Business Closure,3,3,2,0.666667


rr_confidence_score
1.000000    0.566667
0.666667    0.400000
0.333333    0.033333
Name: proportion, dtype: float64


### Evaluate Answer Quality

In [None]:
au_df1 = asyncio.run(get_answer_usefulness_output_async(rr_df1, llm_model_str1,
                                                        prompt_versions['au_prompt_template'],
                                                        model='openai'))
au_df2 = asyncio.run(get_answer_usefulness_output_async(rr_df2, llm_model_str2,
                                                        prompt_versions['au_prompt_template'],
                                                        model='vertexai'))
for llm_model_str3 in llm_model_str3_lst:
    try:
        au_df3 = asyncio.run(get_answer_usefulness_output_async(rr_df3, llm_model_str3,
                                                        prompt_versions['au_prompt_template'],
                                                        model='mistral'))
        break
    except:
        continue

print(au_df1.shape, au_df2.shape, au_df3.shape)
display(au_df1.head(n=2))
display(au_df2.head(n=2))
display(au_df3.head(n=2))

(30, 8) (30, 8) (30, 8)


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning,answer_usefulness_score,au_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The retrieved content directly addresses depos...,0.2,The answer is not helpful or accurate. It inco...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The retrieved content directly addresses sendi...,0.8,The AI answer correctly confirms that USPS mon...


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning,answer_usefulness_score,au_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The retrieved content directly addresses the u...,0.6,The AI's answer is helpful and relevant in tha...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The retrieved content directly answers the use...,1.0,"The AI's answer is highly helpful, relevant, a..."


Unnamed: 0,query,retrieved_content,referenced_answer,answer,rr_auto_score,rr_reasoning,answer_usefulness_score,au_reasoning
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,Have the check reissued to the proper payee.Ju...,The best way to deposit a cheque issued to an ...,3,The RETRIEVED CONTENT directly addresses the U...,0.6,AI's ANSWER is good and addresses the USER QUE...
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,Sure you can. You can fill in whatever you wa...,"Yes, you can send a money order from USPS as a...",3,The RETRIEVED CONTENT directly addresses the U...,0.8,AI's ANSWER is excellent and addresses the USE...


In [23]:
print(au_df1['answer_usefulness_score'].value_counts(normalize=True))   
print(au_df2['answer_usefulness_score'].value_counts(normalize=True))
print(au_df3['answer_usefulness_score'].value_counts(normalize=True))


au_df = au_df1[['query', 'answer_usefulness_score']]\
        .merge(au_df2[['query', 'answer_usefulness_score']], on='query', suffixes=('_gpt5', '_gemini'))\
        .merge(au_df3[['query', 'answer_usefulness_score']], on='query')
au_df = au_df.rename(columns={'answer_usefulness_score': 'answer_usefulness_score_mistral'})
au_df['au_confidence_score'] = rr_df.apply(calculate_confidence, axis=1)

display(au_df.head())
print(au_df['au_confidence_score'].value_counts(normalize=True))

answer_usefulness_score
0.60    0.433333
0.40    0.300000
0.20    0.133333
0.80    0.100000
0.25    0.033333
Name: proportion, dtype: float64
answer_usefulness_score
0.6    0.333333
0.4    0.233333
0.2    0.200000
0.8    0.166667
1.0    0.066667
Name: proportion, dtype: float64
answer_usefulness_score
0.8    0.500000
0.6    0.466667
0.4    0.033333
Name: proportion, dtype: float64


Unnamed: 0,query,answer_usefulness_score_gpt5,answer_usefulness_score_gemini,answer_usefulness_score_mistral,au_confidence_score
0,How to deposit a cheque issued to an associate...,0.2,0.6,0.6,1.0
1,Can I send a money order from USPS as a business?,0.8,1.0,0.8,1.0
2,1 EIN doing business under multiple business n...,0.4,0.2,0.4,0.666667
3,Applying for and receiving business credit,0.6,0.8,0.8,1.0
4,401k Transfer After Business Closure,0.4,0.4,0.6,0.666667


au_confidence_score
1.000000    0.566667
0.666667    0.400000
0.333333    0.033333
Name: proportion, dtype: float64
