## Set up the variables

In [100]:

import os

# Embedding model used to encode queries for retrieval (Hugging Face embedding
# function). It must be the same model that was used when the vectors were
# pushed to Milvus.
embedding_model_for_retrieval = 'sentence-transformers/all-MiniLM-L6-v2'

# Generator checkpoints. `llm_model` is loaded with the quantized framework
# (alternative: "aaditya/OpenBioLLM-Llama3-8B" or other methods).
# `llm_model_longrope` should ideally be the same model type for a fair comparison.
# NOTE(review): the unsloth checkpoint is assumed to support LongRoPE - confirm.
llm_model = "meta-llama/Llama-3.2-3B-Instruct"
llm_model_longrope = "unsloth/Llama-3.2-3B-Instruct"

# Zilliz/Milvus credentials. They are read from the environment so that real
# secrets never have to be saved in the notebook; the redacted placeholders are
# kept only as a fallback so the cell still runs when the variables are unset.
TOKEN = os.environ.get("ZILLIZ_TOKEN", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
CLUSTER_ENDPOINT = os.environ.get(
    "ZILLIZ_CLUSTER_ENDPOINT",
    "https://xxxxxxx.serverless.gcp-us-west1.cloud.zilliz.com",
)

# Milvus collection to query; remove the "_raptor" suffix for the version
# built without the RAPTOR method.
collection_name = "radiology_paper_raptor"

# Documents to search: "all" searches every document.
#book_titles = ["Imaging evaluation of the liver in oncology patients"] #or "all" for searching in all documents, or ["a", "b"] for more than 1 document
document = "all"

# Passed to initialize_llm_pipeline as the generation length limit.
max_new_tokens = 200

# change here
use_hyde = True
use_longrope = True
use_raptor = True

k = 10  # how many vectors to retrieve


# PROMPT_TEMPLATE = """SYSTEM: You are an advanced AI assistant strictly limited to providing concise, fact-based, and statistically accurate answers to questions.

# You must:
# 1. Rely exclusively on the provided context enclosed in `<context>` tags.
# 2. Never use prior knowledge, fabricate, infer, or offer opinions.
# 3. Ensure all statements are verifiable within the context.
# 4. Keep answers specific, concise, and under 50 words.
# 5. Prioritize clarity and quantifiable details (e.g., numbers, statistics).

# Failure to adhere will result in outputting: "I'm sorry I don't have enough information to answer this question."

# USER:
# <context>
# {context}
# </context>

# <question>
# {question}
# </question>

# ASSISTANT:"""


# Prompt sent to the generator LLM. It has two placeholders, {context} and
# {question}, which are declared as the input variables of `rag_prompt`.
# The model is told to finish with "[END OF ANSWER]" and to fall back to a fixed
# refusal sentence when the context is insufficient.
PROMPT_TEMPLATE = """SYSTEM: You are an advanced AI assistant strictly limited to providing concise, fact-based, and statistically accurate answers to questions.
End your generated answer with "[END OF ANSWER]".

You must:
1. Rely exclusively on the provided context enclosed in `<context>` tags.
2. Never use prior knowledge, fabricate, infer, or offer opinions.
3. Ensure all statements are verifiable within the context.
4. Keep answers specific, concise,  and at most 50 words.
5. Prioritize clarity and quantifiable details (e.g., numbers, statistics).

Failure to adhere or lack of confidence will result in outputting: "I'm sorry I don't have enough information to answer this question."

USER:
<context>
{context}
</context>

<question>
{question}
</question>

ASSISTANT:"""



## Set up LLM Models and Milvus Database

In [101]:

from langchain_core.prompts import PromptTemplate

# Wrap the raw template string; "context" and "question" are the two
# placeholders that get filled in when the prompt is rendered.
rag_prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])

#from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
import streamlit as st
import pandas as pd
from streamlit_float import *
import time
import os

from resource_initialization_dev import connect_to_milvus,load_embedding_model,load_llm_model
from resource_initialization_dev import initialize_llm_pipeline,initialize_vectorstore
from resource_initialization_dev import format_docs,hyde_chain_generation,vanilla_chain_generation
from resource_initialization_dev import initialize_chain,rag_and_synthesize


In [102]:
# Open the Milvus/Zilliz connection. The result gets its own name so that the
# imported `connect_to_milvus` function is not overwritten; otherwise re-running
# this cell would call the previous return value instead of the function.
milvus_connection = connect_to_milvus(CLUSTER_ENDPOINT, TOKEN)

# Embedding model used to encode queries for retrieval.
embed_model = load_embedding_model(embedding_model_for_retrieval)

# Generator LLM and tokenizer; `use_longrope` is passed in together with both
# candidate checkpoints.
tokenizer, model = load_llm_model(use_longrope, llm_model, llm_model_longrope)
llm = initialize_llm_pipeline(model, tokenizer, max_new_tokens)

# `collection_name` comes from the configuration cell; it is deliberately not
# re-assigned here so that changing the configuration actually takes effect.
vectorstore = initialize_vectorstore(collection_name, embed_model, CLUSTER_ENDPOINT, TOKEN)




In [103]:
# Build the Milvus filter expression used during retrieval.
#   * document == "all"  -> no title restriction
#   * otherwise          -> `document` is one title (str) or a list/tuple of titles
# When RAPTOR is disabled only rows with level == 0 are kept
# (presumably the original, non-summarised chunks - confirm against the schema).
if document == "all":
    expr = ""
    if not use_raptor:
        expr = "level == 0"
else:
    # Wrapping unconditionally in a list (`[document]`) broke the multi-title
    # case: the whole Python list was rendered as a single quoted string.
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"
    if not use_raptor:
        expr = expr + "&&  (level == 0)"


In [104]:
expr

''

In [105]:
# Build the RAG chain from the current settings: HyDE flag, LLM, vector store,
# document selection, embedding model, top-k, filter expression and prompt.
chain = initialize_chain(use_hyde, llm, vectorstore, document, embed_model, k,expr,rag_prompt )

## Set up Evaluation

In [106]:
from transformers import pipeline

from ragas.metrics import NonLLMContextPrecisionWithReference, NonLLMContextRecall, NoiseSensitivity, ResponseRelevancy, Faithfulness
from langchain.chains.question_answering import load_qa_chain
from ragas import evaluate, SingleTurnSample
from ragas.metrics._factual_correctness import FactualCorrectness

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_core.prompt_values import StringPromptValue

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pickle
import json
import random
import asyncio
import openai
import os



# Prefer the key already present in the environment so that a real secret never
# has to be pasted into the notebook; the redacted placeholder is only a fallback.
openai_api_key = os.environ.get("OPENAI_API_KEY", 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')

In [107]:
# Export the key for this process; presumably this is how clients created
# without an explicit key (e.g. OpenAIEmbeddings()) obtain it - confirm.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [108]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from ragas.llms import llm_factory

# Judge LLM for the LLM-based ragas metrics. It used to be built twice
# (a LangchainLLMWrapper around ChatOpenAI, immediately overwritten by this
# llm_factory instance); only the assignment that actually took effect is kept.
evaluator_llm = llm_factory("gpt-4o")
#evaluator_llm = llm_factory("gpt-4o-mini")

# Embedding model for the embedding-based ragas metrics.
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

## Evaluation (Lea)

In [109]:
from ragas import evaluate, SingleTurnSample
import asyncio
from ragas.metrics import NonLLMContextPrecisionWithReference, NonLLMContextRecall, NoiseSensitivity, ResponseRelevancy, Faithfulness, ContextEntityRecall

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import OpenAIEmbeddings
from ragas.llms import llm_factory
#os.environ['OPENAI_API_KEY'] = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
#evaluator_llm = llm_factory("gpt-4o")
# Embedding model assigned to `scorer.embeddings` for the embedding-based metrics.
# NOTE(review): the same object is already created in the evaluation setup cell;
# this re-creation looks redundant.
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [110]:

async def task1(sample):
    """Score one sample with NonLLMContextPrecisionWithReference.

    Args:
        sample: the sample handed to the metric's ``single_turn_ascore``.

    Returns:
        str: the score prefixed with ``"Precision: "``.
    """
    metric = NonLLMContextPrecisionWithReference()
    precision_score = await metric.single_turn_ascore(sample)
    return f"Precision: {precision_score!s}"

async def task2(sample):
    """Score one sample with NonLLMContextRecall.

    Args:
        sample: the sample handed to the metric's ``single_turn_ascore``.

    Returns:
        str: the score prefixed with ``"Recall: "``.
    """
    metric = NonLLMContextRecall()
    recall_score = await metric.single_turn_ascore(sample)
    return f"Recall: {recall_score!s}"

async def task3(sample):
    """Score one sample with ContextEntityRecall, judged by ``evaluator_llm``.

    Args:
        sample: the sample handed to the metric's ``single_turn_ascore``.

    Returns:
        str: the score prefixed with ``"Entity Recall: "``.
    """
    entity_metric = ContextEntityRecall()
    # The metric needs a judge LLM; it is attached after construction.
    entity_metric.llm = evaluator_llm
    entity_recall = await entity_metric.single_turn_ascore(sample)
    return f"Entity Recall: {entity_recall!s}"

async def rag_eval(sample):
    """Run the three retrieval metrics for one sample via ``asyncio.gather``.

    Args:
        sample: passed unchanged to ``task1``, ``task2`` and ``task3``.

    Returns:
        list[str]: the labelled score strings in the order
        ``[task1, task2, task3]``. The list is also printed, as before;
        returning it lets callers collect and aggregate the scores.
    """
    results = await asyncio.gather(task1(sample), task2(sample), task3(sample))
    print(results)
    return results

## Retrieve and Synthesize Final Result - using langchain framework on 10 sample data (Lea)

In [111]:
import re

In [112]:
from typing import List
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_milvus import Milvus
from transformers import pipeline
import json
import random
from ragas.metrics._string import NonLLMStringSimilarity
from ragas.metrics import BleuScore, RougeScore, StringPresence, FactualCorrectness, SemanticSimilarity, NonLLMStringSimilarity

In [22]:
# Load the raw evaluation dataset. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): the variable says 1114 but the file is paper_data_1119.json;
# the name is kept because other cells refer to it.
with open('./datasets/rag_eval_dataset/paper_data_1119.json', 'r', encoding='utf-8') as file:
    paper_data_1114 = json.load(file)
    


In [23]:
# import re

# # Copy data structure for manipulation
# paper_data_1114_filtered = paper_data_1114.copy()

# # Step 1: Filter queries, excluding those containing "context" or "document"
# filtered_queries_keys = {
#     key for key, value in paper_data_1114_filtered.get('queries', {}).items()
#     if "context" not in str(value).lower() and "document" not in str(value).lower()
# }

# filtered_queries = {
#     key: value for key, value in paper_data_1114_filtered.get('queries', {}).items()
#     if key in filtered_queries_keys
# }

# # Step 2: Filter corpus to exclude keys not referenced in relevant_docs and matching the invalid pattern
# valid_corpus_keys = {
#     key for key, value in paper_data_1114_filtered.get('corpus', {}).items()
#     if (value is None or not re.match(r'^\d+\.\t', str(value)))
# }

# # Update valid corpus keys to only include those mentioned in relevant_docs
# corpus_keys_in_relevant_docs = {
#     doc for docs in paper_data_1114_filtered.get('relevant_docs', {}).values() for doc in docs
# }

# final_corpus_keys = valid_corpus_keys & corpus_keys_in_relevant_docs

# filtered_corpus = {
#     key: value for key, value in paper_data_1114_filtered.get('corpus', {}).items()
#     if key in final_corpus_keys
# }

# # Step 3: Filter relevant_docs to exclude empty lists and documents not in valid corpus keys
# filtered_relevant_docs = {
#     key: [doc for doc in docs if doc in final_corpus_keys]
#     for key, docs in paper_data_1114_filtered.get('relevant_docs', {}).items()
#     if key in filtered_queries_keys and docs  # Exclude empty lists and irrelevant keys
# }

# # Remove entries with empty lists in relevant_docs
# filtered_relevant_docs = {
#     key: docs for key, docs in filtered_relevant_docs.items() if docs
# }

# # Step 4: Filter queries to only keep keys present in relevant_docs
# filtered_queries = {
#     key: value for key, value in filtered_queries.items()
#     if key in filtered_relevant_docs.keys()
# }

# # Combine the results into the final JSON format
# paper_data_1114_filtered = {
#     'queries': filtered_queries,
#     'corpus': filtered_corpus,
#     'relevant_docs': filtered_relevant_docs,
# }

# # Output the filtered JSON object
# paper_data_1114_filtered

In [24]:
# num_samples = 10
# random_keys = random.sample(list(paper_data_1114_filtered['queries'].keys()), num_samples)

In [25]:
# retrieval_eval_data = []
# for key in random_keys:
#     tmp = {
#         "query": paper_data_1114_filtered['queries'][key],
#         "retrieved_contexts": None,
#         "expected_contexts": [paper_data_1114_filtered['corpus'][paper_data_1114_filtered['relevant_docs'][key][0]]],
#         "llm_output": None
#     }
#     retrieval_eval_data.append(tmp)

In [27]:
#retrieval_eval_data

In [36]:
# with open('./eval_dataset_20241201_3.pkl', 'wb') as file:
#     pickle.dump(retrieval_eval_data, file)


In [28]:
# with open('./eval_dataset_20241127.pkl', 'rb') as file:
#     retrieval_eval_data = pickle.load(file)


# Load the saved evaluation samples. Each entry is used below as a dict with
# at least the keys 'query' and 'expected_contexts'; 'retrieved_contexts' and
# 'llm_output' are filled in by the retrieval loops.
# NOTE(security): pickle.load can execute arbitrary code - only load files
# that were produced by this project.
with open('./eval_dataset_20241201_3.pkl', 'rb') as file:
    retrieval_eval_data = pickle.load(file)

### Use Hyde + Longrope (Done)

In [29]:
# Ablation setting for this section: RAPTOR on, HyDE on, LongRoPE on.
use_raptor = True
use_hyde = True
use_longrope = True

In [30]:
# Build the Milvus filter expression used during retrieval.
#   * document == "all"  -> no title restriction
#   * otherwise          -> `document` is one title (str) or a list/tuple of titles
# When RAPTOR is disabled only rows with level == 0 are kept
# (presumably the original, non-summarised chunks - confirm against the schema).
if document == "all":
    expr = ""
    if not use_raptor:
        expr = "level == 0"
else:
    # Wrapping unconditionally in a list (`[document]`) broke the multi-title
    # case: the whole Python list was rendered as a single quoted string.
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"
    if not use_raptor:
        expr = expr + "&&  (level == 0)"

In [31]:
# Rebuild the chain so that it uses this section's HyDE flag and filter expression.
chain = initialize_chain(use_hyde, llm, vectorstore, document, embed_model, k,expr,rag_prompt )

In [32]:
# Run retrieval + generation for every evaluation sample and store the results
# on the sample dict itself (in place).
for data in retrieval_eval_data:
    query = data['query']
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split the retrieval text on blank lines, flatten each non-empty piece to a
    # single line and keep at most k pieces.
    # NOTE(review): judging by the printed output, this text may also contain
    # header/source lines (and the HyDE hypothetical answer) - confirm that
    # only document chunks end up in 'retrieved_contexts'.
    raw_chunks = retrieved_docs.split("\n\n")
    cleaned_chunks = [chunk.replace("\n", "").strip() for chunk in raw_chunks if chunk.strip()]
    data['retrieved_contexts'] = cleaned_chunks[:k]

    # Keep only the answer text that precedes the "Text Sources:" trailer.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Using longrope
Using HyDE
Retrieved docs:

 **Hypothetical Answer Generated by HyDE:**

 OATP1B3 overexpression in HCCs can lead to β-catenin activation, promoting cell proliferation and metastasis.  

 ------ 

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.7909  

 -- 



Hyperintense HCCs show β-catenin and hepatocyte nuclear factor 4α activation, while hyperintense HCCs with OATP1B3 overexpression show β-catenin and hepatocyte nuclear factor 4α activation. Molecular mechanisms of HCCs are related to β-catenin and hepatocyte nuclear factor 4α activation. HCCs with β-catenin gene mutatio

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Using HyDE
Retrieved docs:

 **Hypothetical Answer Generated by HyDE:**

 Understanding hyperintensity in hepatic mass lesions during HBP is crucial for radiologists to accurately interpret images and diagnose conditions accurately, as it relates to the pathogenesis of the lesions.  

 ------ 

 **Retrieved Documents:**

 Relative Signal Intensity and Differential Diagnosis of Hyperintense Liver Masses during the HBPThe majority of FNHs, FNH-like lesions, and NRHs are iso- to hyperintense during the HBP because they are composed of nonneoplastic hyperplastic hepatocytes. Therefore, marked hy-perintensity can be seen during the HBP.


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 19.0 --Score: 0.7806  

 -- 



■■Hepatic mass lesions can show hyperintensity partially or en-tirely during the HBP owing to the following mechanisms:  (a) uptake by hyperplastic hepatocytes, (b) uptake by tumor cells, (c) retention in extracellular space, (d) p

#### RAG eval (Done)

In [33]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.7999999984']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.3333333329629629']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.6666666644444444']
['Precision: 0.62499999996875', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8666666660888889']
['Precision: 0.49999999995', 'Recall: 1.0', 'Entity Recall: 0.7999999992']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.749999999375']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.49999999916666665']
['Precision: 0.249999999975', 'Recall: 1.0', 'Entity Recall: 0.23076923059171597']


#### RAG + LLM eval (Done)

In [34]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)

ResponseRelevancy:  0.9653559977380896 Faithfulness:  0.3333333333333333
ResponseRelevancy:  0.9851707143392737 Faithfulness:  0.75
ResponseRelevancy:  0.9260403045495581 Faithfulness:  0.0
ResponseRelevancy:  0.9787860709379889 Faithfulness:  1.0
ResponseRelevancy:  0.9474099279479975 Faithfulness:  0.25
ResponseRelevancy:  0.925459468824791 Faithfulness:  1.0
ResponseRelevancy:  0.9496073196925758 Faithfulness:  0.7142857142857143
ResponseRelevancy:  0.9352757891911848 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9427123365993065 Faithfulness:  0.3333333333333333
ResponseRelevancy:  0.9661688534528009 Faithfulness:  1.0


#### LLM eval (Done)

In [38]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
    # scorer = FactualCorrectness()
    # scorer.llm = evaluator_llm
    # correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
#     scorer = NonLLMStringSimilarity()
#     nonllm_similarity = await scorer.single_turn_ascore(sample)

    print("SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9477281017651895
SemanticSimilarity:  0.9572975061475162
SemanticSimilarity:  0.8820600810311117
SemanticSimilarity:  0.9543533914519257
SemanticSimilarity:  0.9270657951301581
SemanticSimilarity:  0.87986258712949
SemanticSimilarity:  0.9681190567521221
SemanticSimilarity:  0.9741101833312816
SemanticSimilarity:  0.9243911017507495
SemanticSimilarity:  0.921891199337912


In [39]:
import copy

# Snapshot this run's results: the retrieval loops write 'retrieved_contexts'
# and 'llm_output' into the same `retrieval_eval_data` dicts in place, so a
# later run would otherwise overwrite them.
retrieval_eval_data_all = copy.deepcopy(retrieval_eval_data)

### Use Longrope Only

In [29]:
# Ablation setting for this section: HyDE off, LongRoPE on, RAPTOR on.
# NOTE(review): the model itself is loaded once via load_llm_model(use_longrope, ...);
# no visible cell reloads it when this flag changes - confirm that passing the
# flag to rag_and_synthesize is enough to switch LongRoPE.
use_hyde = False
use_longrope = True
use_raptor = True

In [30]:
# Build the Milvus filter expression used during retrieval.
#   * document == "all"  -> no title restriction
#   * otherwise          -> `document` is one title (str) or a list/tuple of titles
# When RAPTOR is disabled only rows with level == 0 are kept
# (presumably the original, non-summarised chunks - confirm against the schema).
if document == "all":
    expr = ""
    if not use_raptor:
        expr = "level == 0"
else:
    # Wrapping unconditionally in a list (`[document]`) broke the multi-title
    # case: the whole Python list was rendered as a single quoted string.
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"
    if not use_raptor:
        expr = expr + "&&  (level == 0)"

In [31]:
# Rebuild the chain so that it uses this section's HyDE flag and filter expression.
chain = initialize_chain(use_hyde, llm, vectorstore, document, embed_model, k,expr,rag_prompt )

In [32]:
# Run retrieval + generation for every evaluation sample and store the results
# on the sample dict itself (in place).
for data in retrieval_eval_data:
    query = data['query']
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split the retrieval text on blank lines, flatten each non-empty piece to a
    # single line and keep at most k pieces.
    # NOTE(review): judging by the printed output, this text may also contain
    # header/source lines - confirm that only document chunks end up in
    # 'retrieved_contexts'.
    raw_chunks = retrieved_docs.split("\n\n")
    cleaned_chunks = [chunk.replace("\n", "").strip() for chunk in raw_chunks if chunk.strip()]
    data['retrieved_contexts'] = cleaned_chunks[:k]

    # Keep only the answer text that precedes the "Text Sources:" trailer.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Using longrope
Without using HyDE
Retrieved docs:

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8197  

 -- 



Hyperintense HCCs show β-catenin and hepatocyte nuclear factor 4α activation, while hyperintense HCCs with OATP1B3 overexpression show β-catenin and hepatocyte nuclear factor 4α activation. Molecular mechanisms of HCCs are related to β-catenin and hepatocyte nuclear factor 4α activation. HCCs with β-catenin gene mutation show higher OATP1B3 expression and are associated with accelerated bile production and a favorable prognosis.

        Key Concepts:
        β-catenin and hepa

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Without using HyDE
Retrieved docs:

 **Retrieved Documents:**

 In some institutions, liver volumetry is per-formed preoperatively, with schematics of the resection line and segmental or sectorial volumes inserted in the report (42). Radiologists working in centers where liver resections are performed are encouraged to become familiar with and include in their report descriptions of procedures aiming to increase the function and size of the future liver remnant such as portal vein embolization, portal vein ligation, and associated liver partition and portal vein ligation for staged hepatectomy.


 Source: 
 How to Use LI-RADS to Report Liver CT and MRI Observations --Page: 12.0 --Score: 0.7636  

 -- 



Patients Considered for Hepatic ResectionIn patients considered for tumor resection, ad-ditional information should be included. De-scription of the biliary and vascular anatomy is critical. A description of the tumor location rela-tive to major hepatic veins and biliary structures, di

#### RAG eval (Done)

In [33]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.49999999937499995']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.6666666644444444']
['Precision: 0.7499999999625', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8666666660888889']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999989999999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.666666666111111']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.19999999998', 'Recall: 1.0', 'Entity Recall: 0.15384615372781063']


#### RAG + LLM eval (Done)

In [34]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.9750546248619049 Faithfulness:  0.75
ResponseRelevancy:  0.9871813924878667 Faithfulness:  0.875
ResponseRelevancy:  0.9300038841455365 Faithfulness:  0.0
ResponseRelevancy:  0.969492414476238 Faithfulness:  1.0
ResponseRelevancy:  0.9063852011389345 Faithfulness:  0.14285714285714285
ResponseRelevancy:  0.9225314961333297 Faithfulness:  1.0
ResponseRelevancy:  0.9403466645738018 Faithfulness:  0.7
ResponseRelevancy:  0.9429481382441919 Faithfulness:  0.5555555555555556
ResponseRelevancy:  0.9318801748260447 Faithfulness:  0.4
ResponseRelevancy:  0.9306112486627912 Faithfulness:  1.0


#### LLM eval

In [35]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9517369295318759
SemanticSimilarity:  0.9583243617522927
SemanticSimilarity:  0.8821152551098643
SemanticSimilarity:  0.9625026270656718
SemanticSimilarity:  0.9440746291805224
SemanticSimilarity:  0.87836395722458
SemanticSimilarity:  0.9660360580511516
SemanticSimilarity:  0.9759574346951506
SemanticSimilarity:  0.916816368872817
SemanticSimilarity:  0.9064705348694023


In [36]:
import copy

In [37]:
# Snapshot the LongRoPE-only results before a later run overwrites the
# in-place fields of `retrieval_eval_data`.
retrieval_eval_data_longrope = copy.deepcopy(retrieval_eval_data)

### Use Hyde Only (Done)

In [38]:
# Ablation setting for this section: HyDE on, LongRoPE off, RAPTOR on.
# NOTE(review): the model itself is loaded once via load_llm_model(use_longrope, ...);
# no visible cell reloads it after use_longrope is set to False - confirm that
# passing the flag to rag_and_synthesize really disables LongRoPE.
use_hyde = True
use_longrope = False
use_raptor = True

In [39]:
# Build the Milvus filter expression used during retrieval.
#   * document == "all"  -> no title restriction
#   * otherwise          -> `document` is one title (str) or a list/tuple of titles
# When RAPTOR is disabled only rows with level == 0 are kept
# (presumably the original, non-summarised chunks - confirm against the schema).
if document == "all":
    expr = ""
    if not use_raptor:
        expr = "level == 0"
else:
    # Wrapping unconditionally in a list (`[document]`) broke the multi-title
    # case: the whole Python list was rendered as a single quoted string.
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"
    if not use_raptor:
        expr = expr + "&&  (level == 0)"

In [40]:
# Rebuild the chain so that it uses this section's HyDE flag and filter expression.
chain = initialize_chain(use_hyde, llm, vectorstore, document, embed_model, k,expr,rag_prompt )

In [42]:
# Run retrieval + generation for every evaluation sample and store the results
# on the sample dict itself (in place).
for data in retrieval_eval_data:
    query = data['query']
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split the retrieval text on blank lines, flatten each non-empty piece to a
    # single line and keep at most k pieces.
    # NOTE(review): judging by the printed output, this text may also contain
    # header/source lines (and the HyDE hypothetical answer) - confirm that
    # only document chunks end up in 'retrieved_contexts'.
    raw_chunks = retrieved_docs.split("\n\n")
    cleaned_chunks = [chunk.replace("\n", "").strip() for chunk in raw_chunks if chunk.strip()]
    data['retrieved_contexts'] = cleaned_chunks[:k]

    # Keep only the answer text that precedes the "Text Sources:" trailer.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Without using longrope
Using HyDE
Retrieved docs:

 **Hypothetical Answer Generated by HyDE:**

 OATP1B3 overexpression in HCCs can disrupt β-catenin signaling, leading to activation of HNF4α transcription factors.  

 ------ 

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8122  

 -- 



Hyperintense HCCs show β-catenin and hepatocyte nuclear factor 4α activation, while hyperintense HCCs with OATP1B3 overexpression show β-catenin and hepatocyte nuclear factor 4α activation. Molecular mechanisms of HCCs are related to β-catenin and hepatocyte nuclear factor 4α activation. HCCs with β-cate

#### RAG eval (Done)

In [43]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.3333333329629629']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.8333333332916666', 'Recall: 1.0', 'Entity Recall: 0.6666666644444444']
['Precision: 0.699999999965', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.5999999996']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8999999991']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.749999999375']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.6249999992187499']
['Precision: 0.3333333333', 'Recall: 1.0', 'Entity Recall: 0.46153846118343195']


#### RAG + LLM eval (Done)

In [44]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.9315857391863487 Faithfulness:  0.8571428571428571
ResponseRelevancy:  0.9445800702635007 Faithfulness:  1.0
ResponseRelevancy:  0.9350315301967721 Faithfulness:  0.0
ResponseRelevancy:  0.957502537893777 Faithfulness:  0.6428571428571429
ResponseRelevancy:  0.8929702542143482 Faithfulness:  0.5833333333333334
ResponseRelevancy:  0.9225284659124661 Faithfulness:  1.0
ResponseRelevancy:  0.9034525482764363 Faithfulness:  1.0
ResponseRelevancy:  0.9518540085356967 Faithfulness:  0.625
ResponseRelevancy:  0.9318801748260447 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9643301216127886 Faithfulness:  1.0


#### LLM eval (Done)

In [46]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9612499005663939
SemanticSimilarity:  0.9491351923133902
SemanticSimilarity:  0.8813053833478841
SemanticSimilarity:  0.9658051329206573
SemanticSimilarity:  0.9391404962124527
SemanticSimilarity:  0.87986258712949
SemanticSimilarity:  0.9646053483346595
SemanticSimilarity:  0.9762805241850678
SemanticSimilarity:  0.9214500832720183
SemanticSimilarity:  0.9032630202084617


In [47]:
# Snapshot the HyDE-only results before a later run overwrites the
# in-place fields of `retrieval_eval_data`.
retrieval_eval_data_hyde = copy.deepcopy(retrieval_eval_data)

### Use None (Done)

In [48]:
# Ablation setting for this section: HyDE off, LongRoPE off, RAPTOR on.
# NOTE(review): the model itself is loaded once via load_llm_model(use_longrope, ...);
# no visible cell reloads it after use_longrope is set to False - confirm that
# passing the flag to rag_and_synthesize really disables LongRoPE.
use_hyde = False
use_longrope = False
use_raptor = True

In [49]:
# Build the Milvus filter expression used during retrieval.
#   * document == "all"  -> no title restriction
#   * otherwise          -> `document` is one title (str) or a list/tuple of titles
# When RAPTOR is disabled only rows with level == 0 are kept
# (presumably the original, non-summarised chunks - confirm against the schema).
if document == "all":
    expr = ""
    if not use_raptor:
        expr = "level == 0"
else:
    # Wrapping unconditionally in a list (`[document]`) broke the multi-title
    # case: the whole Python list was rendered as a single quoted string.
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"
    if not use_raptor:
        expr = expr + "&&  (level == 0)"

In [50]:
# Rebuild the chain so that it uses this section's HyDE flag and filter expression.
chain = initialize_chain(use_hyde, llm, vectorstore, document, embed_model, k,expr,rag_prompt )

In [51]:
# Run retrieval + generation for every evaluation sample and store the results
# on the sample dict itself (in place).
for data in retrieval_eval_data:
    query = data['query']
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split the retrieval text on blank lines, flatten each non-empty piece to a
    # single line and keep at most k pieces.
    # NOTE(review): judging by the printed output, this text may also contain
    # header/source lines - confirm that only document chunks end up in
    # 'retrieved_contexts'.
    raw_chunks = retrieved_docs.split("\n\n")
    cleaned_chunks = [chunk.replace("\n", "").strip() for chunk in raw_chunks if chunk.strip()]
    data['retrieved_contexts'] = cleaned_chunks[:k]

    # Keep only the answer text that precedes the "Text Sources:" trailer.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Without using longrope
Without using HyDE
Retrieved docs:

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8197  

 -- 



Hyperintense HCCs show β-catenin and hepatocyte nuclear factor 4α activation, while hyperintense HCCs with OATP1B3 overexpression show β-catenin and hepatocyte nuclear factor 4α activation. Molecular mechanisms of HCCs are related to β-catenin and hepatocyte nuclear factor 4α activation. HCCs with β-catenin gene mutation show higher OATP1B3 expression and are associated with accelerated bile production and a favorable prognosis.

        Key Concepts:
        β-catenin 

#### RAG eval (Done)

In [52]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.4444444439506172']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.6666666644444444']
['Precision: 0.7499999999625', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.874999999453125']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999989999999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.666666666111111']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.19999999998', 'Recall: 1.0', 'Entity Recall: 0.24999999979166665']


#### RAG + LLM eval (Done)

In [53]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.9315857391863487 Faithfulness:  0.42857142857142855
ResponseRelevancy:  0.9871813924878667 Faithfulness:  1.0
ResponseRelevancy:  0.930098414893779 Faithfulness:  0.16666666666666666
ResponseRelevancy:  0.9338622307286055 Faithfulness:  0.7142857142857143
ResponseRelevancy:  0.9662971887883387 Faithfulness:  0.05555555555555555
ResponseRelevancy:  0.925459468824791 Faithfulness:  1.0
ResponseRelevancy:  0.9094692815888274 Faithfulness:  0.8
ResponseRelevancy:  0.9031024456473776 Faithfulness:  0.5454545454545454
ResponseRelevancy:  0.9427213857029985 Faithfulness:  0.4
ResponseRelevancy:  0.8762940386217243 Faithfulness:  1.0


#### LLM eval (Done)

In [54]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9615459086025728
SemanticSimilarity:  0.9562688823389577
SemanticSimilarity:  0.8849459680547205
SemanticSimilarity:  0.9452601003667951
SemanticSimilarity:  0.9310555588807383
SemanticSimilarity:  0.87986258712949
SemanticSimilarity:  0.9646185824251792
SemanticSimilarity:  0.9797219699919693
SemanticSimilarity:  0.9270844961300303
SemanticSimilarity:  0.8973369964883897


In [64]:
# Freeze a deep copy of the current records before the next configuration overwrites
# 'retrieved_contexts' and 'llm_output' in place.
# NOTE(review): this cell's execution count is out of sequence with its neighbours, so it
# may have been run after a later configuration had already rewritten the records —
# TODO confirm the snapshot really holds the use_hyde=False / use_longrope=False /
# use_raptor=True results (re-run top to bottom on a fresh kernel).
retrieval_eval_data_none = copy.deepcopy(retrieval_eval_data)

## Without Raptor

### Use Hyde + Longrope (Done)

In [99]:
# Display the evaluation records as they currently stand, i.e. with whatever
# 'retrieved_contexts' / 'llm_output' the most recently executed retrieval loop wrote.
retrieval_eval_data

[{'query': 'How does OATP1B3 overexpression in HCCs affect the molecular mechanisms involving β-catenin and hepatocyte nuclear factor 4α activation?',
  'retrieved_contexts': ['Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).',
   '68.\t Fukusato T, Soejima Y, Kondo F, et al. Preserved or en-hanced OATP1B3 expression in hepatocellular adenoma subtypes with nuclear accumulation of β-catenin. Hepatol Res 2015;45(10):E32–E42.',
   'Kitao et al (26) reported that HCC with β-catenin gene mutation showed higher OATP1B3 expression, a pseudoglandular pattern, bile production, and hyperintensity during the HBP. In addition, investigators in other studies have reported that HCCs with β-catenin gene mutation, as compared with HCCs without 

In [55]:
# Ablation config: HyDE and longrope both on; use_raptor off, so the filter
# cell below restricts retrieval to entries with level == 0.
use_hyde = True
use_longrope = True
use_raptor = False

In [56]:
# Build the filter expression that is handed to initialize_chain() as `expr`.
#   * document == "all"  -> no title restriction
#   * otherwise          -> restrict to the given title; a list/tuple of titles is
#                           accepted too (a list used to be wrapped in a second list
#                           and rendered as one malformed quoted string)
#   * use_raptor off     -> additionally keep only entries with level == 0
if document == "all":
    expr = ""
else:
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"

if not use_raptor:
    # The conjunction text is kept exactly as before so single-title runs are unchanged.
    expr = "level == 0" if expr == "" else expr + "&&  (level == 0)"

In [57]:
# Rebuild the chain so it picks up the flags and filter expression set above.
chain = initialize_chain(
    use_hyde,
    llm,
    vectorstore,
    document,
    embed_model,
    k,
    expr,
    rag_prompt,
)

In [58]:
# Run every evaluation query through the chain and store the outputs on the record itself.
for data in retrieval_eval_data:
    query = data['query']
    # retrieved_docs_final is returned as well but is not used in this cell.
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split on blank lines, drop empty pieces, flatten inner newlines, cap at k entries.
    pieces = retrieved_docs.split("\n\n")
    data['retrieved_contexts'] = [p.replace("\n", "").strip() for p in pieces if p.strip()][:k]
    #data['retrieved_contexts'] = [doc.strip() for doc in retrieved_docs.split("\n\n") if doc.strip()][:k]

    # Keep only the text that precedes the "\n\nText Sources:" marker.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Using longrope
Using HyDE
Retrieved docs:

 **Hypothetical Answer Generated by HyDE:**

 OATP1B3 overexpression in HCCs leads to β-catenin activation, promoting cell proliferation and tumor growth.  

 ------ 

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8054  

 -- 



OATP expression progressively declines in most HCCs during hepatocarcinogenesis, resulting in HBP hypointensity.


 Source: 
 Abbreviated MRI for Hepatocellular Carcinoma Screening and Surveillance --Page: 2.0 --Score: 0.7072  

 -- 



Kitao et al (26) reported that HCC with β-catenin gene mutation showed higher OATP1B3

#### RAG eval (Done)

In [59]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.7999999984']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.3333333329629629']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.3333333322222222']
['Precision: 0.21111111110055555', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8571428565306122']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999989999999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.749999999375']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.37499999953124996']
['Precision: 0.49999999995', 'Recall: 1.0', 'Entity Recall: 0.5454545449586776']


#### RAG + LLM eval (Done)

In [60]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)

ResponseRelevancy:  0.9499713930131813 Faithfulness:  0.2857142857142857
ResponseRelevancy:  0.9851707143392737 Faithfulness:  0.8333333333333334
ResponseRelevancy:  0.9268117111448673 Faithfulness:  0.0
ResponseRelevancy:  0.9361742007617715 Faithfulness:  1.0
ResponseRelevancy:  0.9416952731958196 Faithfulness:  0.16666666666666666
ResponseRelevancy:  0.9317587192910266 Faithfulness:  1.0
ResponseRelevancy:  0.9034525482764363 Faithfulness:  1.0
ResponseRelevancy:  0.9359886863506742 Faithfulness:  0.5714285714285714
ResponseRelevancy:  0.9567958981790053 Faithfulness:  0.7142857142857143
ResponseRelevancy:  0.960357494371233 Faithfulness:  1.0


#### LLM eval (Done)

In [61]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9559866546307562
SemanticSimilarity:  0.9281453019960607
SemanticSimilarity:  0.8858798877608256
SemanticSimilarity:  0.9765774227636448
SemanticSimilarity:  0.9343166846331491
SemanticSimilarity:  0.8797053814922426
SemanticSimilarity:  0.9646053483346595
SemanticSimilarity:  0.9598887708593977
SemanticSimilarity:  0.9472256247208672
SemanticSimilarity:  0.9079769895208178


In [62]:
# NOTE(review): copy.deepcopy is already called in earlier cells, so this import is
# redundant on a top-to-bottom run; it belongs in the notebook's import cell.
import copy

# Freeze a deep copy of the current records before the next configuration overwrites
# 'retrieved_contexts' and 'llm_output' in place.
retrieval_eval_data_all_noraptor = copy.deepcopy(retrieval_eval_data)

### Use Longrope Only

In [63]:
# Ablation config: longrope only; HyDE off, and use_raptor off so the filter
# cell below restricts retrieval to entries with level == 0.
use_longrope = True
use_hyde = False
use_raptor = False

In [64]:
# Build the filter expression that is handed to initialize_chain() as `expr`.
#   * document == "all"  -> no title restriction
#   * otherwise          -> restrict to the given title; a list/tuple of titles is
#                           accepted too (a list used to be wrapped in a second list
#                           and rendered as one malformed quoted string)
#   * use_raptor off     -> additionally keep only entries with level == 0
if document == "all":
    expr = ""
else:
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"

if not use_raptor:
    # The conjunction text is kept exactly as before so single-title runs are unchanged.
    expr = "level == 0" if expr == "" else expr + "&&  (level == 0)"

In [65]:
# Rebuild the chain so it picks up the flags and filter expression set above.
chain = initialize_chain(
    use_hyde,
    llm,
    vectorstore,
    document,
    embed_model,
    k,
    expr,
    rag_prompt,
)

In [66]:
# Run every evaluation query through the chain and store the outputs on the record itself.
for data in retrieval_eval_data:
    query = data['query']
    # retrieved_docs_final is returned as well but is not used in this cell.
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split on blank lines, drop empty pieces, flatten inner newlines, cap at k entries.
    pieces = retrieved_docs.split("\n\n")
    data['retrieved_contexts'] = [p.replace("\n", "").strip() for p in pieces if p.strip()][:k]
    #data['retrieved_contexts'] = [doc.strip() for doc in retrieved_docs.split("\n\n") if doc.strip()][:k]

    # Keep only the text that precedes the "\n\nText Sources:" marker.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Using longrope
Without using HyDE
Retrieved docs:

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8197  

 -- 



Kitao et al (26) reported that HCC with β-catenin gene mutation showed higher OATP1B3 expression, a pseudoglandular pattern, bile production, and hyperintensity during the HBP. In addition, investigators in other studies have reported that HCCs with β-catenin gene mutation, as compared with HCCs without this mutation, are associated with accelerated bile production (79), higher OATP1B3 expression (80), and a favorable prognosis (81). The investigators in these studies also repo

#### RAG eval (Done)

In [67]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.7999999984']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.4444444439506172']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.3333333322222222']
['Precision: 0.99999999995', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.874999999453125']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999989999999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.749999999375']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.6249999992187499']
['Precision: 0.249999999975', 'Recall: 1.0', 'Entity Recall: 0.2727272724793388']


#### RAG + LLM eval (Done)

In [68]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.9255633310292799 Faithfulness:  0.8571428571428571
ResponseRelevancy:  0.9851707143392737 Faithfulness:  0.2
ResponseRelevancy:  0.9300038841455365 Faithfulness:  0.0
ResponseRelevancy:  0.9787860709379889 Faithfulness:  1.0
ResponseRelevancy:  0.9500706569028147 Faithfulness:  0.375
ResponseRelevancy:  0.9317587192910266 Faithfulness:  1.0
ResponseRelevancy:  0.9467828562409824 Faithfulness:  0.5714285714285714
ResponseRelevancy:  0.9178091343371232 Faithfulness:  0.75
ResponseRelevancy:  0.9427213857029985 Faithfulness:  0.5
ResponseRelevancy:  0.9306112486627912 Faithfulness:  0.6666666666666666


#### LLM eval

In [69]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.961748074922542
SemanticSimilarity:  0.9426339739429342
SemanticSimilarity:  0.8899416072155053
SemanticSimilarity:  0.9330135007571916
SemanticSimilarity:  0.9368841908619674
SemanticSimilarity:  0.8797053814922426
SemanticSimilarity:  0.9641310291977092
SemanticSimilarity:  0.9641841726348803
SemanticSimilarity:  0.932176648094051
SemanticSimilarity:  0.8990852625156833


In [70]:
# Freeze a deep copy of the current records before the next configuration overwrites
# 'retrieved_contexts' and 'llm_output' in place.
retrieval_eval_data_longrope_noraptor = copy.deepcopy(retrieval_eval_data)

### Use Hyde Only (Done)

In [71]:
# Ablation config: HyDE only; longrope off, and use_raptor off so the filter
# cell below restricts retrieval to entries with level == 0.
use_raptor = False
use_longrope = False
use_hyde = True

In [72]:
# Build the filter expression that is handed to initialize_chain() as `expr`.
#   * document == "all"  -> no title restriction
#   * otherwise          -> restrict to the given title; a list/tuple of titles is
#                           accepted too (a list used to be wrapped in a second list
#                           and rendered as one malformed quoted string)
#   * use_raptor off     -> additionally keep only entries with level == 0
if document == "all":
    expr = ""
else:
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"

if not use_raptor:
    # The conjunction text is kept exactly as before so single-title runs are unchanged.
    expr = "level == 0" if expr == "" else expr + "&&  (level == 0)"

In [73]:
# Rebuild the chain so it picks up the flags and filter expression set above.
chain = initialize_chain(
    use_hyde,
    llm,
    vectorstore,
    document,
    embed_model,
    k,
    expr,
    rag_prompt,
)

In [74]:
# Run every evaluation query through the chain and store the outputs on the record itself.
for data in retrieval_eval_data:
    query = data['query']
    # retrieved_docs_final is returned as well but is not used in this cell.
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split on blank lines, drop empty pieces, flatten inner newlines, cap at k entries.
    pieces = retrieved_docs.split("\n\n")
    data['retrieved_contexts'] = [p.replace("\n", "").strip() for p in pieces if p.strip()][:k]
    #data['retrieved_contexts'] = [doc.strip() for doc in retrieved_docs.split("\n\n") if doc.strip()][:k]

    # Keep only the text that precedes the "\n\nText Sources:" marker.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Without using longrope
Using HyDE
Retrieved docs:

 **Hypothetical Answer Generated by HyDE:**

 OATP1B3 overexpression in HCCs disrupts β-catenin signaling by inhibiting its degradation, leading to activation of β-catenin. Activation of β-catenin promotes the expression of target genes, including Wnt/β-catenin pathway components. Hepatocyte nuclear factor 4α (HNF4α) activation is also disrupted due to OATP1B3 overexpression, which affects the expression of genes involved in glucose and lipid metabolism, ultimately impacting the function of HNF4α. This results in a complex interplay between β-catenin and HNF4α activation, which is disrupted by OATP1B3 overexpression.  

 ------ 

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other 

#### RAG eval (Done)

In [75]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999980000001']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.2727272724793388']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.3333333322222222']
['Precision: 0.37499999998125', 'Recall: 1.0', 'Entity Recall: 0.4999999975']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.6666666662222221']
['Precision: 0.3333333333', 'Recall: 1.0', 'Entity Recall: 0.7999999992']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.666666666111111']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.37499999953124996']
['Precision: 0.3333333333', 'Recall: 1.0', 'Entity Recall: 0.4999999995833333']


#### RAG + LLM eval (Done)

In [76]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.8906415317865349 Faithfulness:  0.7
ResponseRelevancy:  0.9882131136474549 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9300038841455365 Faithfulness:  0.0
ResponseRelevancy:  0.9768877892012112 Faithfulness:  1.0
ResponseRelevancy:  0.9474099279479975 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9317587192910266 Faithfulness:  0.3157894736842105
ResponseRelevancy:  0.9244278181150084 Faithfulness:  1.0
ResponseRelevancy:  0.9317853174175528 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9427213857029985 Faithfulness:  1.0
ResponseRelevancy:  0.960746211178833 Faithfulness:  1.0


#### LLM eval (Done)

In [77]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)

SemanticSimilarity:  0.9586264869508738
SemanticSimilarity:  0.9389168898097704
SemanticSimilarity:  0.8848479878126014
SemanticSimilarity:  0.9633051309823875
SemanticSimilarity:  0.9373711285407165
SemanticSimilarity:  0.851385249803073
SemanticSimilarity:  0.9651430896461646
SemanticSimilarity:  0.9537409442086164
SemanticSimilarity:  0.9222818844800357
SemanticSimilarity:  0.9079434276057015


In [78]:
# Freeze a deep copy of the current records before the next configuration overwrites
# 'retrieved_contexts' and 'llm_output' in place.
retrieval_eval_data_hyde_noraptor = copy.deepcopy(retrieval_eval_data)

### Use None (Done)

In [91]:
# Ablation config: everything off; with use_raptor off the filter cell below
# restricts retrieval to entries with level == 0.
use_raptor = False
use_longrope = False
use_hyde = False

In [92]:
# Build the filter expression that is handed to initialize_chain() as `expr`.
#   * document == "all"  -> no title restriction
#   * otherwise          -> restrict to the given title; a list/tuple of titles is
#                           accepted too (a list used to be wrapped in a second list
#                           and rendered as one malformed quoted string)
#   * use_raptor off     -> additionally keep only entries with level == 0
if document == "all":
    expr = ""
else:
    titles = [document] if isinstance(document, str) else list(document)
    expr = "book_title in [" + ", ".join(f"'{title}'" for title in titles) + "]"

if not use_raptor:
    # The conjunction text is kept exactly as before so single-title runs are unchanged.
    expr = "level == 0" if expr == "" else expr + "&&  (level == 0)"

In [93]:
# Rebuild the chain so it picks up the flags and filter expression set above.
chain = initialize_chain(
    use_hyde,
    llm,
    vectorstore,
    document,
    embed_model,
    k,
    expr,
    rag_prompt,
)

In [94]:
# Run every evaluation query through the chain and store the outputs on the record itself.
for data in retrieval_eval_data:
    query = data['query']
    # retrieved_docs_final is returned as well but is not used in this cell.
    retrieved_docs, retrieved_docs_final, final_result = rag_and_synthesize(
        query, chain, use_longrope, use_hyde
    )

    # Split on blank lines, drop empty pieces, flatten inner newlines, cap at k entries.
    pieces = retrieved_docs.split("\n\n")
    data['retrieved_contexts'] = [p.replace("\n", "").strip() for p in pieces if p.strip()][:k]
    #data['retrieved_contexts'] = [doc.strip() for doc in retrieved_docs.split("\n\n") if doc.strip()][:k]

    # Keep only the text that precedes the "\n\nText Sources:" marker.
    data['llm_output'] = final_result.split("\n\nText Sources:")[0]

Without using longrope
Without using HyDE
Retrieved docs:

 **Retrieved Documents:**

 Additional molecular-genetic analyses of HCCs have revealed that HCC with hyperintensity dur-ing the HBP (ie, OATP1B3-overexpressed HCC) shows β-catenin and hepatocyte nuclear factor 4α activation (25,26). A similar molecular mechanism of OATP1B3 expression can be expected in other hepatocellular nodules (4).


 Source: 
 Hyperintense Liver Masses at Hepatobiliary Phase Gadoxetic Acidenhanced MRI --Page: 3.0 --Score: 0.8197  

 -- 



Kitao et al (26) reported that HCC with β-catenin gene mutation showed higher OATP1B3 expression, a pseudoglandular pattern, bile production, and hyperintensity during the HBP. In addition, investigators in other studies have reported that HCCs with β-catenin gene mutation, as compared with HCCs without this mutation, are associated with accelerated bile production (79), higher OATP1B3 expression (80), and a favorable prognosis (81). The investigators in these studies a

#### RAG eval (Done)

In [95]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        retrieved_contexts=result['retrieved_contexts'], 
        reference_contexts=result['expected_contexts'],
        reference=result['expected_contexts'][0],
    )
    await rag_eval(sample)

['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.7999999984']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.12499999984374999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.0']
['Precision: 0.5833333333041666', 'Recall: 1.0', 'Entity Recall: 0.3333333322222222']
['Precision: 0.99999999995', 'Recall: 1.0', 'Entity Recall: 0.999999995']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.7999999994666667']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.9999999989999999']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8333333326388889']
['Precision: 0.9999999999', 'Recall: 1.0', 'Entity Recall: 0.8333333319444445']
['Precision: 0.249999999975', 'Recall: 1.0', 'Entity Recall: 0.24999999979166665']


#### RAG + LLM eval (Done)

In [96]:
for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    scorer = ResponseRelevancy()
    scorer.llm=evaluator_llm
    scorer.embeddings = evaluator_embeddings
    response_rel = await scorer.single_turn_ascore(sample)
    
    scorer = Faithfulness()
    scorer.llm=evaluator_llm
    faith_ful = await scorer.single_turn_ascore(sample)
    
    # scorer = NoiseSensitivity()
    # scorer.llm=evaluator_llm
    # noise = await scorer.single_turn_ascore(sample)

    print("ResponseRelevancy: ", response_rel, "Faithfulness: ", faith_ful)


ResponseRelevancy:  0.9315857391863487 Faithfulness:  0.8333333333333334
ResponseRelevancy:  0.9882131136474549 Faithfulness:  0.8333333333333334
ResponseRelevancy:  0.9287615515781823 Faithfulness:  0.0
ResponseRelevancy:  0.9787860709379889 Faithfulness:  1.0
ResponseRelevancy:  0.9474099279479975 Faithfulness:  0.2
ResponseRelevancy:  0.9317587192910266 Faithfulness:  1.0
ResponseRelevancy:  0.9402170389655241 Faithfulness:  1.0
ResponseRelevancy:  0.9379970680222208 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9427213857029985 Faithfulness:  0.6666666666666666
ResponseRelevancy:  0.9661688534528009 Faithfulness:  0.7


#### LLM eval (Done)

In [97]:

for result in retrieval_eval_data:
    sample = SingleTurnSample(
        user_input=result['query'],
        response=result['llm_output'],
        reference=result['expected_contexts'][0],
        retrieved_contexts=result['retrieved_contexts']
    )
    
#     scorer = FactualCorrectness()
#     scorer.llm = evaluator_llm
#     correct = await scorer.single_turn_ascore(sample)
    
    scorer = SemanticSimilarity()
    scorer.embeddings = evaluator_embeddings
    semantic = await scorer.single_turn_ascore(sample)
    
    # scorer = NonLLMStringSimilarity()
    # nonllm_similarity = await scorer.single_turn_ascore(sample)

#     print("FactualCorrectness: ", correct, "SemanticSimilarity: ", semantic, "Similarity: ", nonllm_similarity)
    
    print( "SemanticSimilarity: ", semantic)


SemanticSimilarity:  0.9643160027360951
SemanticSimilarity:  0.9324992683486706
SemanticSimilarity:  0.8827486574501135
SemanticSimilarity:  0.9655632730184613
SemanticSimilarity:  0.930988173679459
SemanticSimilarity:  0.8797053814922426
SemanticSimilarity:  0.9588984662223315
SemanticSimilarity:  0.9697202881736477
SemanticSimilarity:  0.910699221463944
SemanticSimilarity:  0.9152990215887623


In [98]:
# Freeze a deep copy of the records produced by the last configuration so they stay
# available after retrieval_eval_data is modified again.
retrieval_eval_data_none_noraptor = copy.deepcopy(retrieval_eval_data)

In [None]:
# retrieval_eval_data

# file_path = "eval_dataset_20241127.pkl"
# with open(file_path, "wb") as file:
#     pickle.dump(retrieval_eval_data, file)
