### Test different LLM settings

In [None]:
import os
import json
from tqdm import tqdm
import pandas as pd
from typing import List, Tuple
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer, util
from ingest import ingest_corpus, verify_db_existence
from rag import RAG 
from dotenv import load_dotenv  
load_dotenv()

In [None]:
# Load the evaluation questions from a JSON Lines file (one JSON object per line).
# Blank or whitespace-only lines (e.g. a stray empty line at the end of the file)
# are skipped, because json.loads raises JSONDecodeError on an empty string.
with open("./queries.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f if line.strip()]
print(len(queries), "queries loaded")

In [None]:
# Helpers for scoring RAG question answering via sentence-embedding cosine similarity.
model = SentenceTransformer("all-MiniLM-L6-v2")


def question_answer_similarity(question: str, answer: str) -> float:
    """Return the cosine similarity between a question and an answer.

    Both texts are embedded with the module-level ``model`` and compared
    with ``util.cos_sim``.
    """
    question_vec = model.encode(question)
    answer_vec = model.encode(answer)
    # .item() extracts the single similarity value from the cos_sim result.
    return util.cos_sim(question_vec, answer_vec).item()

def citation_answer_max_and_min_similarity(citations: List[Document], answer: str) -> Tuple[float, float]:
    """Return the highest and lowest cosine similarity between an answer and its citations.

    Args:
        citations: Retrieved documents; each one's ``page_content`` is embedded
            with the module-level ``model``.
        answer: Generated answer text compared against every citation.

    Returns:
        ``(max_similarity, min_similarity)``. When ``citations`` is empty there
        is nothing to compare, so ``(nan, nan)`` is returned instead of failing
        on a max/min over an empty score matrix.
    """
    if not citations:
        # NaN marks "no data"; NaN-aware aggregations (such as pandas' default
        # mean) skip it, so a query with no retrieved documents neither aborts
        # the run nor skews the averages.
        return float("nan"), float("nan")
    answer_embeddings = model.encode(answer)
    citation_embeddings = model.encode([citation.page_content for citation in citations])
    scores = util.cos_sim(answer_embeddings, citation_embeddings)
    return scores.max().item(), scores.min().item()

In [None]:
# Configurations to evaluate; each entry is [chunk_size, top_k, top_k_rerank].
# Only a handful of configurations are examined here due to limited time.
configurations = [
    [500, 20, 5],
    [500, 50, 5],
    [500, 50, 10],
    [1000, 20, 5],
]

# Column-oriented accumulator for the test run: one independent, initially
# empty list per result column, in the order the columns should appear.
result_df = {
    column: []
    for column in (
        "chunk_size",
        "top_k",
        "top_k_rerank",
        "query",
        "answer_without_citation",
        "qa_similarity",
        "citation_answer_max_similarity",
        "citation_answer_min_similarity",
    )
}
# Run every configuration against all queries and record the per-query scores.
for chunk_size, top_k, top_k_rerank in configurations:
    # NOTE(review): every configuration ingests into the same "./test_chroma_db"
    # path; confirm ingest_corpus replaces existing chunks rather than adding to
    # them, otherwise chunks from different chunk sizes could end up mixed.
    ingest_corpus(chroma_db_path="./test_chroma_db", chunk_size=chunk_size, print_progress=False)
    print(f"Testing configuration: chunk_size={chunk_size}, top_k={top_k}, top_k_rerank={top_k_rerank}")
    rag = RAG(
        chroma_db_path="./test_chroma_db",
        top_k=top_k,
        top_k_rerank=top_k_rerank,
        print_progress=False,
    )
    for query_obj in tqdm(queries):
        question = query_obj["question"]
        retrieved_docs = rag.retrieve(question)
        # Generate without citations so that only the LLM output is scored.
        answer = rag.answer(question, with_citation=False)
        # Score the answer against the question and against the retrieved documents.
        qa_similarity = question_answer_similarity(question, answer)
        max_citation_sim, min_citation_sim = citation_answer_max_and_min_similarity(retrieved_docs, answer)
        # Collect this query's values under their column names, then append
        # each value to the matching column list of the accumulator.
        row = {
            "chunk_size": chunk_size,
            "top_k": top_k,
            "top_k_rerank": top_k_rerank,
            "query": question,
            "answer_without_citation": answer,
            "qa_similarity": qa_similarity,
            "citation_answer_max_similarity": max_citation_sim,
            "citation_answer_min_similarity": min_citation_sim,
        }
        for column, value in row.items():
            result_df[column].append(value)

In [12]:
# Turn the column-oriented results into a DataFrame and preview its first rows.
result_df = pd.DataFrame(data=result_df)
result_df.head(n=5)

Unnamed: 0,chunk_size,top_k,top_k_rerank,query,answer_without_citation,qa_similarity,citation_answer_max_similarity,citation_answer_min_similarity
0,500,20,5,What is known about APOE and Alzheimer’s disease?,The provided citations discuss the role of APO...,0.865631,0.870943,0.627306
1,500,20,5,How do TREM2 variants influence the risk of Al...,The provided citations highlight the role of T...,0.871481,0.856757,0.626495
2,500,20,5,What evidence links ABCA7 to Alzheimer’s patho...,The provided citations discuss the role of the...,0.818911,0.840042,0.513437
3,500,20,5,Are there genetic loci associated with early-o...,A genome-wide association study (GWAS) has ide...,0.829213,0.83659,0.668779
4,500,20,5,Which biological pathways are most commonly im...,Recent genomic studies have identified several...,0.63245,0.729587,0.494551


In [13]:
# Summarise the run: average each similarity metric per configuration.
group_keys = ["chunk_size", "top_k", "top_k_rerank"]
metric_columns = ["qa_similarity", "citation_answer_max_similarity", "citation_answer_min_similarity"]
result_mean_df = result_df.groupby(group_keys, as_index=False)[metric_columns].mean()
result_mean_df.head()

Unnamed: 0,chunk_size,top_k,top_k_rerank,qa_similarity,citation_answer_max_similarity,citation_answer_min_similarity
0,500,20,5,0.822992,0.805708,0.582623
1,500,50,5,0.790218,0.83432,0.517048
2,500,50,10,0.778366,0.812251,0.520168
3,1000,20,5,0.777831,0.783236,0.539775


In [14]:
# Persist the per-query results and the per-configuration means as CSV files,
# leaving the DataFrame index out of both.
result_df.to_csv(path_or_buf="./test_results.csv", index=False)
result_mean_df.to_csv(path_or_buf="./test_results_mean.csv", index=False)