In [1]:
import pandas as pd
from openai import OpenAI
import os
import random
import json
from dotenv import load_dotenv
from typing import List, Dict

from tqdm.auto import tqdm

In [2]:
load_dotenv('/home/ubuntu/medical_assistant_rag/.envrc')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

## Ingestion

In [3]:
df = pd.read_csv('./data/data_metadata_small.csv')
df.head()

Unnamed: 0,id,question,answer,medical_department,condition_type,patient_demographics,common_symptoms,treatment_or_management,severity
0,0,A 23-year-old pregnant woman at 22 weeks gesta...,Nitrofurantoin,Obstetrics & Gynecology,Infectious,"Age Group: Adult, Gender: Female, Pregnancy St...","Burning sensation (e.g., urination)",Medication,Mild
1,1,A 3-month-old baby died suddenly at night whil...,Placing the infant in a supine position on a f...,Pediatrics,Idiopathic,"Age Group: Infant (1-12 months), Gender: Male,...","Fever, Altered Mental Status","Preventive Measures (e.g., vaccinations)",Life-threatening
2,2,A mother brings her 3-week-old infant to the p...,Abnormal migration of ventral pancreatic bud,Pediatrics,Infectious,"Age Group: Neonate (0-28 days), Gender: Male, ...","Fussiness, Nausea/Vomiting",Observation/Monitoring,Moderate
3,3,A pulmonary autopsy specimen from a 58-year-ol...,Thromboembolism,Pulmonology,Acute,"Age Group: Adult, Gender: Female, Pregnancy St...","Dyspnea (Shortness of breath), Fatigue","Supportive Care (e.g., oxygen therapy)",Life-threatening
4,4,A 20-year-old woman presents with menorrhagia ...,Von Willebrand disease,Obstetrics & Gynecology,Chronic,"Age Group: Adult, Gender: Female, Pregnancy St...","Bleeding (e.g., menorrhagia), Easy bruising",Medication,Moderate


In [4]:
len(df)

99

In [5]:
documents = df.to_dict(orient='records')
documents[0]

{'id': 0,
 'question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?',
 'answer': 'Nitrofurantoin',
 'medical_department': 'Obstetrics & Gynecology',
 'condition_type': 'Infectious',
 'patient_demographics': 'Age Group: Adult, Gender: Female, Pregnancy Status: Pregnant',
 'common_symptoms': 'Burning sensation (e.g., urination)',
 'treatment_or_management': 'Medication',
 'severity': 'Mild'}

## Elastic Search

In [6]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [7]:
es_url = 'http://localhost:9200'
es_client = Elasticsearch(es_url)

Indexing

In [8]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "medical_department": {"type": "keyword"},
            "condition_type": {"type": "keyword"},
            "patient_demographics": {"type": "text"},
            "common_symptoms": {"type": "text"},
            "treatment_or_management": {"type": "text"},
            "severity": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

In [9]:
index_name = "medical-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'medical-questions'})

In [10]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')



In [11]:
for doc in tqdm(documents):
    question = doc.get('question', 'No question provided')
    answer = doc.get('answer', 'No answer provided')
    qa_combined = question + ' ' + answer

    doc['question'] = question
    doc['answer'] = answer
    doc['question_vector'] = model.encode(question).tolist()
    doc['answer_vector'] = model.encode(answer).tolist()
    doc['question_answer_vector'] = model.encode(qa_combined).tolist()

    # Use the document's 'id' field as the Elasticsearch document ID
    es_client.index(index=index_name, id=doc['id'], document=doc)

  0%|          | 0/99 [00:00<?, ?it/s]

Retrieval

In [12]:
from langchain.embeddings import SentenceTransformerEmbeddings
from typing import Dict

In [13]:
query = "A 30-year-old woman in her second trimester of pregnancy presents with symptoms of dysuria and urinary urgency. She has no significant medical history and is not allergic to any medications. Physical examination and vital signs are within normal limits. Which antibiotic is considered safe and effective for treating her urinary tract infection during pregnancy?"

Embeddings:

In [14]:
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


In [15]:
def hybrid_query_rrf(search_query: str) -> List[Dict]:
    vector = embeddings.embed_query(search_query)
    k = 60

    knn_query = {
        "field": "question_answer_vector",
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": search_query,
                    "fields": [
                        "question",
                        "answer",
                        "common_symptoms",
                        "condition_type",
                        "medical_department",
                        "patient_demographics"
                    ],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    source_fields = [
        "id", "question", "answer", "medical_department", "condition_type",
        "patient_demographics", "common_symptoms", "treatment_or_management", "severity"
    ]

    knn_response = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10,
            "_source": source_fields
        }
    )
    knn_results = knn_response['hits']['hits']

    keyword_response = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10,
            "_source": source_fields
        }
    )
    keyword_results = keyword_response['hits']['hits']

    def compute_rrf(rank, k=60):
        return 1.0 / (k + rank)

    rrf_scores = {}
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = next((hit for hit in (knn_results + keyword_results) if hit['_id'] == doc_id), None)
        if doc:
            source = doc['_source']
            final_results.append(source)

    return final_results

def question_text_hybrid(q):
    question = q['question']
    return hybrid_query_rrf(question)

def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank, rel in enumerate(line):
            if rel:
                total_score += 1 / (rank + 1)
                break  # Only consider the first relevant result
    return total_score / len(relevance_total)

def evaluate(documents, search_function):
    relevance_total = []
    for q in tqdm(documents):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results if 'id' in d]
        relevance_total.append(relevance)
    hit_rate_value = hit_rate(relevance_total)
    mrr_value = mrr(relevance_total)
    return {
        'hit_rate': hit_rate_value,
        'mrr': mrr_value,
    }

In [20]:
evaluate(documents, question_text_hybrid)

  0%|          | 0/99 [00:00<?, ?it/s]

{'hit_rate': 1.0, 'mrr': 1.0}

## RAG

In [21]:
client = OpenAI()

In [32]:
prompt_template = """
You are a knowledgeable medical assistant. Answer the QUESTION based solely on the information provided in the CONTEXT from the medical database.

Use only the facts from the CONTEXT when formulating your answer.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
Medical Department: {medical_department}
Condition Type: {condition_type}
Patient Demographics: {patient_demographics}
Common Symptoms: {common_symptoms}
Treatment or Management: {treatment_or_management}
Severity: {severity}
""".strip()

def build_prompt(query, search_results):
    context = ""
    for doc in search_results:
        context += entry_template.format(
            medical_department=doc.get('medical_department', 'N/A'),
            condition_type=doc.get('condition_type', 'N/A'),
            patient_demographics=doc.get('patient_demographics', 'N/A'),
            common_symptoms=doc.get('common_symptoms', 'N/A'),
            treatment_or_management=doc.get('treatment_or_management', 'N/A'),
            severity=doc.get('severity', 'N/A')
        ) + "\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def rag(query, model='gpt-4o-mini'):
    search_results = hybrid_query_rrf(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [33]:
query = "Given that this patient is at 22 weeks of gestation and without signs of systemic infection, how does the choice of antibiotic like nitrofurantoin compare to other options in terms of safety during pregnancy, and what factors should be considered when prescribing antibiotics to pregnant patients?"

In [28]:
answer = rag(query)
print(answer)

The best treatment for this patient, who presents with burning upon urination at 22 weeks gestation, is medication. Given her symptoms and the context provided, it suggests she may have a urinary tract infection (UTI), which is common in pregnancy. Therefore, appropriate medication such as antibiotics would be the most effective treatment option.


## Retrieval Evaluation

In [34]:
df_question = pd.read_csv('./data/ground_truth_retrieval_small.csv')

In [35]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'What are the potential causes of dysuria in a 23-year-old pregnant woman at 22 weeks gestation who presents with burning upon urination, and how do the symptoms of a urinary tract infection compare to other conditions like vulvovaginitis?'}

In [36]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
    
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [37]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [41]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d.get('id') == doc_id for d in results if d.get('id') is not None]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [40]:
evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/495 [00:00<?, ?it/s]

{'hit_rate': 0.9676767676767677, 'mrr': 0.9060942760942758}

Baseline: 

Hit Rate: 96.76%,
MRR: 90.60%

## Finding best parameters

In [62]:
def hybrid_query_rrf(search_query: str, boost_params: Dict[str, float]) -> List[Dict]:
    vector = embeddings.embed_query(search_query)
    k = 60

    knn_query = {
        "field": "question_answer_vector",
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    fields_with_boosts = []
    for field, boost in boost_params.items():
        fields_with_boosts.append(f"{field}^{boost}")

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": search_query,
                    "fields": fields_with_boosts,
                    "type": "best_fields",
                }
            },
        }
    }

    source_fields = [
        "id", "question", "answer", "medical_department", "condition_type",
        "patient_demographics", "common_symptoms", "treatment_or_management", "severity"
    ]

    knn_response = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10,
            "_source": source_fields
        }
    )
    knn_results = knn_response['hits']['hits']

    keyword_response = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10,
            "_source": source_fields
        }
    )
    keyword_results = keyword_response['hits']['hits']

    def compute_rrf(rank, k=60):
        return 1.0 / (k + rank)

    rrf_scores = {}
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = next((hit for hit in (knn_results + keyword_results) if hit['_id'] == doc_id), None)
        if doc:
            source = doc['_source']
            final_results.append(source)

    return final_results

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d.get('id') == doc_id for d in results if d.get('id') is not None]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

def optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')

    for _ in range(n_iterations):
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            current_params[param] = random.uniform(min_val, max_val)

        current_score = objective_function(current_params)
    
        if current_score > best_score:
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [63]:
param_ranges = {
     'question': (0.0, 3.0),
     'answer': (0.0, 3.0),
     'medical_department': (0.0, 3.0),
     'condition_type': (0.0, 3.0),
     'patient_demographics': (0.0, 3.0),
     'common_symptoms': (0.0, 3.0),
     'treatment_or_management': (0.0, 3.0),
     'severity': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return hybrid_query_rrf(q['question'], boost_params)
    
    results = evaluate(ground_truth, search_function)
    return results['mrr']

In [None]:
best_params, best_score = optimize(param_ranges, objective, n_iterations=10)
print("Best parameters:", best_params)
print("Best MRR score:", best_score)

Improved Hybrid Search with RRF

In [73]:
def hybrid_query_rrf(search_query: str) -> List[Dict]:

    best_boost_params = {
    'question': 1.62,
    'answer': 1.70,
    'medical_department': 1.67,
    'condition_type': 0.97,
    'patient_demographics': 0.64,
    'common_symptoms': 1.75,
    'treatment_or_management': 0.27,
    'severity': 1.85
    }
    
    vector = embeddings.embed_query(search_query)
    k = 60

    knn_query = {
        "field": "question_answer_vector",
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    fields_with_boosts = []
    for field, boost in boost_params.items():
        fields_with_boosts.append(f"{field}^{boost}")

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": search_query,
                    "fields": fields_with_boosts,
                    "type": "best_fields",
                }
            },
        }
    }

    source_fields = [
        "id", "question", "answer", "medical_department", "condition_type",
        "patient_demographics", "common_symptoms", "treatment_or_management", "severity"
    ]

    knn_response = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10,
            "_source": source_fields
        }
    )
    knn_results = knn_response['hits']['hits']

    keyword_response = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10,
            "_source": source_fields
        }
    )
    keyword_results = keyword_response['hits']['hits']

    def compute_rrf(rank, k=60):
        return 1.0 / (k + rank)

    rrf_scores = {}
    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        score = compute_rrf(rank + 1, k)
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + score

    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = next((hit for hit in (knn_results + keyword_results) if hit['_id'] == doc_id), None)
        if doc:
            source = doc['_source']
            final_results.append(source)

    return final_results

def question_text_hybrid(q):
    question = q['question']
    return hybrid_query_rrf(question)

In [70]:
evaluate(ground_truth, question_text_hybrid)

  0%|          | 0/495 [00:00<?, ?it/s]

{'hit_rate': 0.9676767676767677, 'mrr': 0.9081144781144779}

Boost parameters tuned: 

Hit Rate: 96.76%,
MRR: 90.81%

## RAG Evaluation - LLM-as-a-Judge

In [74]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [75]:
record = ground_truth[0]
question = record['question']
answer_orig = documents[0]['answer']
answer_llm = rag(question)
print(answer_llm)

The potential causes of dysuria in a 23-year-old pregnant woman at 22 weeks gestation with burning upon urination could primarily include urinary tract infections (UTIs) or other conditions affecting the urinary or genital systems, such as vulvovaginitis.

When considering urinary tract infections, the symptoms typically include a burning sensation during urination, increased frequency of urination, urgency, and possibly lower abdominal discomfort. UTIs can often present with these symptoms due to the inflammation of the bladder or urethra.

In contrast, vulvovaginitis may also present with a burning sensation but often includes additional symptoms such as itching, vaginal discharge, or irritation. While both conditions may feature dysuria and burning, vulvovaginitis typically has a broader spectrum of associated symptoms related to the vaginal area, compared to the more urinary-focused symptoms of a UTI.

Therefore, while both UTIs and vulvovaginitis can cause dysuria, the distinction

In [76]:
prompt = prompt1_template.format(question=question, answer_orig=answer_orig, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: Nitrofurantoin
Generated Question: What are the potential causes of dysuria in a 23-year-old pregnant woman at 22 weeks gestation who presents with burning upon urination, and how do the symptoms of a urinary tract infection compare to other conditions like vulvovaginitis?
Generated Answer: The potential causes of dysuria in a 23-year-old pregnant woman at 22 weeks gestation with burning upon urination could primarily include urinary tract infections (UTIs) or other conditions affecting the urinary or genital systems, such as vulvovaginitis.

When considering urinary tract infections, the symptoms 

In [77]:
llm(prompt)

'{\n  "Relevance": "NON_RELEVANT",\n  "Explanation": "The generated answer discusses the potential causes of dysuria and compares urinary tract infections with vulvovaginitis, but it does not mention \'Nitrofurantoin,\' which is the original answer provided. The focus of the original answer is on a specific medication that is relevant to treating UTIs, whereas the generated answer lacks context about treatment options and is more focused on symptoms and comparisons, making it non-relevant to the original answer."\n}'

## GPT4o-mini

In [80]:
model = 'gpt-4o-mini'

## Evaluation for Prompt 1

In [71]:
evaluations = []

In [None]:
for record in tqdm(ground_truth):

    question = record['question']
    answer_orig = documents[record['id']]['answer']
    answer_llm = rag(question)
    
    prompt = prompt1_template.format(
        question=question,
        answer_orig=answer_orig,
        answer_llm=answer_llm
    )
    evaluation = llm(prompt, model)
    evaluation = json.loads(evaluation)
    
    evaluations.append((record, answer_orig, answer_llm, evaluation))

In [83]:
df_eval = pd.DataFrame(evaluations_2, columns = ['record', 'answer_llm', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval.to_csv('llm_as_a_judge_prompt2_gpt-4o-mini.csv', index=False)

df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.911111
PARTLY_RELEVANT    0.044444
NON_RELEVANT       0.044444
Name: proportion, dtype: float64

## GPT4o

In [84]:
model = 'gpt-4o'

In [113]:
evaluations = []

In [None]:
for record in tqdm(ground_truth):

    question = record['question']
    answer_orig = documents[record['id']]['answer']
    answer_llm = rag(question)
    
    prompt = prompt1_template.format(
        question=question,
        answer_orig=answer_orig,
        answer_llm=answer_llm
    )
    evaluation = llm(prompt, model)
    evaluation = json.loads(evaluation)
    
    evaluations.append((record, answer_orig, answer_llm, evaluation))

In [120]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer_orig', 'answer_llm', 'evaluation'])
    
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval.to_csv('llm_as_a_judge_prompt1_gpt-4o.csv', index=False)

df_eval.relevance.value_counts(normalize=True)

relevance
NON_RELEVANT       0.677551
PARTLY_RELEVANT    0.216327
RELEVANT           0.106122
Name: proportion, dtype: float64