In [1]:
import pandas as pd

# Ingestion

In [2]:
df = pd.read_csv('../data/data.csv')
df.head()

Unnamed: 0,ID,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instructions
0,0,Push-Ups,Strength,Bodyweight,Upper Body,Push,"Pectorals, Triceps, Deltoids",Start in a high plank position with your hands...
1,1,Squats,Strength,Bodyweight,Lower Body,Push,"Quadriceps, Glutes, Hamstrings",Stand with feet shoulder-width apart. Lower yo...
2,2,Plank,Strength/Mobility,Bodyweight,Core,Hold,"Rectus Abdominis, Transverse Abdominis",Start in a forearm plank position with your el...
3,3,Deadlift,Strength,Barbell,Lower Body,Pull,"Glutes, Hamstrings, Lower Back","Stand with feet hip-width apart, barbell in fr..."
4,4,Bicep Curls,Strength,Dumbbells,Upper Body,Pull,"Biceps, Forearms","Stand with a dumbbell in each hand, arms fully..."


In [3]:
import minsearch

In [4]:
documents = df.to_dict(orient='records')

In [5]:
# Build the minsearch index: every descriptive column is searched as free
# text, while "ID" is registered as a keyword field.
index = minsearch.Index(
    text_fields=[
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    ],
    keyword_fields=["ID"],
)


In [6]:
index.fit(documents)

<minsearch.minsearch.Index at 0x77402215d790>

In [7]:
query = 'give me a workout for my legs'

In [8]:
index.search(query, num_results=10)

[{'ID': 150,
  'exercise_name': 'One-Arm Dumbbell Row',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Dumbbell',
  'body_part': 'Upper Body',
  'type': 'Pull',
  'muscle_groups_activated': 'Latissimus Dorsi, Biceps',
  'instructions': 'Place one knee and hand on a bench for support. Row the dumbbell towards your torso with the other hand.'},
 {'ID': 57,
  'exercise_name': 'Single Arm Dumbbell Row',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Dumbbell',
  'body_part': 'Upper Body',
  'type': 'Pull',
  'muscle_groups_activated': 'Latissimus Dorsi, Biceps',
  'instructions': 'Place one knee and hand on a bench for support, row the dumbbell towards your torso with the other hand.'},
 {'ID': 14,
  'exercise_name': 'Leg Raises',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Bodyweight',
  'body_part': 'Core',
  'type': 'Hold',
  'muscle_groups_activated': 'Rectus Abdominis, Hip Flexors',
  'instructions': 'Lie on your back with your legs extended. Lift y

# RAG flow

In [9]:
from google import genai

In [10]:
import os

# Only fall back to the placeholder when the variable is not already set, so a
# real key exported in the shell (or loaded from a .env file) is never
# overwritten by this cell. Replace the key outside the notebook; do not
# commit a real key here.
os.environ.setdefault("GEMINI_API_KEY", "YOUR_API_KEY")

In [11]:
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

In [12]:
def search(query):
    """Query the minsearch index with an empty boost dict and return up to 10 results."""
    return index.search(query=query, boost_dict={}, num_results=10)

In [13]:
# Top-level prompt sent to the LLM. `{question}` receives the user query and
# `{context}` the rendered search results (see `entry_template`).
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

# Rendering of a single exercise record inside the CONTEXT section.
# `.strip()` removes the leading newline/indent and the trailing blank lines,
# so the first line is flush-left while the remaining lines keep their
# four-space indent.
entry_template = """
    'exercise_name': {exercise_name}
    'type_of_activity': {type_of_activity}
    'type_of_equipment': {type_of_equipment}
    'body_part':  {body_part}
    'type' : {type}
    'muscle_groups_activated': {muscle_groups_activated}
    'instructions': {instructions}

    """.strip()

In [14]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Every document is formatted with ``entry_template`` and followed by a
    blank line; the concatenated entries fill the ``{context}`` slot of
    ``prompt_template``. Leading/trailing whitespace of the final prompt is
    stripped.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n" for doc in search_results
    ]
    filled = prompt_template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

In [15]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [16]:


def llm(prompt, model = "gemini-1.5-flash"):
    """
    Send ``prompt`` to the Gemini client and return the text of the reply.

    Args:
        prompt (str): Prompt passed as ``contents`` to ``generate_content``.
        model (str): Name of the model to call.
    Returns:
        The ``text`` attribute of the response object.
    """
    generation = client.models.generate_content(contents=prompt, model=model)
    return generation.text



In [17]:

def rag(query):
    """Answer ``query``: search the index, build the prompt, and call the LLM."""
    return llm(build_prompt(query, search(query)))

## Retrieval evaluation

In [18]:
# Prompt used to generate ground-truth questions for one exercise record.
# The placeholders are filled from the record's fields via `.format(**doc)`;
# the doubled braces in the example output are literal `{`/`}` after formatting.
prompt1_template = """

You emulate a user of our fitness assistant application. 
Formulate 5 questions a user might ask based on the provided exercise. 
Make the questions specific to the exercise. 
The record should contain the answers to the questions, and the questions should be complete and not too short.
Use as few words as possible from the record.

The record:

exercise_name: {exercise_name} 
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part} type: {type} 
muscle_groups_activated: {muscle_groups_activated} 
instructions: {instructions}

Provide the output as a pure JSON string, without wrapping it in Markdown code fences, code blocks, or any other formatting.
Example output:

{{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
Don't provide answers, just questions.
""".strip()

In [20]:
prompt = prompt1_template.format(**documents[0])

In [21]:
questions = llm(prompt)

In [22]:
questions

'{"questions": ["How many sets and reps of push-ups should I aim for as a beginner?", "Can I modify the push-up to make it easier if I can\'t do a full push-up?", "What is the proper hand placement for optimal activation of my pectoral muscles during push-ups?", "Besides the pectorals, triceps, and deltoids, what other muscles are indirectly involved in push-ups?", "If I feel pain in my wrists during push-ups, what alternative exercise can I substitute?"]}\n'

In [23]:
import json

In [24]:
json.loads(questions)

{'questions': ['How many sets and reps of push-ups should I aim for as a beginner?',
  "Can I modify the push-up to make it easier if I can't do a full push-up?",
  'What is the proper hand placement for optimal activation of my pectoral muscles during push-ups?',
  'Besides the pectorals, triceps, and deltoids, what other muscles are indirectly involved in push-ups?',
  'If I feel pain in my wrists during push-ups, what alternative exercise can I substitute?']}

In [29]:
def generate_questions(doc):
    """Return the LLM's raw (unparsed) reply to the question-generation prompt for ``doc``."""
    return llm(prompt1_template.format(**doc))

In [30]:
from tqdm.auto import tqdm   


In [31]:
results = {}


In [None]:
# Generate questions for every document and collect them in `results`,
# keyed by document ID.
for doc in tqdm(documents):
    doc_id = doc['ID']
    # Skip documents already processed so the cell can be re-run to resume
    # after an interruption without repeating LLM calls.
    if doc_id in results:
        continue
    questions_raw = generate_questions(doc)
    # The model is asked for a bare JSON object; json.loads raises if the
    # reply is not valid JSON.
    questions = json.loads(questions_raw.strip())
    results[doc_id] = questions['questions']

In [58]:
# Flatten {doc_id: [question, ...]} into a list of (doc_id, question) pairs.
final_results = [
    (doc_id, question_text)
    for doc_id, generated in results.items()
    for question_text in generated
]

In [59]:
final_results[0]

(0,
 'How many sets and repetitions of push-ups are recommended for building upper body strength?')

In [62]:
df_results= pd.DataFrame(final_results, columns=['id', 'question'])

In [63]:
df_results.to_csv('../data/ground-trunth-retrieval.csv', index=False)

In [64]:
!head ../data/ground-trunth-retrieval.csv

id,question
0,How many sets and repetitions of push-ups are recommended for building upper body strength?
0,Should my elbows be tucked in close to my body or flared out during push-ups?
0,What modifications can I make to the push-up if I'm unable to perform a standard push-up?
0,Does the position of my hands affect which muscles are primarily worked during a push-up?
0,"Are there variations of push-ups that target specific muscle groups within the chest, triceps, and shoulders?"
1,How many reps and sets of squats should I do for optimal muscle growth?
1,"Should my knees go past my toes during the squat, and if not, why?"
1,What are some common mistakes to avoid when performing squats to prevent injury?
1,Is it better to do squats with a narrow or wide stance for targeting different muscle groups?


In [32]:
df_questions = pd.read_csv('../data/ground-trunth-retrieval.csv')

In [34]:
ground_truth = df_questions.to_dict(orient='records')

In [35]:
ground_truth[0]

{'id': 0,
 'question': 'How many sets and repetitions of push-ups are recommended for building upper body strength?'}

In [36]:
def hit_rate(relevance_total):
    """Calculate the hit rate.

    Args:
        relevance_total: One iterable of relevance flags per query, in ranked
            order (truthy = the expected document is at that position).

    Returns:
        Fraction of queries with at least one truthy flag, or 0.0 when
        ``relevance_total`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0
    hits = sum(1 for relevance in relevance_total if any(relevance))
    return hits / total

In [37]:
def mrr(relevance_total):
    """Calculate the Mean Reciprocal Rank (MRR).

    Args:
        relevance_total: One iterable of relevance flags per query, in ranked
            order (truthy = the expected document is at that position).

    Returns:
        Mean over queries of ``1 / rank`` of the first truthy flag, counting
        0 for queries with no truthy flag. Returns 0.0 when
        ``relevance_total`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    reciprocal_sum = 0.0
    for relevance in relevance_total:
        # Truthiness (as in hit_rate's any()) rather than list.index(True):
        # works for any iterable, not only sequences with an .index method.
        for position, is_relevant in enumerate(relevance, start=1):
            if is_relevant:
                reciprocal_sum += 1 / position
                break
    return reciprocal_sum / total

In [38]:
def minsearch_search(query):
    """Search the minsearch index for ``query`` (empty boost dict, up to 10 results)."""
    return index.search(query=query, boost_dict={}, num_results=10)

In [39]:
def evaluate_search(ground_truth, search_fn):
    """Score ``search_fn`` against ``ground_truth`` and return hit rate and MRR.

    Each ground-truth record is passed to ``search_fn``; a returned document
    counts as relevant when its "ID" equals the record's "id".
    """
    def relevance_for(record):
        expected_id = record["id"]
        return [hit["ID"] == expected_id for hit in search_fn(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]
    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [40]:
evaluate_search(ground_truth, lambda q: minsearch_search(query=q["question"]))

  0%|          | 0/1045 [00:00<?, ?it/s]

{'hit_rate': 0.8966507177033493, 'mrr': 0.7020464038885091}

# Finding the best parameters

In [42]:
# Positional split: the first 100 question rows are used for tuning, the rest
# are held out.
# NOTE(review): the rows are not shuffled, and the CSV preview above shows
# questions grouped by exercise id, so the validation slice presumably covers
# only the first exercises - confirm whether a random split is wanted.
df_validation = df_questions[:100]

df_test = df_questions[100:]

In [43]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

  import pkg_resources


In [44]:
from hyperopt.pyll import scope

## Tuning the boost weights on the validation split

In [45]:
gt_val = df_validation.to_dict(orient='records')
gt_test = df_test.to_dict(orient='records')

In [46]:
evaluate_search(gt_val, lambda q: minsearch_search(query=q["question"]))

  0%|          | 0/100 [00:00<?, ?it/s]

{'hit_rate': 0.8, 'mrr': 0.5968849206349206}

In [47]:
def minsearch_search_boosted(query, boost):
    """Search the minsearch index for ``query`` using ``boost`` as the boost dict (up to 10 results)."""
    return index.search(query=query, boost_dict=boost, num_results=10)


In [48]:
# Hyperopt search space: one uniform boost weight in [0, 3] per text field,
# labelled with the field's own name.
param_ranges = {
    field: hp.uniform(field, 0, 3)
    for field in (
        "exercise_name",
        "type_of_activity",
        "type_of_equipment",
        "body_part",
        "type",
        "muscle_groups_activated",
        "instructions",
    )
}

In [49]:
def objective(boost_params):
    """Hyperopt objective: negated MRR on ``gt_val`` for the given boost weights.

    The MRR is negated so that minimising the returned value maximises MRR.
    """
    metrics = evaluate_search(
        gt_val,
        lambda q: minsearch_search_boosted(query=q["question"], boost=boost_params),
    )
    return -metrics["mrr"]
    

In [50]:
# Minimise `objective` over `param_ranges` with TPE for 20 evaluations.
# NOTE(review): no `rstate` seed is passed, so the best parameters presumably
# differ between runs (the values printed below do not match the ones
# hard-coded in minsearch_search_improved) - consider seeding for
# reproducibility.
optim_params = fmin(objective,
    space=param_ranges,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/100 [00:00<?, ?it/s]

  5%|▌         | 1/20 [00:00<00:11,  1.67trial/s, best loss: -0.6468849206349206]

  0%|          | 0/100 [00:00<?, ?it/s]

 10%|█         | 2/20 [00:01<00:10,  1.73trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 15%|█▌        | 3/20 [00:01<00:10,  1.65trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 20%|██        | 4/20 [00:02<00:09,  1.61trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 25%|██▌       | 5/20 [00:03<00:09,  1.66trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 30%|███       | 6/20 [00:03<00:08,  1.57trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 35%|███▌      | 7/20 [00:04<00:08,  1.59trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 40%|████      | 8/20 [00:05<00:07,  1.54trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 45%|████▌     | 9/20 [00:05<00:07,  1.56trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 50%|█████     | 10/20 [00:06<00:06,  1.58trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 55%|█████▌    | 11/20 [00:07<00:06,  1.49trial/s, best loss: -0.6504444444444445]

  0%|          | 0/100 [00:00<?, ?it/s]

 60%|██████    | 12/20 [00:07<00:05,  1.55trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 65%|██████▌   | 13/20 [00:08<00:04,  1.62trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 70%|███████   | 14/20 [00:08<00:03,  1.60trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 75%|███████▌  | 15/20 [00:09<00:03,  1.64trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 80%|████████  | 16/20 [00:09<00:02,  1.73trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 85%|████████▌ | 17/20 [00:10<00:01,  1.72trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 90%|█████████ | 18/20 [00:11<00:01,  1.66trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

 95%|█████████▌| 19/20 [00:11<00:00,  1.64trial/s, best loss: -0.6642063492063492]

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:12<00:00,  1.63trial/s, best loss: -0.6642063492063492]


In [51]:
optim_params

{'body_part': np.float64(0.28948257778251707),
 'exercise_name': np.float64(2.0824319225191696),
 'instructions': np.float64(2.117875365915427),
 'muscle_groups_activated': np.float64(0.3778777904539854),
 'type': np.float64(1.2345697029690499),
 'type_of_activity': np.float64(2.6855588701365107),
 'type_of_equipment': np.float64(0.4626335300269879)}

In [52]:
# Score the tuned boosts on the full ground truth.
# NOTE(review): `ground_truth` includes the 100 validation rows used for
# tuning, and `gt_test` is never used in the visible cells - confirm whether
# the held-out split should be reported instead.
evaluate_search(ground_truth, lambda q: minsearch_search_boosted(query=q["question"], boost=optim_params))

  0%|          | 0/1045 [00:00<?, ?it/s]

{'hit_rate': 0.9358851674641149, 'mrr': 0.7871326042378675}

In [53]:
def minsearch_search_improved(query):
    """Search the minsearch index for ``query`` with a fixed set of per-field boost weights."""
    tuned_boost = {
        'body_part': 0.947771590052861,
        'exercise_name': 2.8439224585464493,
        'instructions': 0.6987228015703881,
        'muscle_groups_activated': 0.49261344050772715,
        'type': 2.587600420079811,
        'type_of_activity': 0.3898333128794963,
        'type_of_equipment': 1.234288967556835,
    }
    return minsearch_search_boosted(query=query, boost=tuned_boost)

In [54]:
evaluate_search(ground_truth, lambda q: minsearch_search_improved(query=q["question"]))

  0%|          | 0/1045 [00:00<?, ?it/s]

{'hit_rate': 0.9349282296650717, 'mrr': 0.7900831624515836}

# RAG Evaluation

In [55]:

def rag(query, model="gemini-1.5-flash"):
    """Answer ``query`` with ``model``, using the boosted search for retrieval."""
    retrieved = minsearch_search_improved(query)
    return llm(build_prompt(query, retrieved), model=model)

In [56]:
# LLM-as-judge prompt: grades a generated answer against its question without
# a reference answer. `{question}` and `{answer_llm}` are filled via
# `.format`; the doubled braces render as literal `{`/`}` in the example JSON.
prompt2_template = """
You are an expert judge evaluating a generated answer in a Question-Answering (QA) system. You do NOT have access to a reference answer.

You are given:
- A generated question
- A generated answer

Your task is to assess whether the generated answer is appropriate, coherent, and directly relevant to the question.

Provide the output as a pure JSON string, without wrapping it in Markdown code fences, code blocks, or any other formatting.
Example output:

{{
  "evaluation": "RELEVANT" | "PARTIALLY_RELEVANT" | "NON_RELEVANT",
  "explanation": "Brief explanation of your reasoning"
}}

Guidelines:
- "RELEVANT": The answer is coherent, correct, and directly answers the question.
- "PARTIALLY_RELEVANT": The answer is partially correct or vague, or it omits key information.
- "NON_RELEVANT": The answer does not answer the question, is off-topic, or is factually incorrect.

Now evaluate:

Question: {question} 
Generated Answer: {answer_llm}
""".strip()

In [57]:
record = ground_truth[0]

In [58]:
question = record['question']
answer_llm = rag(question)
prompt2 = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt2)

You are an expert judge evaluating a generated answer in a Question-Answering (QA) system. You do NOT have access to a reference answer.

You are given:
- A generated question
- A generated answer

Your task is to assess whether the generated answer is appropriate, coherent, and directly relevant to the question.

Provide the output as a pure JSON string, without wrapping it in Markdown code fences, code blocks, or any other formatting.
Example output:

{
  "evaluation": "RELEVANT" | "PARTIALLY_RELEVANT" | "NON_RELEVANT",
  "explanation": "Brief explanation of your reasoning"
}

Guidelines:
- "RELEVANT": The answer is coherent, correct, and directly answers the question.
- "PARTIALLY_RELEVANT": The answer is partially correct or vague, or it omits key information.
- "NON_RELEVANT": The answer does not answer the question, is off-topic, or is factually incorrect.

Now evaluate:

Question: How many sets and repetitions of push-ups are recommended for building upper body strength? 
Genera

In [59]:
# Draw 200 questions for the RAG evaluation. A fixed random_state makes the
# sample identical across kernel restarts, so the evaluation runs for the
# different models remain comparable and reproducible.
df_sample = df_questions.sample(200, random_state=42)
sample = df_sample.to_dict(orient='records')

In [143]:
import json

In [60]:
evaluations = []

In [61]:
# Answer every sampled question with the RAG pipeline and have the judge
# prompt grade the answer.
for rec in tqdm(sample):
    # Named `doc_id` rather than `id` so the builtin `id()` is not shadowed
    # for the rest of the notebook session.
    doc_id = rec['id']
    question = rec['question']

    answer_llm = rag(question)
    # Generate the evaluation prompt
    prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
    # The judge is asked for bare JSON; json.loads raises if the reply is not valid JSON.
    evaluation = json.loads(llm(prompt))
    evaluations.append((doc_id, rec, answer_llm, evaluation["evaluation"], evaluation["explanation"]))

  0%|          | 0/200 [00:00<?, ?it/s]

In [63]:
# Bare expression: not displayed, because only a cell's last expression is
# rendered; it raises IndexError if `evaluations` is empty.
evaluations[0]
# One row per evaluated question: (id, raw record, answer, verdict, explanation).
df_evaluations = pd.DataFrame(evaluations, columns=['id', 'record', 'answer_llm', 'evaluation', 'explanation'])

In [64]:
df_evaluations.head()

Unnamed: 0,id,record,answer_llm,evaluation,explanation
0,177,"{'id': 177, 'question': 'Can I modify the Cabl...",The provided text describes several exercises ...,RELEVANT,The answer directly addresses the question by ...
1,207,"{'id': 207, 'question': 'How can I modify the ...",The provided text suggests using a resistance ...,RELEVANT,The answer directly addresses the question by ...
2,38,"{'id': 38, 'question': 'Should I keep my back ...",The provided text describes how to perform a l...,PARTIALLY_RELEVANT,The answer correctly points out that the provi...
3,125,"{'id': 125, 'question': 'What modifications ca...",The provided text describes the Superman exerc...,PARTIALLY_RELEVANT,The answer correctly identifies that the provi...
4,145,"{'id': 145, 'question': 'How many repetitions ...",The provided text gives instructions for a Sta...,PARTIALLY_RELEVANT,The answer correctly identifies that the quest...


In [71]:
# df_evaluations["question"] = df_evaluations["record"].apply(lambda x: x['question'])
# df_evaluations["id"] = df_evaluations["record"].apply(lambda x: x['id'])
# df_evaluations = df_evaluations.drop(columns=['record'])
df_evaluations.rename(columns={'evaluation': 'relevance'}, inplace=True)
df_evaluations.to_csv('../data/ground-truth-evaluation.csv', index=False)
!head ../data/ground-truth-evaluation.csv

id,answer_llm,relevance,explanation,question
177,"The provided text describes several exercises using a cable machine.  It does not offer modifications for these exercises if a cable machine is unavailable.
",RELEVANT,The answer directly addresses the question by stating that the provided text (presumably context not shown here) lacks modifications for cable exercises when a cable machine is unavailable.  This is a concise and accurate response to the inquiry.,Can I modify the Cable Tricep Kickback exercise if I lack access to a cable machine?
207,"The provided text suggests using a resistance band as an alternative.  Attach a resistance band to a high anchor point and pull the band towards your face, leading with your elbows.
",RELEVANT,The answer directly addresses the question by providing a suitable modification for the face pull exercise when a cable machine is unavailable.  It offers a clear and concise alternative using resistance bands.,How can I modify the face pull if I don't

In [72]:
df_evaluations.head()

Unnamed: 0,id,answer_llm,relevance,explanation,question
0,177,The provided text describes several exercises ...,RELEVANT,The answer directly addresses the question by ...,Can I modify the Cable Tricep Kickback exercis...
1,207,The provided text suggests using a resistance ...,RELEVANT,The answer directly addresses the question by ...,How can I modify the face pull if I don't have...
2,38,The provided text describes how to perform a l...,PARTIALLY_RELEVANT,The answer correctly points out that the provi...,Should I keep my back flat against the backres...
3,125,The provided text describes the Superman exerc...,PARTIALLY_RELEVANT,The answer correctly identifies that the provi...,What modifications can I make to the Superman ...
4,145,The provided text gives instructions for a Sta...,PARTIALLY_RELEVANT,The answer correctly identifies that the quest...,How many repetitions should I aim for in a Sta...


In [74]:
df_evaluations.relevance.value_counts(normalize=True)

relevance
PARTIALLY_RELEVANT    0.480
NON_RELEVANT          0.415
RELEVANT              0.105
Name: proportion, dtype: float64

In [89]:
evaluation_gemini20flash = []

In [91]:
# Same evaluation as above, but answers are generated with gemini-2.0-flash;
# the judge call still uses llm()'s default model.
for rec in tqdm(sample):
    # Named `doc_id` rather than `id` so the builtin `id()` is not shadowed
    # for the rest of the notebook session.
    doc_id = rec['id']
    question = rec['question']

    answer_llm = rag(question, model="gemini-2.0-flash")
    # Generate the evaluation prompt
    prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
    # The judge is asked for bare JSON; json.loads raises if the reply is not valid JSON.
    evaluation = json.loads(llm(prompt))
    evaluation_gemini20flash.append((doc_id, rec, answer_llm, evaluation["evaluation"], evaluation["explanation"]))

  0%|          | 0/200 [00:00<?, ?it/s]

In [92]:
# Build the gemini-2.0-flash evaluation table in one chain: rename the verdict
# column, pull `question` and `id` out of the raw record, then drop the record.
df_evaluations_gemini20flash = (
    pd.DataFrame(
        evaluation_gemini20flash,
        columns=['id', 'record', 'answer_llm', 'evaluation', "explanation"],
    )
    .rename(columns={'evaluation': 'relevance'})
    .assign(
        question=lambda frame: frame["record"].apply(lambda x: x['question']),
        id=lambda frame: frame["record"].apply(lambda x: x['id']),
    )
    .drop(columns=['record'])
)
df_evaluations_gemini20flash.to_csv('../data/ground-truth-evaluation-gemini20flash.csv', index=False)
df_evaluations_gemini20flash.head()

Unnamed: 0,id,answer_llm,relevance,explanation,question
0,177,The Cable Tricep Kickback exercise requires a ...,PARTIALLY_RELEVANT,The answer correctly states that the exercise ...,Can I modify the Cable Tricep Kickback exercis...
1,207,You can modify the face pull by using a resist...,RELEVANT,The answer directly addresses the question by ...,How can I modify the face pull if I don't have...
2,38,The provided text does not include instruction...,NON_RELEVANT,The answer is a canned response indicating tha...,Should I keep my back flat against the backres...
3,125,"I am sorry, but the provided context does not ...",PARTIALLY_RELEVANT,The answer is honest in stating it cannot prov...,What modifications can I make to the Superman ...
4,145,"I am sorry, but the number of repetitions for ...",PARTIALLY_RELEVANT,The answer acknowledges it cannot provide a sp...,How many repetitions should I aim for in a Sta...


In [93]:
!head ../data/ground-truth-evaluation-gemini20flash.csv

id,answer_llm,relevance,explanation,question
177,"The Cable Tricep Kickback exercise requires a cable machine.
",PARTIALLY_RELEVANT,"The answer correctly states that the exercise requires a cable machine, which directly addresses part of the question. However, it does not offer any suggestions for modifications or alternatives if a cable machine is unavailable, making it only partially relevant to the user's inquiry.",Can I modify the Cable Tricep Kickback exercise if I lack access to a cable machine?
207,"You can modify the face pull by using a resistance band attached to a high anchor.
",RELEVANT,The answer directly addresses the question by providing a viable alternative to using a cable machine for performing a face pull exercise.  It's concise and accurate.,How can I modify the face pull if I don't have access to a cable machine?
38,"The provided text does not include instructions on whether or not to keep your back flat against the backrest throughout the entire leg press movemen