In [1]:
import pandas as pd

In [3]:
import os

In [5]:
from openai import OpenAI

# Shared API client used by every model call in this notebook.
# No credentials are passed here; presumably the client picks them up from the
# environment (OPENAI_API_KEY) — confirm before running on a fresh machine.
client = OpenAI()

In [6]:
# Load the exercise table and convert it to a list with one dict per row,
# keyed by column name (orient='records').
df = pd.read_csv('data.csv')
documents = df.to_dict(orient='records')

In [8]:
# Prompt used to generate ground-truth questions from a single exercise record.
# Single-brace fields are filled in by str.format(); the doubled braces around
# the JSON example are escapes that render as literal "{" and "}".
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [9]:
# Render the template for the first record as a one-off check of the prompt.
# str.format() ignores keyword arguments the template does not reference, so
# extra keys in the record (such as 'id') are harmless.
prompt = prompt_template.format(**documents[0])

In [12]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single user message to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the chat model to query. Defaults to the model this
            notebook was originally written against.

    Returns:
        The ``content`` of the first choice's message, exactly as returned by
        the client.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only one completion is requested, so the first choice is the answer.
    return response.choices[0].message.content

In [13]:
# Send the sample prompt to the model; the reply is kept as raw text.
questions = llm(prompt)

In [15]:
import json

In [16]:
# Check that the raw reply parses as JSON and display the parsed result
# (json.loads raises json.JSONDecodeError if the text is not valid JSON).
json.loads(questions)

{'questions': ['What is the starting position for performing push-ups?',
  'Which muscle groups are primarily activated during push-ups?',
  'What body part do push-ups predominantly work on?',
  'Is equipment required to perform push-ups, and if so, what is it?',
  'What is the proper technique for lowering down during a push-up?']}

In [21]:
def generate_questions(doc):
    """Ask the model for questions about one exercise record and return them as JSON text.

    The prompt is built by filling ``prompt_template`` with the fields of
    ``doc`` and is sent through the shared ``llm`` helper. The model may wrap
    its answer in extra text, so only the span from the first ``{`` to the
    last ``}`` of the reply is parsed.

    Args:
        doc: Mapping that provides every field referenced by
            ``prompt_template``. Its ``id`` entry, if present, is used only in
            the failure message.

    Returns:
        A JSON string whose top level is an object with a ``questions`` list,
        or ``None`` when the reply is empty, contains no parsable JSON object,
        or lacks a ``questions`` list. Callers must check for ``None`` before
        parsing the result.
    """
    prompt = prompt_template.format(**doc)
    raw_reply = llm(prompt)

    try:
        if raw_reply is None:
            # The API can return a message without text content.
            raise ValueError("empty model reply")

        # Trim any text the model added around the JSON object.
        start_idx = raw_reply.index('{')
        end_idx = raw_reply.rindex('}') + 1
        parsed_response = json.loads(raw_reply[start_idx:end_idx])

        # Valid JSON is not enough: downstream code reads the 'questions' list.
        if not isinstance(parsed_response, dict) or not isinstance(
            parsed_response.get('questions'), list
        ):
            raise ValueError("reply has no 'questions' list")
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, and str.index /
        # str.rindex raise ValueError when no brace is found.
        print(f"Failed to decode JSON for doc_id {doc.get('id')}: {raw_reply}")
        return None

    # Re-serialise so the caller always receives clean, valid JSON text.
    return json.dumps(parsed_response)


In [22]:
from tqdm.auto import tqdm

In [23]:
# Maps record id -> list of generated questions.
# Defined outside the generation loop, so re-running the loop keeps earlier
# entries; re-running this assignment discards them.
results = {}

In [24]:
# Generate questions for every record. `results` doubles as a resume cache:
# ids already present are skipped, so re-running this cell after an
# interruption only processes the records that are still missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    if questions_raw is None:
        # generate_questions returns None (after printing the reason) when the
        # reply could not be parsed. Leave the id out of `results` so a re-run
        # retries it instead of crashing the whole loop in json.loads(None).
        continue

    results[doc_id] = json.loads(questions_raw)['questions']

  0%|          | 0/207 [00:00<?, ?it/s]

In [26]:
# Flatten {id: [question, ...]} into a list of (id, question) tuples, keeping
# the iteration order of `results` and the order of each question list.
final_results = [
    (record_id, question)
    for record_id, question_list in results.items()
    for question in question_list
]

In [30]:
# Peek at the first (id, question) pair.
final_results[0]

(0, 'What is the starting position for Push-Ups?')

In [31]:
# One row per generated question, labelled with the id of the record it came from.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [32]:
# Display the frame to eyeball the generated questions.
df_results

Unnamed: 0,id,question
0,0,What is the starting position for Push-Ups?
1,0,Which muscle groups are activated during Push-...
2,0,Do I need any equipment to perform Push-Ups?
3,0,How do I properly lower my body during Push-Ups?
4,0,What body part is primarily worked by doing Pu...
...,...,...
1030,206,What is the primary body part targeted by the ...
1031,206,Which muscles are activated during the Dumbbel...
1032,206,Can I perform the Dumbbell Bench Press with di...
1033,206,What equipment do I need for the Dumbbell Benc...


In [33]:
# Write the (id, question) pairs to CSV; index=False leaves out the
# DataFrame's row index so the file contains only the two data columns.
df_results.to_csv('ground-truth-retrieval.csv', index=False)