In [36]:
import pandas as pd
import requests

In [47]:
from openai import OpenAI

# Ollama serves its OpenAI-compatible API under /v1 (the /api/generate route
# is Ollama's native API and cannot be used as an OpenAI base URL). The client
# requires an api_key value, so a placeholder is passed.
client = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1/")

In [48]:
# Read the source table, then flatten it into a list with one dict per row
# (column name -> cell value).
df = pd.read_csv('../data/my_data.csv')
documents = df.to_dict('records')

In [49]:
# Prompt used to generate ground-truth questions for one record.
# Single-brace fields are filled by str.format from the record's keys; the
# doubled braces in the last line render as literal JSON braces.
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

"name": {name},
"category": {category},
"equipment": {equipment},
"force": {force},
"instructions": {instructions},
"level": {level},
"mechanic": {mechanic},
"primaryMuscles": {primaryMuscles},
"secondaryMuscles": {secondaryMuscles}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [50]:
# Render the template for the first record as a sample prompt.
prompt = prompt_template.format_map(documents[0])

In [54]:
def llm(prompt, max_retries=3):
    """Send ``prompt`` to the local Ollama server and return the generated text.

    Args:
        prompt: Full prompt text to send to the model.
        max_retries: Maximum number of request attempts.

    Returns:
        The ``response`` field of Ollama's JSON reply, or the string
        ``"Error: Unable to get response from LLM"`` when no attempt
        succeeded (every attempt raised a ``requests`` error, or
        ``max_retries`` is less than 1).
    """
    import time  # local import, as in the original cell

    url = "http://localhost:11434/api/generate"
    data = {
        "model": "llama3.2:latest",
        "prompt": prompt,
        # Ask for a single JSON object instead of a stream of chunks.
        "stream": False,
        # Ollama reads sampling parameters from the "options" object, and its
        # generation-length limit is named "num_predict" (not "max_tokens").
        "options": {
            "temperature": 0.7,
            "top_p": 0.9,
            "num_predict": 500,
        },
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, json=data, timeout=30)
            response.raise_for_status()
            return response.json()['response']
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Pause only when another attempt will follow.
            if attempt < max_retries - 1:
                time.sleep(1)

    return "Error: Unable to get response from LLM"

In [55]:
# Run the sample prompt through the model; `questions` holds whatever string
# llm() returned (the model's text, or its error placeholder on failure).
questions = llm(prompt)

In [56]:
import json

In [None]:
# Check that the reply parses as JSON; json.loads raises if it does not.
json.loads(questions)

In [62]:
def generate_questions(doc, model="llama3.2:latest", timeout=120):
    """Ask the local Ollama server to generate questions for one record.

    Args:
        doc: Mapping whose keys fill the fields of ``prompt_template``.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The generated text (the ``response`` field of Ollama's reply) as a
        string. The prompt asks the model for JSON, but the text is returned
        unparsed.

    Raises:
        RuntimeError: If the server answers with a non-200 status code.
    """
    prompt = prompt_template.format(**doc)

    url = "http://localhost:11434/api/generate"  # Ollama's local API endpoint
    payload = {
        "model": model,
        "prompt": prompt,
        # Without this Ollama streams one JSON object per chunk, which
        # response.json() cannot decode.
        "stream": False,
    }

    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Error {response.status_code}: {response.text}")

    # /api/generate puts the generated text under "response".
    return response.json()["response"]


In [63]:
from tqdm.auto import tqdm

In [81]:
# Accumulates generated output keyed by document id. The generation loop
# skips ids already present here, so re-running this cell discards progress.
results = {}

In [None]:
for doc in tqdm(documents):
    doc_id = doc['id']

    # Skip ids already in `results`, so the cell can be re-run to resume
    # after a failure without repeating finished documents.
    if doc_id in results:
        continue

    try:
        questions_raw = generate_questions(doc)

        # The model replies with JSON text; parse it so that `results` maps
        # each id to a list of question strings rather than to one raw string
        # (iterating a string downstream would yield single characters).
        if isinstance(questions_raw, str):
            parsed = json.loads(questions_raw)
        else:
            parsed = questions_raw
        results[doc_id] = parsed['questions']
    except Exception as e:
        # Best effort: report the failure and move on; the document stays out
        # of `results`, so a later re-run retries it.
        print(f"Error processing document {doc_id}: {e}")

In [42]:
# Flatten {id: [question, ...]} into (id, question) pairs. A comprehension
# keeps the loop variables local, so the notebook-level `questions` variable
# set by an earlier cell is not overwritten.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [None]:
# Sanity check: show the first (id, question) pair.
final_results[0]

In [46]:
# One row per (id, question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [48]:
# Write the pairs to disk; index=False leaves the DataFrame index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [None]:
# Preview the start of the written file (shell command; needs `head` on PATH).
!head ../data/ground-truth-retrieval.csv