In [1]:
from openai import OpenAI
import json
from tqdm import tqdm
import pandas as pd

In [2]:
with open("Medical-QA.json", "r") as f:
    documents = json.load(f)

len(documents)

16407

In [3]:
prompt_template = """
You are tasked with generating potential questions based on the structure of a medical dataset. This dataset contains fields like Question Type, Question, and Answer. Formulate 5 possible questions that a user might ask based on the provided record. Each question should be complete, concise, and avoid directly using too many words from the record itself.

The record format:

Question Type: {Question Type}
Question: {Question}
Answer: {Answer}

Please provide the output in parsable JSON format without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [4]:
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    response = client.chat.completions.create(
        model='gemma2:2b',
        messages=[{"role": "user", "content": prompt}],
    )

    json_response = response.choices[0].message.content
    return json_response

In [5]:
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue
    questions = generate_questions(doc)
    results[doc_id] = questions
    if len(results) >= 2000:
        break

 12%|█▏        | 2032/16407 [3:47:03<26:46:18,  6.70s/it]


In [6]:
parsed_resulst = {}
for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as e:
        continue

In [7]:
len(results), len(parsed_resulst)

(2000, 862)

In [8]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    for q in questions:
        final_results.append((q, doc_id))

df = pd.DataFrame(final_results, columns=['question', 'document'])
df.to_csv('ground-truth-data.csv', index=False)