In [1]:
import csv

from openevals.prompts import CONCISENESS_PROMPT


def csv_to_json_evaluate(csv_file_path: str) -> list[dict]:
    """
    Load a training CSV into a list of JSON-style records (dicts).

    Each data row becomes one dict. Values are read from the header columns
    ``input``, ``Disease name``, ``Reasoning``, ``Recommendation``,
    ``Context`` and ``Thinking`` and stored under the keys ``input``,
    ``disease_name``, ``reasoning``, ``recommendation``, ``context`` and
    ``thinking``. Surrounding whitespace is stripped from every value.

    A column that is absent from the header, or a cell that is missing
    because a row is shorter than the header, yields an empty string.

    Args:
        csv_file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        One dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Output key -> CSV header name.
    column_map = {
        "input": "input",
        "disease_name": "Disease name",
        "reasoning": "Reasoning",
        "recommendation": "Recommendation",
        "context": "Context",
        "thinking": "Thinking",
    }
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        return [
            {
                # DictReader fills cells missing from short rows with None,
                # so `or ""` is needed before calling .strip().
                key: (row.get(column) or "").strip()
                for key, column in column_map.items()
            }
            for row in reader
        ]

In [2]:
# Parse the first training CSV into a list of record dicts.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
training_data1 = csv_to_json_evaluate('/Volumes/SandiskSSD//Downloads/training_tropical_dataset.csv')

In [4]:
# Number of records parsed from the first CSV.
len(training_data1)

39

In [5]:
# Parse the second training CSV into a list of record dicts.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
training_data2 = csv_to_json_evaluate('/Volumes/SandiskSSD//Downloads/data2.csv')

In [6]:
# Number of records parsed from the second CSV.
len(training_data2)

39

In [7]:
# Parse the third training CSV into a list of record dicts.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
training_data3 = csv_to_json_evaluate('/Volumes/SandiskSSD//Downloads/data3.csv')

In [8]:
# Number of records parsed from the third CSV.
len(training_data3)

39

In [3]:
# Display every record parsed from the first CSV.
# NOTE(review): this dumps the whole list; a slice such as training_data1[:1] keeps the output short.
training_data1

[{'input': "I’ve had a fever that keeps getting higher every day, and now my stomach hurts and I feel extremely tired. I also noticed I'm \nhaving constipation sometimes and then diarrhea the next day. Is this something serious or just a normal infection?",
  'disease_name': 'Typhoid fever',
  'reasoning': 'A fever that rises each day, stomach pain, chills, headache, weakness, and switching between constipation and diarrhea are symptoms that can\n appear in typhoid fever. Some people also get a rash, loss of appetite, or a swollen belly as the illness progresses. Typhoid is caused by the \nbacteria Salmonella enterica serotype typhi, which spreads through contaminated water, ice, uncooked foods like raw fruits without a peel, and \nfood handled by someone who didn’t wash their hands after using the bathroom. These symptoms can come from other infections too, so a \nmedical test is needed to confirm. If the fever or stomach pain increases, seeking medical care is important.',
  'recomme

In [10]:
# Display every record parsed from the second CSV.
# NOTE(review): this dumps the whole list; a slice such as training_data2[:1] keeps the output short.
training_data2

[{'input': 'A 35-year-old woman, currently living in New York after recently returning from a vacation to a Caribbean island, started experiencing sudden high fever, headache, fatigue, a rash, nausea, and red eyes about five days after her return. She has no known pre-existing conditions. After about two weeks, most of her initial symptoms improved, but she has continued to have significant joint and muscle pain for the past two months. What could be causing her prolonged joint and muscle pain after these initial symptoms, considering her recent travel?',
  'disease_name': 'Chikungunya',
  'reasoning': 'The described symptoms, particularly the sudden high fever, headache, rash, and red eyes, followed by persistent joint and muscle pain after travel to a high-risk area, are highly consistent with Chikungunya. This disease is caused by the Chikungunya virus, typically transmitted through the bite of an infected mosquito, and has spread globally, including to Caribbean islands. While many

In [11]:
# Display every record parsed from the third CSV.
# NOTE(review): this dumps the whole list; a slice such as training_data3[:1] keeps the output short.
training_data3

[{'input': "I went camping about 10 days ago and now I feel terrible. I have a moderate fever, chills, and a severe headache. My whole body aches, especially my muscles and joints, and I've been feeling nauseous with some diarrhea. I also noticed a rash on my skin. Is this just a flu or something related to being outdoors?",
  'disease_name': 'Ehrlichiosis',
  'reasoning': 'Moderate fever, chills, severe headache, muscle aches, and gastrointestinal issues like nausea, vomiting, or diarrhea are symptoms that can appear in ehrlichiosis. Some people also experience confusion, changes in mental state, or a rash (though this is more common in children) as the illness progresses. Ehrlichiosis is caused by various species of Ehrlichia bacteria, which spread primarily through the bite of an infected Lone Star tick, commonly found in south-central, southeastern, and eastern coastal states. These symptoms can mimic the flu or other tick-borne diseases like anaplasmosis, so a medical evaluation i

In [12]:
# Merge the three parsed CSVs into one new list, keeping file order
# (first CSV, then second, then third). The source lists are not modified.
training_data = [*training_data1, *training_data2, *training_data3]

In [13]:
# Display the combined record list.
# NOTE(review): this dumps the whole list; a slice such as training_data[:1] keeps the output short.
training_data

[{'input': "I’ve had a fever that keeps getting higher every day, and now my stomach hurts and I feel extremely tired. I also noticed I'm \nhaving constipation sometimes and then diarrhea the next day. Is this something serious or just a normal infection?",
  'disease_name': 'Typhoid fever',
  'reasoning': 'A fever that rises each day, stomach pain, chills, headache, weakness, and switching between constipation and diarrhea are symptoms that can\n appear in typhoid fever. Some people also get a rash, loss of appetite, or a swollen belly as the illness progresses. Typhoid is caused by the \nbacteria Salmonella enterica serotype typhi, which spreads through contaminated water, ice, uncooked foods like raw fruits without a peel, and \nfood handled by someone who didn’t wash their hands after using the bathroom. These symptoms can come from other infections too, so a \nmedical test is needed to confirm. If the fever or stomach pain increases, seeking medical care is important.',
  'recomme

In [14]:
# Total number of records after merging the three CSVs.
len(training_data)

117

In [14]:
import json
# Spot check: parse the first record's `context` cell as JSON and read its
# `has_gold_chunk` field.
json.loads(training_data[0]['context'].strip())['has_gold_chunk']

True

In [15]:
# System message placed at the start of every chat sample built in this notebook.
# It fixes the evidence priority (symptoms, then geography, then retrieved cases)
# and the answer layout ("Diagnosis: ..." on the first line, then "Explanation: ...").
# The text is emitted verbatim into the training data, so any edit changes the dataset.
SYSTEM_PROMPT = """
You are an expert in tropical and infectious diseases. Provide an evidence-based likely diagnosis consistent with guidance from authoritative public-health sources (e.g., WHO, CDC) when applicable. You will be provided with symptoms, and contextual information.

Use evidence in this priority order:
1. Patient’s symptoms + history (highest priority)
2. Geography/epidemiology
3. Retrieved RAG cases (supporting evidence only)

Critical rule: Use patient symptoms (PRIMARY) + retrieved cases (SUPPORTING) to diagnose. Keep the final explanation clear and readable. When helpful, interpret lay terms into clinical terms internally, accept both clinical and lay language (e.g., “really tired” → fatigue; “muscles and joints ache” → myalgia/arthralgia).

Output requirements:
    - Start with a single, precise disease name on the first line: "Diagnosis: <disease>"
    - Then provide 2–4 short sentences explaining why this is most likely ("Explanation: ...").
    - Be specific (e.g., "Lassa fever" instead of "viral hemorrhagic fever").
    - If uncertainty exists, mention it briefly but still choose the most likely diagnosis.
"""

## Retriever context extraction from RAG system

In [1]:
# `get_retriever` comes from the project-local RAG.QdrantRetriever module (not shown in this notebook).
from RAG.QdrantRetriever import get_retriever

# Build the retriever once; the query cell below reuses this object.
retriever = get_retriever()

W0131 10:43:57.675000 85942 /Volumes/SandiskSSD/Library/miniforge3/envs/Tropical_Infectious_Disease/lib/python3.13/site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
<All keys matched successfully>
  graph = Neo4jGraph(


Connected successfully!


In [17]:
user_question = """
I am a 43 year old man and recently went on a hike to Durian Perangin waterfall in Teluk Intan. I got a fever, headache and a rash on my back since getting back. What disease is this?
"""

hits = await retriever.query(user_question, k=5)
contexts_list = [hit.get("context", "") for hit in (hits or [])]
all_contexts = "\n---\n".join(contexts_list)
print(all_contexts)


[HISTORY OF PRESENT ILLNESS]
Disease: African tick-bite fever
Diagnosis Details: The final diagnosis is African tick-bite fever, caused by Rickettsia africae. This was a clinical diagnosis based on the patient's presentation with fever after travel to an endemic area in Southern Africa, and the pathognomonic finding of an inoculation eschar with regional lymphadenitis. The primary differential of malaria was ruled out with negative blood tests. The patient's rapid clinical improvement on doxycycline therapy further supported the diagnosis.
Vitals: On presentation, after taking 1g of paracetamol, his tympanic temperature was 37.5 °C (99.5 °F), pulse was 80 bpm, and blood pressure was 130/70 mmHg.
Content: A 52-year-old man presented to a tropical medicine clinic with a two-day history of fever, night sweats, and a frontal headache. The symptoms began ten days after he returned from a trip to South Africa. He denied any joint pains and had not noticed a rash. He did report having a skin 

## Extract train dataset

In [73]:
# Spot check: show the `generated_context_string` stored in record 99's `context` JSON.
# strict=False makes the JSON decoder accept control characters (e.g. raw newlines)
# inside string values.
json.loads(training_data[99]['context'].strip(), strict=False)['generated_context_string']

'[HISTORY OF PRESENT ILLNESS]\nDisease: Impetigo\nFinal Diagnosis: Non-bullous Impetigo\nVitals: Temp 37.2°C, HR 100, BP 90/60.\nContent: A 4-year-old male was brought to the clinic with a rash around his nose and mouth that started 3 days ago. The mother reports it began as small red blisters that burst and are now covered by thick, golden-yellow, "honey-colored" crusts.  The child is constantly touching the area, and similar lesions have appeared on his forearm (autoinoculation). He attends daycare where other children have had skin infections. There are no systemic symptoms like high fever or lethargy. The classic presentation of perioral papules evolving into golden crusts is diagnostic for Non-bullous Impetigo.\n\n[PATHOGENS & TYPES]\nContent: Impetigo is a highly contagious bacterial skin infection caused primarily by *Staphylococcus aureus* (80% of cases) and Group A *Streptococcus* (*S. pyogenes*). It occurs in three forms: \n1. **Non-bullous:** The most common form (70%), char

In [22]:
# Build chat-format samples (system / user / assistant) in which the user turn
# carries the patient question plus the reference cases stored in the record's
# `context` JSON, and the assistant turn is the gold diagnosis and explanation.
normalized_dataset = []
for idx, item in enumerate(training_data):
    print(f"Processing item {idx + 1}/{len(training_data)}")
    # strict=False matches the probe of this same column elsewhere in the
    # notebook: it lets the decoder accept raw control characters inside JSON
    # strings instead of raising JSONDecodeError part-way through the loop.
    reference_cases = json.loads(item['context'].strip(), strict=False)['generated_context_string']
    normalized_dataset.append(
        {
            "messages": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    # NOTE: the leading newline and the indentation inside this
                    # triple-quoted string are part of the emitted prompt text.
                    "content": f"""
                    PATIENT QUESTION:
                    {item['input']}

                    RETRIEVED REFERENCE CASES (from medical database):
                    {reference_cases}
                    Diagnose based on patient symptoms FIRST, use references to validate."""
                },
                {
                    "role": "assistant",
                    "content": f"Diagnosis: {item['disease_name']}\nExplanation: {item['reasoning']}"
                }
            ]
        }
    )

Processing item 1/117
Processing item 2/117
Processing item 3/117
Processing item 4/117
Processing item 5/117
Processing item 6/117
Processing item 7/117
Processing item 8/117
Processing item 9/117
Processing item 10/117
Processing item 11/117
Processing item 12/117
Processing item 13/117
Processing item 14/117
Processing item 15/117
Processing item 16/117
Processing item 17/117
Processing item 18/117
Processing item 19/117
Processing item 20/117
Processing item 21/117
Processing item 22/117
Processing item 23/117
Processing item 24/117
Processing item 25/117
Processing item 26/117
Processing item 27/117
Processing item 28/117
Processing item 29/117
Processing item 30/117
Processing item 31/117
Processing item 32/117
Processing item 33/117
Processing item 34/117
Processing item 35/117
Processing item 36/117
Processing item 37/117
Processing item 38/117
Processing item 39/117
Processing item 40/117
Processing item 41/117
Processing item 42/117
Processing item 43/117
Processing item 44/1

In [23]:
# Build chat-format samples for the "thinking" variant: the user turn is the
# bare patient question and the assistant turn opens with a <think>…</think>
# block, followed by the diagnosis and explanation.
normalized_dataset = []
for idx, item in enumerate(training_data):
    print(f"Processing item {idx + 1}/{len(training_data)}")
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": f"""{item['input']}"""}
    assistant_turn = {
        "role": "assistant",
        "content": f"<think>{item['thinking']}</think>\n\nDiagnosis: {item['disease_name']}\nExplanation: {item['reasoning']}",
    }
    normalized_dataset.append({"messages": [system_turn, user_turn, assistant_turn]})

Processing item 1/117
Processing item 2/117
Processing item 3/117
Processing item 4/117
Processing item 5/117
Processing item 6/117
Processing item 7/117
Processing item 8/117
Processing item 9/117
Processing item 10/117
Processing item 11/117
Processing item 12/117
Processing item 13/117
Processing item 14/117
Processing item 15/117
Processing item 16/117
Processing item 17/117
Processing item 18/117
Processing item 19/117
Processing item 20/117
Processing item 21/117
Processing item 22/117
Processing item 23/117
Processing item 24/117
Processing item 25/117
Processing item 26/117
Processing item 27/117
Processing item 28/117
Processing item 29/117
Processing item 30/117
Processing item 31/117
Processing item 32/117
Processing item 33/117
Processing item 34/117
Processing item 35/117
Processing item 36/117
Processing item 37/117
Processing item 38/117
Processing item 39/117
Processing item 40/117
Processing item 41/117
Processing item 42/117
Processing item 43/117
Processing item 44/1

In [18]:
# Number of chat samples built; one per training record.
len(normalized_dataset)

117

In [24]:
# Display the built chat samples.
# NOTE(review): this dumps the whole list; a slice such as normalized_dataset[:1] keeps the output short.
normalized_dataset

[{'messages': [{'role': 'system',
    'content': '\nYou are an expert in tropical and infectious diseases. Provide an evidence-based likely diagnosis consistent with guidance from authoritative public-health sources (e.g., WHO, CDC) when applicable. You will be provided with symptoms, and contextual information.\n\nUse evidence in this priority order:\n1. Patient’s symptoms + history (highest priority)\n2. Geography/epidemiology\n3. Retrieved RAG cases (supporting evidence only)\n\nCritical rule: Use patient symptoms (PRIMARY) + retrieved cases (SUPPORTING) to diagnose. Keep the final explanation clear and readable. When helpful, interpret lay terms into clinical terms internally, accept both clinical and lay language (e.g., “really tired” → fatigue; “muscles and joints ache” → myalgia/arthralgia).\n\nOutput requirements:\n    - Start with a single, precise disease name on the first line: "Diagnosis: <disease>"\n    - Then provide 2–4 short sentences explaining why this is most likely 

In [25]:
import json
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
file_path = '/Volumes/SandiskSSD/Downloads/training_data_for_qwen3_thinking.json'

# Write the chat samples as one UTF-8 JSON array. ensure_ascii=False keeps
# non-ASCII characters (curly quotes, arrows) as-is instead of \u escapes;
# indent=2 pretty-prints the file.
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(normalized_dataset, f, ensure_ascii=False, indent=2)

print(f"Successfully saved data to {file_path}")

Successfully saved data to /Volumes/SandiskSSD/Downloads/training_data_for_qwen3_thinking.json


## Filter and extract diagnosis from normalized dataset

In [99]:
import re

# Assistant turn (third message) of sample 3.
assistant_content = normalized_dataset[3].get('messages')[2].get('content')
# Drop the <think>…</think> block before searching, so a "Diagnosis:" that
# appears inside the reasoning text is not captured instead of the final answer.
answer_only = re.sub(r'<think>.*?</think>\n?', '', assistant_content, flags=re.DOTALL)

# Capture the rest of the line that follows "Diagnosis:".
pattern = r"Diagnosis:\s*(.*)"
match = re.search(pattern, answer_only)
# re.search returns None when the label is absent; report None instead of
# failing with AttributeError on .group().
diagnosis = match.group(1).strip() if match else None
print(diagnosis)

Viral hemorrhagic fevers


In [100]:
# Remove the <think>…</think> block from sample 3's assistant turn. The match is
# non-greedy and uses DOTALL so it can span several lines; one newline directly
# after </think> is removed as well.
clean_string = re.sub(r'<think>.*?</think>\n?', '', normalized_dataset[3].get('messages')[2].get('content'), flags=re.DOTALL)
print(clean_string)

Diagnosis: Viral hemorrhagic fevers
Explanation: Early symptoms like fever, extreme tiredness, muscle or joint aches, nausea, vomiting, and diarrhea can appear in many viral hemorrhagic fevers. When symptoms 
progress to bleeding under the skin or from the mouth, eyes, or ears — along with confusion, nervous system problems, difficulty breathing, or organ failure — it 
suggests a severe form. These illnesses are caused by viruses carried by mosquitoes, ticks, rodents, bats, or primates. They spread through insect bites, contact 
with infected animals, or exposure to infected blood, saliva, or other body fluids. Some types can also spread from person to person. Symptoms may appear 2 to 21 
days after exposure. Because you were recently in an area where these viruses are common and now have worsening symptoms, urgent medical evaluation is essential.


## OpenAI Harmony streamable parser test

In [2]:
from openai_harmony import (
    load_harmony_encoding,
    Role,
    StreamableParser,
    HarmonyEncodingName
)

# Load the GPT-OSS Harmony encoding and create a parser that consumes
# assistant-role tokens one at a time.
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
stream = StreamableParser(encoding, role=Role.ASSISTANT)

# Hard-coded token ids fed to the parser one by one.
tokens = [
    200005,35644,200008,1844,31064,25,392,4827,382,220,17,659,220,17,16842,12295,81645,
    13,51441,6052,13,200007,200006,173781,200005,17196,200008,17,659,220,17,314,220,19,
    13,200002
]

# After each token, print these parser attributes in this fixed order.
for token in tokens:
    stream.process(token)
    print("--------------------------------")
    for attr in (
        "current_role",
        "current_channel",
        "last_content_delta",
        "current_content_type",
        "current_recipient",
        "current_content",
    ):
        print(attr, getattr(stream, attr))

--------------------------------
current_role Role.ASSISTANT
current_channel None
last_content_delta None
current_content_type None
current_recipient None
current_content 
--------------------------------
current_role Role.ASSISTANT
current_channel None
last_content_delta None
current_content_type None
current_recipient None
current_content 
--------------------------------
current_role Role.ASSISTANT
current_channel analysis
last_content_delta None
current_content_type None
current_recipient None
current_content 
--------------------------------
current_role Role.ASSISTANT
current_channel analysis
last_content_delta User
current_content_type None
current_recipient None
current_content User
--------------------------------
current_role Role.ASSISTANT
current_channel analysis
last_content_delta  asks
current_content_type None
current_recipient None
current_content User asks
--------------------------------
current_role Role.ASSISTANT
current_channel analysis
last_content_delta :
current

In [116]:
from datasets import load_dataset

# Load the "train" split of the Hugging Face dataset named below.
# With split="train" the result is a single Dataset, not a DatasetDict.
dataset_name = "HuggingFaceH4/Multilingual-Thinking"
train_dataset = load_dataset(dataset_name, split="train")

# (Optional) Save a copy to a folder on disk.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
train_dataset.save_to_disk("/Users/hovietbach/Downloads/multilingual_thinking_dataset")

# (Optional) Export to CSV if you want to read it in Excel
# train_dataset.to_csv("multilingual_thinking.csv")

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 96749.95 examples/s] 


In [117]:
# Show the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['reasoning_language', 'developer', 'user', 'analysis', 'final', 'messages'],
    num_rows: 1000
})

In [118]:
# Print the Python type of the loaded split.
print(type(train_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [119]:
from datasets import load_dataset

# Load a local JSON file with the "json" builder. This rebinds `train_dataset`,
# replacing the Multilingual-Thinking split loaded earlier.
# NOTE(review): this path differs from the file written by the save cell
# (training_data_for_qwen3_thinking.json) — confirm which file is intended.
train_dataset = load_dataset("json", data_files="/Users/hovietbach/Downloads/dataset.json", split="train")

# Verify the type
print(type(train_dataset))


Generating train split: 117 examples [00:00, 871.47 examples/s]

<class 'datasets.arrow_dataset.Dataset'>





In [175]:
# Inspect the first example of the dataset loaded from the local JSON file.
train_dataset[0]

{'messages': [{'content': "You are an expert in tropical and infectious diseases. Provide an evidence-based likely diagnosis consistent with guidance from authoritative public-health sources (e.g., WHO, CDC) when applicable. You will be provided with symptoms, and contextual information such as age, gender, location, pre-existing conditions and lifestyle factors.\n\nUse evidence in this priority order:\n\nPatient’s symptoms + history (highest priority)\n\nGeography/epidemiology (where they are / recent travel / exposures)\n\nRetrieved RAG cases (supporting evidence only)\n\nCritical rule: If retrieved cases conflict with the patient’s presentation, trust the patient’s presentation.\n\nLanguage handling: Accept both clinical and lay language. When helpful, interpret lay terms into clinical terms internally (e.g., “really tired” → fatigue; “muscles and joints ache” → myalgia/arthralgia), but keep the final explanation clear and readable.\n\n## Output format (always):\nDiagnosis: [Precise