In [79]:
import pandas as pd
from openai import OpenAI
import os
import json
import re
import csv
from dotenv import load_dotenv

from tqdm.auto import tqdm

In [None]:
# Path of the env file that is loaded before reading OPENAI_API_KEY. The original
# absolute path stays the default so existing setups keep working; set
# MEDICAL_RAG_ENV_FILE to point elsewhere when running on another machine.
ENV_FILE = os.getenv('MEDICAL_RAG_ENV_FILE', '/home/ubuntu/medical_assistant_rag/.envrc')
load_dotenv(ENV_FILE)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Surface the misconfiguration here rather than at the first API call.
    print(f"OPENAI_API_KEY is not set (checked the environment and {ENV_FILE}).")

In [80]:
# Load the question/answer pairs that the rest of the notebook annotates.
df = pd.read_csv(filepath_or_buffer='./data/data_test.csv')

In [81]:
# Show the first rows together with the total row count.
(df.head(), len(df.index))

(   id                                           question  \
 0   0  A 23-year-old pregnant woman at 22 weeks gesta...   
 1   1  A 3-month-old baby died suddenly at night whil...   
 2   2  A mother brings her 3-week-old infant to the p...   
 3   3  A pulmonary autopsy specimen from a 58-year-ol...   
 4   4  A 20-year-old woman presents with menorrhagia ...   
 
                                               answer  
 0                                     Nitrofurantoin  
 1  Placing the infant in a supine position on a f...  
 2       Abnormal migration of ventral pancreatic bud  
 3                                    Thromboembolism  
 4                             Von Willebrand disease  ,
 100)

In [82]:
# One dict per CSV row (keys: 'id', 'question', 'answer').
documents = df.to_dict(orient='records')
# Preview a couple of records instead of echoing the whole list into the output.
documents[:2]

[{'id': 0,
  'question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?',
  'answer': 'Nitrofurantoin'},
 {'id': 1,
  'question': 'A 3-month-old baby died suddenly at night while asleep. His mother noticed that he had died only after she awoke in the morning. No cause of death was determined based on the autopsy. Which of the following precautions could have prevented the death of the baby?',
  'answer': 'Placing the infant in a supine position on a firm mat

## Generating Fields

In [83]:
# Allowed values for each metadata field assigned to a case description.
# "Patient Demographics" is nested: it maps each demographic attribute to its own
# list of allowed values; every other field maps directly to a list.
# NOTE(review): no cell visible in this notebook reads `fields`; the prompt inside
# generate_metadata spells out the same values by hand, so the two have to be kept
# in sync manually.
fields = {
  "Medical Department": [
    "Obstetrics & Gynecology",
    "Pediatrics",
    "Cardiology",
    "Gastroenterology",
    "Neurology",
    "Pulmonology",
    "Endocrinology",
    "Hematology",
    "Nephrology",
    "Dermatology",
    "Rheumatology",
    "Oncology",
    "Emergency Medicine",
    "Infectious Diseases",
    "Orthopedics",
    "General Medicine",
  ],
  "Condition Type": [
    "Acute",
    "Chronic",
    "Congenital",
    "Infectious",
    "Idiopathic",
    "Autoimmune",
    "Genetic",
    "Lifestyle-related",
    "Degenerative",
    "Inflammatory"
  ],
  "Patient Demographics": {
    "Age Group": [
      "Neonate (0-28 days)",
      "Infant (1-12 months)",
      "Child (1-12 years)",
      "Adolescent (13-18 years)",
      "Adult (19-64 years)",
      "Elderly (65+ years)"
    ],
    "Gender": [
      "Male",
      "Female"
    ],
    "Pregnancy Status": [
      "Pregnant",
      "Not Pregnant"
    ],
    "Special Status": [
      "Postpartum",
      "Immunocompromised",
      "Comorbidities",
      "Athlete"
    ]
  },
  "Common Symptoms": [
    "Pain (abdominal, chest, joint, etc.)",
    "Fever",
    "Nausea/Vomiting",
    "Cough",
    "Dyspnea (Shortness of breath)",
    "Edema (Swelling)",
    "Headache",
    "Fatigue",
    "Dizziness/Lightheadedness",
    "Burning sensation (e.g., urination)",
    "Palpitations",
    "Rash",
    "Bleeding (e.g., menorrhagia)",
    "Weight Loss/Gain",
    "Changes in Vision",
    "Neurological Deficit (e.g., numbness, weakness)",
    "Altered Mental Status"
  ],
  "Treatment or Management": [
    "Medication",
    "Surgery",
    "Lifestyle Changes",
    "Physical Therapy",
    "Dietary Modification",
    "Immunotherapy",
    "Hormonal Therapy",
    "Supportive Care (e.g., oxygen therapy)",
    "Preventive Measures (e.g., vaccinations)",
    "Observation/Monitoring",
    "Radiation Therapy",
    "Chemotherapy",
    "Psychological Counseling",
    "Rehabilitation"
  ],
  "Severity": [
    "Mild",
    "Moderate",
    "Severe",
    "Life-threatening",
    "Stable",
    "Unstable"
  ]
}


In [85]:
# Shared API client used by generate_metadata. No api_key argument is passed here;
# presumably the SDK falls back to the OPENAI_API_KEY environment variable —
# TODO confirm against the OpenAI SDK documentation.
client = OpenAI()

In [86]:
def extract_json(response_text):
    """
    Extract the JSON object embedded in the model's response.

    The text is scanned from its first ``{``. If a complete, valid JSON value
    starts there, exactly that substring is returned, so trailing prose or
    code-fence markers (even ones containing ``}``) are not swallowed. If it
    does not decode, the function falls back to the widest ``{...}`` span so
    the caller can still attempt (and report) the parse itself.

    Args:
        response_text: Raw text returned by the model. May be ``None`` or a
            non-string, in which case nothing can be extracted.

    Returns:
        The JSON substring, or ``None`` when ``response_text`` is not a string
        or contains no ``{...}`` span.
    """
    if not isinstance(response_text, str):
        # re.search would raise TypeError on None or other non-string content.
        return None

    start = response_text.find('{')
    if start == -1:
        return None

    try:
        # raw_decode stops at the end of the first JSON value and ignores the rest.
        _, end = json.JSONDecoder().raw_decode(response_text, start)
        return response_text[start:end]
    except json.JSONDecodeError:
        pass

    # Fallback: greedy brace match, first '{' through last '}'.
    match = re.search(r'\{.*\}', response_text, re.DOTALL)
    return match.group() if match else None

In [87]:
def generate_metadata(doc):
    """
    Generate metadata for each question-answer pair.

    Builds a classification prompt from ``doc['question']``, sends it to the
    chat completions endpoint and returns the JSON text found in the reply.

    Args:
        doc: Mapping describing one document. ``'question'`` is required;
            ``'id'`` is only used in diagnostic messages.

    Returns:
        The JSON object from the model's reply as a string (not yet parsed),
        or ``None`` when the document has no ``'question'`` key or no JSON
        could be extracted from the reply.
    """
    prompt_template = """
Given the following case description, please provide the appropriate values for these fields. Strictly use the values from each field. Do not generate a new value for any of these fields, and respond only in the JSON format shown below:

1. Medical Department: Obstetrics & Gynecology, Pediatrics, Cardiology, Gastroenterology, Neurology, Pulmonology, Endocrinology, Hematology, Nephrology, Dermatology, Rheumatology, Oncology, Emergency Medicine, Infectious Diseases, Orthopedics, General Medicine
2. Condition Type: Acute, Chronic, Congenital, Infectious, Idiopathic, Autoimmune, Genetic, Lifestyle-related, Degenerative, Inflammatory
3. Patient Demographics:
   - Age Group: Neonate (0-28 days), Infant (1-12 months), Child (1-12 years), Adolescent (13-18 years), Adult (19-64 years), Elderly (65+ years)
   - Gender: Male, Female
   - Pregnancy Status: Pregnant, Not Pregnant
   - Special Status: Postpartum, Immunocompromised, Comorbidities, Athlete
4. Common Symptoms: Pain (abdominal, chest, joint, etc.), Fever, Nausea/Vomiting, Cough, Dyspnea (Shortness of breath), Edema (Swelling), Headache, Fatigue, Dizziness/Lightheadedness, Burning sensation (e.g., urination), Palpitations, Rash, Bleeding (e.g., menorrhagia), Weight Loss/Gain, Changes in Vision, Neurological Deficit (e.g., numbness, weakness), Altered Mental Status
5. Treatment or Management: Medication, Surgery, Lifestyle Changes, Physical Therapy, Dietary Modification, Immunotherapy, Hormonal Therapy, Supportive Care (e.g., oxygen therapy), Preventive Measures (e.g., vaccinations), Observation/Monitoring, Radiation Therapy, Chemotherapy, Psychological Counseling, Rehabilitation
6. Severity: Mild, Moderate, Severe, Life-threatening, Stable, Unstable

Case Description: {description}

Please provide the responses in the following JSON format without any additional text:

{{
  "Medical Department": "value",
  "Condition Type": "value",
  "Patient Demographics": {{
    "Age Group": "value",
    "Gender": "value",
    "Pregnancy Status": "value",
    "Special Status": "value"
  }},
  "Common Symptoms": ["value1", "value2"],
  "Treatment or Management": "value",
  "Severity": "value"
}}
    """

    if 'question' not in doc:
        print(f"Document {doc.get('id', 'unknown')} is missing the 'question' key.")
        return None

    # Doubled braces in the template are literal; only {description} is substituted.
    prompt = prompt_template.format(description=doc['question'])

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # Labelling task: keep the output as repeatable as the API allows.
        temperature=0,
        # JSON mode: ask the API for a syntactically valid JSON object
        # (the prompt already mentions JSON, which this mode requires).
        response_format={"type": "json_object"},
    )

    json_response = response.choices[0].message.content
    # The message content can be empty/None; skip extraction in that case so
    # it is reported below instead of raising inside the regex search.
    extracted_json = extract_json(json_response) if json_response else None

    if extracted_json:
        return extracted_json

    print(f"Failed to extract JSON content for document ID {doc.get('id', 'unknown')}")
    return None

In [88]:
# Maps document id -> metadata record. The annotation loop skips ids that are
# already present, so re-running this cell discards that progress.
results = {}

In [89]:
# Fields the prompt asks the model to return for every case description.
expected_fields = ["Medical Department", "Condition Type", "Patient Demographics",
                   "Common Symptoms", "Treatment or Management", "Severity"]

for doc in tqdm(documents):
    doc_id = doc.get('id', 'unknown')
    if doc_id in results:
        # Already annotated by an earlier run of this cell.
        continue

    metadata_raw = generate_metadata(doc)
    if metadata_raw is None:
        # Skip to the next document if metadata generation failed
        continue

    try:
        metadata = json.loads(metadata_raw)
    except json.JSONDecodeError:
        # Do not store a placeholder: an entry in `results` would stop this
        # document from being retried on the next run and would end up as a
        # blank row in the exported CSV.
        print(f"Failed to parse JSON for document ID {doc_id}")
        continue

    if not isinstance(metadata, dict):
        print(f"Failed to parse JSON for document ID {doc_id}")
        continue

    # Check the model's answer itself (not the record built below, which always
    # has every key); missing fields are reported and then filled with defaults.
    missing_fields = [key for key in expected_fields if key not in metadata]
    if missing_fields:
        print(f"Missing fields in JSON for document ID {doc_id}: {missing_fields}")

    # Flatten 'Patient Demographics' into "key: value" text, dropping empty
    # values (JSON null or the literal string 'None').
    demographics = metadata.get('Patient Demographics') or {}
    if isinstance(demographics, dict):
        demographics_text = ', '.join(
            f"{key}: {value}"
            for key, value in demographics.items()
            if value is not None and value != 'None'
        )
    else:
        # The model returned a scalar instead of the nested object.
        demographics_text = str(demographics)

    # Flatten 'Common Symptoms' into a comma-separated string. A bare string
    # is kept as-is; joining it would split it into single characters.
    common_symptoms = metadata.get('Common Symptoms') or []
    if isinstance(common_symptoms, list):
        common_symptoms_text = ', '.join(str(symptom) for symptom in common_symptoms)
    else:
        common_symptoms_text = str(common_symptoms)

    # Key order here determines the column order of the resulting DataFrame.
    results[doc_id] = {
        'id': doc_id,
        'question': doc['question'],
        'answer': doc['answer'],
        'Medical Department': metadata.get('Medical Department', 'Unknown'),
        'Condition Type': metadata.get('Condition Type', 'Unknown'),
        'Patient Demographics': demographics_text,
        'Common Symptoms': common_symptoms_text,
        'Treatment or Management': metadata.get('Treatment or Management', 'Unknown'),
        'Severity': metadata.get('Severity', 'Unknown')
    }

  0%|          | 0/100 [00:00<?, ?it/s]

In [94]:
# Keep only documents that were actually annotated. An empty dict in `results`
# marks a failed attempt and would otherwise become an all-NaN row, written to
# the CSV as the literal text "nan".
final_results = [record for record in results.values() if record]

# Column names and order of the exported CSV.
headers = ["id", "question", "answer", "medical_department", "condition_type",
           "patient_demographics", "common_symptoms", "treatment_or_management", "severity"]

# Title-case keys produced by the annotation loop -> snake_case CSV columns.
column_mapping = {
    "Medical Department": "medical_department",
    "Condition Type": "condition_type",
    "Patient Demographics": "patient_demographics",
    "Common Symptoms": "common_symptoms",
    "Treatment or Management": "treatment_or_management",
    "Severity": "severity"
}

df_fields = (
    pd.DataFrame(final_results)
    .rename(columns=column_mapping)
    # Pin the column set and order to `headers`, so the header row written below
    # always lines up with the data (also when `final_results` is empty).
    .reindex(columns=headers)
    # Everything except the numeric id is exported as text.
    .astype({col: str for col in headers if col != 'id'})
)

# Header row is written with csv.writer (unquoted) and the rows with
# QUOTE_NONNUMERIC, in a single pass over one file handle. UTF-8 is set
# explicitly because the questions contain non-ASCII characters (e.g. "°").
with open('./data/data_metadata_small.csv', 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(headers)
    df_fields.to_csv(file, index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC, header=False)

In [91]:
# Preview the first rows of the annotated frame.
# NOTE(review): the saved output below shows the Title Case column names and an
# "Unnamed: 0" column, which does not match df_fields after the rename in the
# export cell — it looks stale; re-run to refresh.
df_fields.head()

Unnamed: 0,id,question,answer,Medical Department,Condition Type,Patient Demographics,Common Symptoms,Treatment or Management,Severity
0,0,A 23-year-old pregnant woman at 22 weeks gesta...,Nitrofurantoin,Obstetrics & Gynecology,Infectious,"Age Group: Adult, Gender: Female, Pregnancy St...","Burning sensation (e.g., urination)",Medication,Mild
1,1,A 3-month-old baby died suddenly at night whil...,Placing the infant in a supine position on a f...,Pediatrics,Idiopathic,"Age Group: Infant (1-12 months), Gender: Male,...","Fever, Altered Mental Status","Preventive Measures (e.g., vaccinations)",Life-threatening
2,2,A mother brings her 3-week-old infant to the p...,Abnormal migration of ventral pancreatic bud,Pediatrics,Infectious,"Age Group: Neonate (0-28 days), Gender: Male, ...","Fussiness, Nausea/Vomiting",Observation/Monitoring,Moderate
3,3,A pulmonary autopsy specimen from a 58-year-ol...,Thromboembolism,Pulmonology,Acute,"Age Group: Adult, Gender: Female, Pregnancy St...","Dyspnea (Shortness of breath), Fatigue","Supportive Care (e.g., oxygen therapy)",Life-threatening
4,4,A 20-year-old woman presents with menorrhagia ...,Von Willebrand disease,Obstetrics & Gynecology,Chronic,"Age Group: Adult, Gender: Female, Pregnancy St...","Bleeding (e.g., menorrhagia), Easy bruising",Medication,Moderate
