In [2]:

# Set up dependencies
%pip install openai datasets pandas tqdm

Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import json
import random
from openai import OpenAI
from datasets import Dataset
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Define comprehensive list of FHIR resources
FHIR_RESOURCES = [
    "Patient", "Practitioner", "Organization", "Location", "Device",
    "Observation", "DiagnosticReport", "Condition", "Procedure", "MedicationRequest",
    "MedicationAdministration", "MedicationDispense", "MedicationStatement", "Immunization",
    "AllergyIntolerance", "Encounter", "Appointment", "AppointmentResponse",
    "CarePlan", "CareTeam", "Goal", "ServiceRequest", "Task",
    "DocumentReference", "Binary", "Media", "Composition", "DiagnosticReport",
    "Specimen", "ImagingStudy", "QuestionnaireResponse", "Questionnaire",
    "Coverage", "ExplanationOfBenefit", "Claim", "ClaimResponse", "PaymentNotice",
    "Invoice", "Account", "ChargeItem", "Contract", "EnrollmentRequest",
    "VisionPrescription", "NutritionOrder", "SupplyRequest", "SupplyDelivery",
    "BiologicallyDerivedProduct", "ResearchStudy", "ResearchSubject", "ActivityDefinition",
    "PlanDefinition", "Measure", "MeasureReport", "Library", "GuidanceResponse",
    "RiskAssessment", "DetectedIssue", "ClinicalImpression", "FamilyMemberHistory",
    "Group", "Person", "RelatedPerson", "Endpoint", "HealthcareService",
    "InsurancePlan", "SubstanceSpecification", "MolecularSequence", "ImmunizationEvaluation",
    "ImmunizationRecommendation", "AdverseEvent", "Flag", "List", "Linkage",
    "AuditEvent", "Provenance", "Consent", "Communication", "CommunicationRequest",
    "DeviceRequest", "DeviceUseStatement", "DeviceMetric", "Substance", "Medication",
    "MedicinalProduct", "Schedule", "Slot",
    "Bundle", "MessageHeader", "MessageDefinition", "EventDefinition", "ObservationDefinition"
]

In [21]:
import asyncio

async def generate_fhir_statement_async(resource_type):
    """Generate a short healthcare statement about a FHIR resource (async version)"""
    prompt = f"""Generate a single, concise statement describing a healthcare scenario, object, or event that could be represented by a {resource_type} FHIR resource.
    
    The statement should:
    - Be medically accurate and realistic
    - Be between 10-25 words
    - Use everyday language that healthcare professionals would understand
    - Focus on a specific clinical scenario or use case
    - NOT use the word "FHIR" in the sentence
    - NOT directly reference the canonical name of the resource type in the sentence (i.e. patient is okay but Patient is not)
    
    Examples for other resources:
    - Patient: "A patient John Smith was admitted to the emergency department with chest pain complaints."
    - Observation: "Blood pressure observation shows elevated systolic reading of 180 mmHg during routine check-up."
    - Medication: "Prescribed medication includes daily 10mg lisinopril for hypertension management."
    
    Generate one statement for {resource_type}:"""
    
    try:
        # Run the synchronous OpenAI call in a thread pool
        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.chat.completions.create(
                model="gpt-4.1-mini",  # Fixed model name
                messages=[{"role": "user", "content": prompt}],
                max_tokens=100,
                temperature=0.8
            )
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating statement for {resource_type}: {e}")
        return None

async def create_synthetic_dataset_parallel(target_count=1000, batch_size=10):
    """Create synthetic FHIR dataset with parallel processing"""
    
    data = []
    failed_generations = 0
    
    print(f"Generating {target_count} synthetic FHIR healthcare statements with {batch_size} parallel requests...")
    
    # Calculate how many examples per resource type
    examples_per_resource = target_count // len(FHIR_RESOURCES)
    remaining_examples = target_count % len(FHIR_RESOURCES)
    
    # Create list of all tasks
    tasks = []
    for i, resource_type in enumerate(FHIR_RESOURCES):
        count_for_this_resource = examples_per_resource + (1 if i < remaining_examples else 0)
        tasks.extend([resource_type] * count_for_this_resource)
    
    # Process in batches
    with tqdm(total=len(tasks), desc="Generating statements") as pbar:
        for i in range(0, len(tasks), batch_size):
            batch = tasks[i:i + batch_size]
            
            # Create async tasks for this batch
            batch_tasks = [generate_fhir_statement_async(resource_type) for resource_type in batch]
            
            # Run batch concurrently
            results = await asyncio.gather(*batch_tasks)
            
            # Process results
            for j, statement in enumerate(results):
                resource_type = batch[j]
                if statement:
                    data.append({
                        "statement": statement,
                        "resource_type": resource_type,
                        "id": len(data)
                    })
                else:
                    failed_generations += 1
                    fallback = f"Healthcare record contains {resource_type} information for patient care documentation."
                    data.append({
                        "statement": fallback,
                        "resource_type": resource_type,
                        "id": len(data)
                    })
            
            pbar.update(len(batch))
    
    print(f"Generated {len(data)} statements successfully")
    print(f"Failed generations (used fallbacks): {failed_generations}")
    
    return data

# Note: In Jupyter, we'll call the async function directly with await


In [22]:
# Generate the dataset using the async parallel function
print("Creating synthetic FHIR dataset...")
synthetic_data = await create_synthetic_dataset_parallel(1000, batch_size=5)

# Convert to pandas DataFrame for easier manipulation
df = pd.DataFrame(synthetic_data)
print(f"Generated {len(df)} examples")
print("\nSample data:")
print(df.head())

# Create Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(f"\nCreated Hugging Face dataset with {len(dataset)} examples")

# Save the dataset
dataset.save_to_disk("./fhir_synthetic_dataset")
print("Dataset saved to './fhir_synthetic_dataset'")

# Also save as JSON for easy inspection
with open("fhir_synthetic_data.json", "w") as f:
    json.dump(synthetic_data, f, indent=2)
print("Dataset also saved as 'fhir_synthetic_data.json'")

# Show statistics
resource_counts = df['resource_type'].value_counts()
print(f"\nDataset statistics:")
print(f"Total examples: {len(df)}")
print(f"Unique resource types: {len(resource_counts)}")
print(f"Examples per resource type (top 10):")
print(resource_counts.head(10))


Creating synthetic FHIR dataset...
Generating 1000 synthetic FHIR healthcare statements with 5 parallel requests...


Generating statements: 100%|██████████| 1000/1000 [03:55<00:00,  4.24it/s]


Generated 1000 statements successfully
Failed generations (used fallbacks): 0
Generated 1000 examples

Sample data:
                                           statement resource_type  id
0  A 45-year-old female with a history of diabete...       Patient   0
1  A 45-year-old female with diabetes is register...       Patient   1
2  A 45-year-old female with a history of diabete...       Patient   2
3  A 45-year-old female with diabetes and hyperte...       Patient   3
4  A 45-year-old female with a history of diabete...       Patient   4

Created Hugging Face dataset with 1000 examples


Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 338769.40 examples/s]

Dataset saved to './fhir_synthetic_dataset'
Dataset also saved as 'fhir_synthetic_data.json'

Dataset statistics:
Total examples: 1000
Unique resource types: 90
Examples per resource type (top 10):
resource_type
DiagnosticReport          22
Patient                   11
FamilyMemberHistory       11
ImmunizationEvaluation    11
MolecularSequence         11
SubstanceSpecification    11
InsurancePlan             11
HealthcareService         11
Endpoint                  11
RelatedPerson             11
Name: count, dtype: int64



