In [0]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Load the raw FHIR bundle table; only its `entry` and `type` columns are
# referenced below.
df_bundle = spark.read.format("delta").table("synthea.raw.fhir_bundle")

# Deliberately minimal schemas: each one declares only the single field that is
# read at that nesting level (`resource` as a string, then `resourceType`).
entry_json_schema = StructType().add("resource", StringType(), False)
resource_json_schema_common = StructType().add("resourceType", StringType(), False)

# Stage 1: one output row per element of the `entry` array.
bundle_entries = df_bundle.withColumn("entry_element", F.explode("entry"))

# Stage 2: parse each entry and keep its `resource` payload as a JSON string.
entry_struct = F.from_json(F.col("entry_element"), entry_json_schema)
bundle_resources = bundle_entries.withColumn(
    "entry_element_parsed", entry_struct
).withColumn("resource", F.col("entry_element_parsed.resource"))

# Stage 3: parse the resource just far enough to learn its type, then drop the
# bundle-level columns and every intermediate helper column.
columns_to_drop = [
    "type",
    "entry",
    "entry_element",
    "entry_element_parsed",
    "resource_parsed",
]
resource_struct = F.from_json(F.col("resource"), resource_json_schema_common)
df_bundle_parsed = (
    bundle_resources.withColumn("resource_parsed", resource_struct)
    .withColumn("resourceType", F.col("resource_parsed.resourceType"))
    .drop(*columns_to_drop)
)

df_bundle_parsed.show()

+--------------------+--------------------+
|        resourceType|            resource|
+--------------------+--------------------+
|             Patient|{"resourceType":"...|
|        Organization|{"resourceType":"...|
|        Practitioner|{"resourceType":"...|
|           Encounter|{"resourceType":"...|
|           Condition|{"resourceType":"...|
|   MedicationRequest|{"resourceType":"...|
|               Claim|{"resourceType":"...|
|               Claim|{"resourceType":"...|
|ExplanationOfBenefit|{"resourceType":"...|
|           Encounter|{"resourceType":"...|
|           Condition|{"resourceType":"...|
|   MedicationRequest|{"resourceType":"...|
|               Claim|{"resourceType":"...|
|   MedicationRequest|{"resourceType":"...|
|               Claim|{"resourceType":"...|
|               Claim|{"resourceType":"...|
|ExplanationOfBenefit|{"resourceType":"...|
|        Organization|{"resourceType":"...|
|        Practitioner|{"resourceType":"...|
|           Encounter|{"resource

In [0]:
def _coded_concept_field(field_name):
    """Build a nullable struct field wrapping a `coding` array of (system, code).

    A fresh StructField is created on every call so that the resulting schemas
    never share type objects with one another.
    """
    coding_entry = StructType(
        [
            StructField("system", StringType(), False),
            StructField("code", StringType(), False),
        ]
    )
    concept = StructType([StructField("coding", ArrayType(coding_entry), True)])
    return StructField(field_name, concept, True)


def _subject_reference_field():
    """Build the nullable `subject` struct field holding one `reference` string."""
    reference_only = StructType([StructField("reference", StringType(), False)])
    return StructField("subject", reference_only, True)


# Patient: identifier plus two demographic attributes, all kept as strings.
fhir_patient_schema = StructType(
    [
        StructField("id", StringType(), False),
        StructField("gender", StringType(), True),
        StructField("birthDate", StringType(), True),
    ]
)

# Observation: coded concept, subject reference, numeric value with unit, and
# the two alternative "effective" timing fields.
# NOTE(review): `effectivePeriod` is declared as a plain string here; in FHIR it
# is presumably a Period object (start/end) — confirm this is intended.
fhir_observation_schema = StructType(
    [
        StructField("id", StringType(), False),
        _coded_concept_field("code"),
        _subject_reference_field(),
        StructField(
            "valueQuantity",
            StructType(
                [
                    StructField("value", FloatType(), True),
                    StructField("unit", StringType(), True),
                ]
            ),
            True,
        ),
        StructField("effectivePeriod", StringType(), True),
        StructField("effectiveDateTime", StringType(), True),
    ]
)

# MedicationRequest: subject reference, coded medication and authoring date.
fhir_medicationrequest_schema = StructType(
    [
        StructField("id", StringType(), False),
        _subject_reference_field(),
        _coded_concept_field("medicationCodeableConcept"),
        StructField("authoredOn", StringType(), True),
    ]
)

# Condition: subject reference, coded condition and onset date.
# NOTE(review): unlike the three schemas above, this one declares no `id`
# field — confirm the omission is intentional.
fhir_condition_schema = StructType(
    [
        _subject_reference_field(),
        _coded_concept_field("code"),
        StructField("onsetDateTime", StringType(), True),
    ]
)

# Lookup from FHIR resourceType value to the schema used to parse that resource.
fhir_resource_type_to_schema = dict(
    Patient=fhir_patient_schema,
    Observation=fhir_observation_schema,
    MedicationRequest=fhir_medicationrequest_schema,
    Condition=fhir_condition_schema,
)

fhir_resource_types = fhir_resource_type_to_schema.keys()

# For each supported resource type: keep only its rows, parse the resource JSON
# with the matching schema, flatten the parsed struct into top-level columns and
# append the result to the table `synthea.fhir.<lower-cased resource type>`.
# NOTE(review): the write uses mode("append") on rows derived from the whole of
# df_bundle_parsed, so re-running this cell adds the same rows again — confirm
# that is the intended behavior.
for fhir_resource_type, resource_schema in fhir_resource_type_to_schema.items():
    is_current_type = F.col("resourceType") == fhir_resource_type
    parsed_resource = F.from_json(F.col("resource"), resource_schema)

    df_fhir_resource = (
        df_bundle_parsed.filter(is_current_type)
        .withColumn("resource_parsed", parsed_resource)
        .select("resource_parsed.*")
    )

    delta_table_name = f"synthea.fhir.{fhir_resource_type.lower()}"
    (
        df_fhir_resource.write.mode("append")
        .format("delta")
        .saveAsTable(delta_table_name)
    )


# Preview each of the four FHIR tables, in the same order as before.
for fhir_table_name in (
    "synthea.fhir.patient",
    "synthea.fhir.observation",
    "synthea.fhir.medicationrequest",
    "synthea.fhir.condition",
):
    spark.read.format("delta").table(fhir_table_name).show()

+--------------------+------+----------+
|                  id|gender| birthDate|
+--------------------+------+----------+
|001bf5aa-89a9-4db...|  male|2013-01-25|
|0076c218-1d8d-41a...|  male|2009-05-18|
|00a2421c-80c4-444...|  male|2012-02-06|
|00d4f791-d903-490...|female|2012-11-14|
|00d6d2b3-ed74-446...|female|2011-08-14|
|00ee5ffc-06ca-43f...|  male|2010-08-23|
|00f84f11-7ddf-4ab...|  male|2010-07-23|
|01c31137-599b-462...|  male|2012-06-08|
|01d5cc50-3fc2-432...|female|2014-09-26|
|02164be8-33fb-4fd...|female|2012-01-05|
|02916a8b-caec-4d3...|  male|2011-11-22|
|03ac1f8c-0f3d-438...|female|2015-12-20|
|03b25307-b7d2-488...|  male|2011-02-07|
|03bf37b6-bf38-427...|female|2010-07-25|
|03c796d7-373d-47f...|  male|2014-02-16|
|03cbebc3-0abb-492...|  male|2009-05-14|
|0438df3d-d086-46e...|female|2011-02-11|
|044d12e2-a9c9-497...|female|2009-10-26|
|049c880d-2651-484...|  male|2014-04-22|
|06405f78-8d04-443...|female|2013-01-03|
+--------------------+------+----------+
only showing top