# 0-project-init

In [1]:
# Load the `sparksql_magic` IPython extension. The commented-out `%%sparksql`
# cells in the "Testing" section presumably depend on it — TODO confirm.
%load_ext sparksql_magic

In [2]:
import os
import pandas as pd
import json

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Folder holding the extracted NDJSON files; the readers further down build
# their paths from this constant.
DATA_BUCKET = "../data_bucket/physionet_extract"

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed inventory identical from one run (or machine) to the next.
for item in sorted(os.listdir(DATA_BUCKET)):
    print(item)

MimicEncounter.ndjson
MimicEncounterED.ndjson
MimicEncounterICU.ndjson


In [4]:
# Obtain the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("FhirDataApplication")
    .getOrCreate()
)

# 1-initial-data-analysis

## Encounter Domain

### a. Read Encounter Data with Infer Schema
- There are three types of Encounter datasets: `MimicEncounter`, `MimicEncounterED` & `MimicEncounterICU`
- Check which columns are common across the Encounter datasets

In [5]:
# Schema-inferring load: one DataFrame per Encounter NDJSON file.
df_MimicEncounter, df_MimicEncounterED, df_MimicEncounterICU = (
    spark.read.json(f"{DATA_BUCKET}/{dataset_name}.ndjson")
    for dataset_name in ("MimicEncounter", "MimicEncounterED", "MimicEncounterICU")
)

In [6]:
def recursive_schema_scan(schema, parent = ""):
    """Flatten a schema dict (the shape produced by ``StructType.jsonValue()``) into leaf paths.

    A path is the chain of field names from the root to a leaf, each name
    prefixed with ``___`` (so every returned path starts with ``___``).
    Arrays are transparent: an array field contributes the paths of its
    element type, and arrays nested inside arrays are unwrapped the same way.

    Args:
        schema: dict with a ``'fields'`` list; every entry has ``'name'`` and
            ``'type'`` (a string for simple types, a dict for complex ones).
        parent: path prefix accumulated during recursion; callers normally
            leave it at the default.

    Returns:
        list[str]: leaf paths, in schema order.

    Raises:
        NotImplementedError: when a field (or array element) is a complex type
            other than struct/array, e.g. ``map``.
    """
    paths = []
    for field in schema['fields']:
        path = f"{parent}___{field['name']}"
        field_type = field['type']

        # Peel off every array wrapper so that array<array<...>> is handled
        # instead of failing on a missing 'fields' key.
        while isinstance(field_type, dict) and field_type.get('type') == 'array':
            field_type = field_type.get('elementType')

        if not isinstance(field_type, dict):
            # Simple type (represented as a plain string): this is a leaf.
            paths.append(path)
        elif field_type.get('type') == 'struct':
            paths.extend(recursive_schema_scan(field_type, path))
        else:
            # Still unsupported, but now reported with enough context to act on.
            raise NotImplementedError(
                f"Unsupported type {field_type.get('type')!r} for field {path!r}"
            )
    return paths

# Flatten the inferred schema of each Encounter source into (source, column)
# rows. The [3:] slice drops the leading "___" that recursive_schema_scan puts
# in front of every path.
result = [
    (source_label, column_path[3:])
    for source_label, source_df in (
        ("MimicEncounter", df_MimicEncounter),
        ("MimicEncounterED", df_MimicEncounterED),
        ("MimicEncounterICU", df_MimicEncounterICU),
    )
    for column_path in recursive_schema_scan(source_df.schema.jsonValue())
]

df_encounter_columns = spark.createDataFrame(result, schema=["source", "column"])
df_encounter_columns.createOrReplaceTempView('df_encounter_columns')

# For each column, list the sources that contain it. collect_list() gives no
# ordering guarantee after a shuffle, so sort_array() keeps the displayed
# `sources` value stable across runs.
spark.sql("""
    SELECT
        column, sort_array(collect_list(source)) sources, count(*) num_of_sources
    FROM df_encounter_columns
    GROUP BY 1
    ORDER BY 3 DESC, 1 ASC
""").show(100, truncate = False)

+--------------------------------------------------------+-----------------------------------------------------+--------------+
|column                                                  |sources                                              |num_of_sources|
+--------------------------------------------------------+-----------------------------------------------------+--------------+
|class___code                                            |[MimicEncounter, MimicEncounterED, MimicEncounterICU]|3             |
|class___system                                          |[MimicEncounter, MimicEncounterED, MimicEncounterICU]|3             |
|id                                                      |[MimicEncounter, MimicEncounterED, MimicEncounterICU]|3             |
|identifier___system                                     |[MimicEncounter, MimicEncounterED, MimicEncounterICU]|3             |
|identifier___value                                      |[MimicEncounter, MimicEncounterED, MimicEncoun

### b. Read Encounter Dataset with FHIR Schema

In [7]:
# Parse the Encounter schema definition from disk, then build the Spark
# StructType from the parsed JSON.
with open("./schema/Encounter.json") as schema_file:
    schema_read = json.load(schema_file)
encounter_schema = T.StructType.fromJson(schema_read)

In [8]:
# Re-read every source with the explicit Encounter schema and tag each row
# with the name of the dataset it was read from.
df_MimicEncounter = (
    spark.read
    .schema(encounter_schema)
    .json(f"{DATA_BUCKET}/MimicEncounter.ndjson")
    .withColumn("source_name", F.lit("MimicEncounter"))
)
df_MimicEncounterED = (
    spark.read
    .schema(encounter_schema)
    .json(f"{DATA_BUCKET}/MimicEncounterED.ndjson")
    .withColumn("source_name", F.lit("MimicEncounterED"))
)
df_MimicEncounterICU = (
    spark.read
    .schema(encounter_schema)
    .json(f"{DATA_BUCKET}/MimicEncounterICU.ndjson")
    .withColumn("source_name", F.lit("MimicEncounterICU"))
)

# Stack the three sources into one frame and register it for Spark SQL.
df_encounter_domain = (
    df_MimicEncounter
    .unionAll(df_MimicEncounterED)
    .unionAll(df_MimicEncounterICU)
)
df_encounter_domain.createOrReplaceTempView("df_encounter_domain")

### c. Encounter Class Distribution

In [9]:
# Number of encounters per (source dataset, encounter class code), ordered by
# source and then class.
encounter_class_distribution_sql = """
-- Encounter Class of Each DF
-- https://hl7.org/fhir/R4/v3/ActEncounterCode/vs.html
SELECT
    source_name,
    class.code encounter_class,
    count(1) cnt
FROM df_encounter_domain GROUP BY 1, 2
ORDER BY 1,2
"""
df_encounter_class_distribution = spark.sql(encounter_class_distribution_sql)

In [10]:
# Bring the aggregated distribution into pandas so Plotly can consume it.
df_pandas_encounter_class_distribution = df_encounter_class_distribution.toPandas()

# Sunburst: inner ring = source dataset, outer ring = encounter class,
# wedge size = row count; each wedge shows its label and value.
fig_sunburst_encounter_class_distribution = px.sunburst(
    df_pandas_encounter_class_distribution,
    path=['source_name', 'encounter_class'],
    values='cnt',
)
fig_sunburst_encounter_class_distribution.update_traces(textinfo='label+value')

# Table trace showing the same frame, one table column per DataFrame column.
table_columns = list(df_pandas_encounter_class_distribution.columns)
table_encounter_class_distribution = go.Table(
    header={"values": table_columns, "fill_color": 'paleturquoise', "align": 'left'},
    cells={
        "values": [df_pandas_encounter_class_distribution[column] for column in table_columns],
        "align": 'left',
    },
)

fig_table_encounter_class_distribution = go.Figure(data=[table_encounter_class_distribution])

In [11]:
# One-row layout: sunburst in the wider left column, data table on the right.
fig_subplot_encounter_class_distribution = make_subplots(
    rows=1,
    cols=2,
    column_widths=[0.6, 0.4],
    subplot_titles=("", "Data Table"),
    specs=[[dict(type="sunburst"), dict(type="table")]],
)

# Reuse the first trace of each stand-alone figure in the combined figure.
sunburst_trace = fig_sunburst_encounter_class_distribution.data[0]
table_trace = fig_table_encounter_class_distribution.data[0]
fig_subplot_encounter_class_distribution.add_trace(sunburst_trace, row=1, col=1)
fig_subplot_encounter_class_distribution.add_trace(table_trace, row=1, col=2)

fig_subplot_encounter_class_distribution.update_layout(height=600, title_text="Encounter Class Distribution by Source")

fig_subplot_encounter_class_distribution.show()

### d. Encounter Class Transition

In [12]:
# Count parent -> child encounter class transitions.
#   * `encounter` CTE: one row per encounter with its class code, a parent id
#     obtained by removing "Encounter/" from partOf.reference, and the period
#     start/end cast to dates.
#   * `encounter_parent_child_transition` CTE: every encounter that has a parent
#     id, LEFT JOINed to that parent; a child whose parent id matches no row
#     keeps NULL parent columns. The period/duration columns computed here are
#     not used by the final SELECT.
#   * Final SELECT: row count per (parent class code, child class code),
#     largest count first.
df_encounter_class_transition = spark.sql("""
WITH encounter AS (
    SELECT
        id,
        class.code,
        replace(partOf.reference, "Encounter/", "") parent_id,
        CAST(period.start AS Date) period_start,
        CAST(period.end AS Date)   period_end
    FROM df_encounter_domain
),
encounter_parent_child_transition AS (
    SELECT
        parent_encounter.id              AS parent_encounter_id,
        parent_encounter.code            AS parent_encounter_code,
        parent_encounter.period_start    AS parent_encounter_period_start,
        parent_encounter.period_end      AS parent_encounter_period_end,
        (parent_encounter.period_end - parent_encounter.period_start) parent_encounter_duration,
        child_encounter.id               AS child_encounter_id,
        child_encounter.code             AS child_encounter_code,
        child_encounter.period_start     AS child_encounter_period_start,
        child_encounter.period_end       AS child_encounter_period_end,
        (child_encounter.period_end - child_encounter.period_start) child_encounter_duration,
        (parent_encounter.period_end - child_encounter.period_start) duration_between_parent_child
    FROM encounter child_encounter
    LEFT JOIN encounter parent_encounter
    ON child_encounter.parent_id = parent_encounter.id
    WHERE child_encounter.parent_id IS NOT NULL
)
-- SELECT * FROPM encounter_parent_child_transition
SELECT
    parent_encounter_code,
    child_encounter_code,
    count(*) cnt
FROM encounter_parent_child_transition
GROUP BY ALL
ORDER BY 3 DESC
""")

In [13]:
# Bring the aggregated transition rows to the driver.
df_collect_encounter_class_transition = df_encounter_class_transition.collect()

# Parallel lists: link i goes from source[i] to target[i] with weight count[i].
source = [item['parent_encounter_code'] for item in df_collect_encounter_class_transition]
target = [item['child_encounter_code'] for item in df_collect_encounter_class_transition]
count = [item['cnt'] for item in df_collect_encounter_class_transition]

# Create a mapping from labels to indices.
# dict.fromkeys() de-duplicates while keeping first-seen order. A plain set()
# iterates strings in hash order, which changes between kernel restarts and
# would reshuffle the Sankey nodes from run to run.
# NOTE(review): a code that appears both as parent and as child shares a single
# node, and a parent code may be None when the join found no parent row —
# confirm that both are intended.
labels = list(dict.fromkeys(source + target))
label_to_index = {label: i for i, label in enumerate(labels)}

# Convert source and target labels to indices
source_indices = [label_to_index[s] for s in source]
target_indices = [label_to_index[t] for t in target]

fig_sankey_encounter_class_transition = go.Figure(data=[go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=labels),
    link=dict(source=source_indices, target=target_indices, value=count)
)])

fig_sankey_encounter_class_transition.update_layout(
    height=600,
    title_text="Encounter Class Transition Sankey Diagram"
)

fig_sankey_encounter_class_transition.show()

### Testing

In [14]:
# %%sparksql
# with child_encounter as (
#     select id, replace(partOf.reference, "Encounter/", "") parent_id, source_name
#     FROM df_encounter_domain
#     where partOf.reference is not null
# )
# select
#     child_encounter.source_name child_source,
#     parent_encounter.source_name parent_source,
#     count(*)
# from child_encounter
# left join df_encounter_domain parent_encounter
# on parent_encounter.id = child_encounter.parent_id
# group by 1,2
# limit 10

In [15]:
# %%sparksql
# select type_coding.* from 
# (select  explode(type_coding) type_coding from (
# select explode(type.coding) type_coding from df_encounter
# ))