In [None]:
%load_ext sparksql_magic

In [None]:
import os
import pandas as pd
import json
import plotly.graph_objects as go

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [None]:
# Directory that holds the input files read by the cells below
# (relative to the notebook's working directory).
DATA_BUCKET = "../data_bucket/physionet_extract"

# Print every directory entry on its own line.
for entry_name in os.listdir(DATA_BUCKET):
    print(entry_name)

In [None]:
# Get the active Spark session, or create one named "FhirDataApplication".
spark = (
    SparkSession.builder
    .appName("FhirDataApplication")
    .getOrCreate()
)

# Read Encounter Data with an Inferred Schema
- Analyze the number of columns in each file

In [None]:
# Read the three Encounter NDJSON files without an explicit schema, so Spark
# infers one per file. The reads run in the order the names are listed.
df_MimicEncounter, df_MimicEncounterED, df_MimicEncounterICU = (
    spark.read.json(f"{DATA_BUCKET}/{file_stem}.ndjson")
    for file_stem in ("MimicEncounter", "MimicEncounterED", "MimicEncounterICU")
)

In [None]:
def recursive_print_schema(schema, parent = ""):
    """Print one ``>``-separated path per leaf column of a Spark schema.

    Args:
        schema: A struct type in Spark's JSON form, i.e. a dict with a
            ``'fields'`` list as returned by ``StructType.jsonValue()``.
        parent: Path prefix placed in front of every printed column path.

    Structs are descended into. Arrays are unwrapped (however deeply they are
    nested) and treated like their element type. Every other type, including
    primitives, maps and any other dict-typed field, is printed as a leaf
    path, e.g. ``>hospitalization>admitSource>coding>code``.
    """
    for field in schema['fields']:
        path = f"{parent}>{field['name']}"
        dtype = field['type']

        # Arrays add no path segment of their own; look through them to the
        # element type. The loop also handles arrays of arrays, which have no
        # 'fields' key to recurse into.
        while isinstance(dtype, dict) and dtype.get('type') == 'array':
            dtype = dtype.get('elementType')

        if isinstance(dtype, dict) and dtype.get('type') == 'struct':
            recursive_print_schema(dtype, path)
        else:
            # Primitive (plain string type name) or a non-struct complex type
            # such as a map: report the column itself.
            print(path)
    

# recursive_print_schema(df_MimicEncounterICU.schema.jsonValue())

In [None]:
# Load ./ColumnNested.csv, using its first row as the column names, and
# register it as a temp view so the SQL cells can query it.
df_column_nested = spark.read.option("header", True).csv("./ColumnNested.csv")
df_column_nested.createOrReplaceTempView("df_column_nested")

In [None]:
%%sparksql -l 50
-- For each ColumnNested value: the File values it appears with and how many
-- rows it has. Most frequent first, ties ordered by ColumnNested.
SELECT
    ColumnNested,
    collect_list(File) AS files,
    count(*)           AS num_of_sources
FROM df_column_nested
GROUP BY ColumnNested
ORDER BY num_of_sources DESC, ColumnNested ASC

# Read All Encounter Data into a Single DataFrame

In [None]:
# Read the Encounter schema from its JSON file and turn it into a Spark
# StructType. The encoding is given explicitly so the read does not depend on
# the platform's default locale.
with open("./schema/Encounter.json", encoding="utf-8") as schema_file:
    schema_read = json.load(schema_file)

# Built outside the `with` block: the file only needs to stay open for the read.
encounter_schema = T.StructType.fromJson(schema_read)

In [None]:
def _read_encounter_source(source_name):
    """Read `<DATA_BUCKET>/<source_name>.ndjson` using `encounter_schema`.

    Returns the DataFrame with an extra literal column `source_name`, so rows
    can still be traced to their file after the sources are unioned.
    """
    return (
        spark.read.schema(encounter_schema)
        .json(f"{DATA_BUCKET}/{source_name}.ndjson")
        .withColumn("source_name", F.lit(source_name))
    )


df_MimicEncounter = _read_encounter_source("MimicEncounter")
df_MimicEncounterED = _read_encounter_source("MimicEncounterED")
df_MimicEncounterICU = _read_encounter_source("MimicEncounterICU")

# Later SQL cells query `df_MimicEncounter` by name, so the per-source frames
# are registered as temp views here. Without this, those cells only work if
# the view is left over from earlier kernel state.
df_MimicEncounter.createOrReplaceTempView("df_MimicEncounter")
df_MimicEncounterED.createOrReplaceTempView("df_MimicEncounterED")
df_MimicEncounterICU.createOrReplaceTempView("df_MimicEncounterICU")

# All three frames share `encounter_schema` plus `source_name`, so a
# positional union is safe.
df_encounter = df_MimicEncounter.unionAll(df_MimicEncounterED).unionAll(df_MimicEncounterICU)
df_encounter.createOrReplaceTempView("df_encounter")

In [None]:
%%sparksql
-- Number of rows each source file contributes to the combined view.
SELECT
    source_name,
    count(*) AS cnt
FROM df_encounter
GROUP BY source_name

In [None]:
%%sparksql
-- Encounter class codes found in each source, with row counts.
-- Value set reference: https://hl7.org/fhir/R4/v3/ActEncounterCode/vs.html
SELECT
    source_name,
    class.code AS encounter_class,
    count(1)   AS cnt
FROM df_encounter
GROUP BY source_name, class.code
ORDER BY source_name, encounter_class

In [None]:
%%sparksql
-- Distinct codes used for hospitalization.admitSource and
-- hospitalization.dischargeDisposition, one (key, value) row per code.
-- Reads the df_MimicEncounter view, which must exist as a temp view before
-- this cell runs.
-- Code systems:
--   AdmitSource: https://mimic.mit.edu/fhir/ValueSet-mimic-admit-source.html
--   DischargeDisposition: https://mimic.mit.edu/fhir/ValueSet-mimic-discharge-disposition.html
SELECT DISTINCT
    "admitSource" AS key,
    explode(hospitalization.admitSource.coding.code) AS value
FROM df_MimicEncounter
UNION
SELECT DISTINCT
    "dischargeDisposition" AS key,
    explode(hospitalization.dischargeDisposition.coding.code) AS value
FROM df_MimicEncounter
ORDER BY 1, 2

In [None]:
%%sparksql -l 10
-- Distinct combinations of the first admitSource code and the first
-- dischargeDisposition code, ordered by the admitSource code.
-- Reads the df_MimicEncounter view, which must exist as a temp view before
-- this cell runs.
SELECT DISTINCT
    hospitalization.admitSource.coding.code[0]          AS admitSource,
    hospitalization.dischargeDisposition.coding.code[0] AS dischargeDisposition
FROM df_MimicEncounter
ORDER BY 1

In [None]:
%%sparksql
-- Every distinct status value in the combined Encounter view.
SELECT DISTINCT status
FROM df_Encounter

In [None]:
%%sparksql
-- partOf>reference
-- For encounters that reference a parent through partOf.reference, count rows
-- per (source of the parent, source of the child). The LEFT JOIN keeps
-- children whose parent id is not found; they show a NULL parent_source.
WITH child AS (
    SELECT
        id,
        replace(partOf.reference, "Encounter/", "") AS parent_id,
        source_name
    FROM df_Encounter
    WHERE partOf.reference IS NOT NULL
)
SELECT
    parent.source_name AS parent_source,
    child.source_name,
    count(*)
FROM child
LEFT JOIN df_Encounter AS parent
    ON parent.id = child.parent_id
GROUP BY 1, 2
LIMIT 10

In [None]:
%%sparksql
-- Check whether encounters have grandparents: take every encounter that is
-- referenced as a parent and list the distinct partOf.reference values those
-- parents carry themselves. A result containing only NULL means no parent
-- has a parent of its own.
WITH child AS (
    SELECT
        id,
        replace(partOf.reference, "Encounter/", "") AS parent_id
    FROM df_Encounter
    WHERE partOf.reference IS NOT NULL
)
SELECT DISTINCT
    partOf.reference
FROM df_Encounter
WHERE id IN (SELECT parent_id FROM child)

In [None]:
%%sparksql
-- Number of encounters that reference each parent id, largest first.
SELECT
    replace(partOf.reference, "Encounter/", "") AS parent_id,
    count(*) AS cnt
FROM df_Encounter
WHERE partOf.reference IS NOT NULL
GROUP BY 1
ORDER BY cnt DESC

In [None]:
%%sparksql
-- Flatten type.coding into one row per coding entry and expand its fields.
-- Two explodes are needed: the first yields one coding list per row, the
-- second yields the individual codings inside each list.
WITH coding_lists AS (
    SELECT explode(type.coding) AS type_coding
    FROM df_encounter
),
codings AS (
    SELECT explode(type_coding) AS type_coding
    FROM coding_lists
)
SELECT type_coding.*
FROM codings

In [None]:
%%sparksql
-- serviceType of one specific encounter, looked up by id.
-- Encounter Class: Emergency
SELECT serviceType
FROM df_encounter
WHERE id = 'c8816dcb-47f0-55f0-933f-122d0caac629'

In [None]:
%%sparksql
-- One row per child encounter (an encounter with a partOf reference), paired
-- with the encounter that reference points to. Parent columns are NULL when
-- the referenced id is not present. Periods are truncated to dates, so the
-- durations are date differences.
WITH encounter AS (
    SELECT
        id,
        class.code,
        replace(partOf.reference, "Encounter/", "") AS parent_id,
        CAST(period.start AS DATE) AS period_start,
        CAST(period.end AS DATE)   AS period_end
    FROM df_encounter
),
encounter_parent_child AS (
    SELECT
        parent.id           AS parent_encounter_id,
        parent.code         AS parent_encounter_code,
        parent.period_start AS parent_encounter_period_start,
        parent.period_end   AS parent_encounter_period_end,
        (parent.period_end - parent.period_start) AS parent_encounter_duration,
        child.id            AS child_encounter_id,
        child.code          AS child_encounter_code,
        child.period_start  AS child_encounter_period_start,
        child.period_end    AS child_encounter_period_end,
        (child.period_end - child.period_start)   AS child_encounter_duration,
        -- parent end minus child start
        (parent.period_end - child.period_start)  AS duration_between_parent_child
    FROM encounter AS child
    LEFT JOIN encounter AS parent
        ON child.parent_id = parent.id
    WHERE child.parent_id IS NOT NULL
)
SELECT *
FROM encounter_parent_child
-- select parent_encounter_code source,child_encounter_code target, count(*) cnt from encounter_parent_child group by all order by 3 desc

In [None]:
# Count how often each (parent encounter class -> child encounter class) pair
# occurs. Only the columns the diagram needs are selected. Grouping uses
# ordinals, like the other cells in this notebook, rather than `group by all`,
# which needs Spark 3.4 or later.
encounter_source_target_data = spark.sql("""
with encounter as (
    select
        id,
        class.code,
        replace(partOf.reference, "Encounter/", "") parent_id
    from df_encounter
)
select
    parent_encounter.code source,
    child_encounter.code  target,
    count(*)              cnt
from encounter child_encounter
left join encounter parent_encounter
on child_encounter.parent_id = parent_encounter.id
where child_encounter.parent_id is not null
group by 1, 2
order by 3 desc
""").collect()

source = [row['source'] for row in encounter_source_target_data]
target = [row['target'] for row in encounter_source_target_data]
count = [row['cnt'] for row in encounter_source_target_data]

# Unique node labels in first-seen order. dict.fromkeys is used instead of
# set() because set ordering of strings changes between kernel runs, which
# would reshuffle the nodes on every re-run.
labels = list(dict.fromkeys(source + target))

# Plotly's Sankey links refer to nodes by their position in `labels`.
label_to_index = {label: i for i, label in enumerate(labels)}

# Convert source and target labels to node indices
source_indices = [label_to_index[s] for s in source]
target_indices = [label_to_index[t] for t in target]

# Create the Sankey diagram: one node per class code, one link per
# (parent class, child class) pair, link width = number of encounters.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels
    ),
    link=dict(
        source=source_indices,
        target=target_indices,
        value=count
    )
)])

# Update the layout and display the figure
fig.update_layout(title_text="Encounter Class Sankey Diagram", font_size=10)
fig.show()