In [1]:
import os
import sys
from pyspark import SparkContext, SparkConf
from streaming.encounter_job import EncounterJob
from common.utils import PipelineUtils
# Obtain the Spark session handle through the project's PipelineUtils helper;
# every later cell issues its SQL through this `spark` object.
spark = PipelineUtils.getSpark()

In [2]:
# (Re)register the `flat_obs` table on top of the Delta files stored at
# 'flat_obs_orders.delta', then load it as `df` and show its schema.
# Dropping first makes the cell safe to re-run against an existing catalog entry.
# NOTE(review): the Delta location is a relative path, so it presumably resolves
# against the Spark working directory / default filesystem -- confirm.
for ddl_statement in (
    "DROP TABLE IF EXISTS flat_obs",
    "CREATE TABLE flat_obs USING DELTA LOCATION 'flat_obs_orders.delta'",
):
    spark.sql(ddl_statement)

df = spark.sql("select * from flat_obs")
df.printSchema()

root
 |-- encounter_id: integer (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- location_id: integer (nullable = true)
 |-- visit_id: long (nullable = true)
 |-- encounter_datetime: timestamp (nullable = true)
 |-- encounter_type: long (nullable = true)
 |-- dead: boolean (nullable = true)
 |-- gender: string (nullable = true)
 |-- death_date: timestamp (nullable = true)
 |-- patient_uuid: string (nullable = true)
 |-- visit_type_id: long (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- obs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- obs_id: long (nullable = true)
 |    |    |-- voided: boolean (nullable = true)
 |    |    |-- concept_id: integer (nullable = true)
 |    |    |-- obs_datetime: timestamp (nullable = true)
 |    |    |-- value: string (nullable = true)
 |    |    |-- value_type: string (nullable = true)
 |    |    |-- obs_group_id: integer (nullable = true)
 |    |    |-- parent_concept_id: integer

In [3]:
%%time
import pyspark.sql.functions as f
nested_df = df.withColumn("obs", f.explode_outer("obs"))\
                .withColumn("orders", f.explode_outer("orders"))

nested_df.createOrReplaceTempView("nested_df")

nested_df.printSchema()


root
 |-- encounter_id: integer (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- location_id: integer (nullable = true)
 |-- visit_id: long (nullable = true)
 |-- encounter_datetime: timestamp (nullable = true)
 |-- encounter_type: long (nullable = true)
 |-- dead: boolean (nullable = true)
 |-- gender: string (nullable = true)
 |-- death_date: timestamp (nullable = true)
 |-- patient_uuid: string (nullable = true)
 |-- visit_type_id: long (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- obs: struct (nullable = true)
 |    |-- obs_id: long (nullable = true)
 |    |-- voided: boolean (nullable = true)
 |    |-- concept_id: integer (nullable = true)
 |    |-- obs_datetime: timestamp (nullable = true)
 |    |-- value: string (nullable = true)
 |    |-- value_type: string (nullable = true)
 |    |-- obs_group_id: integer (nullable = true)
 |    |-- parent_concept_id: integer (nullable = true)
 |-- orders: struct (nullable = true)
 |    |-- order_id: long (null

In [4]:
%%time
spark.sql("""select 
             *, 
             case
                when obs.concept_id = 1839 then obs.value
                else null
                end as scheduled_visit,
              case
                when obs.concept_id = 5089 then obs.value
                else null
                end as weight,
              case
                when obs.concept_id = 5090 then obs.value
                else null
                end as height,
              case
                when obs.concept_id = 5088 then obs.value
                else null
                end as temp,
            case
                when obs.concept_id = 5092 then obs.value
                else null
                end as oxygen_sat,
            case
                when obs.concept_id = 5085 then obs.value
                else null
                end as systolic_bp,
             case
                when obs.concept_id = 5086 then obs.value
                else null
                end as diastolic_bp,
            case
                when obs.concept_id = 5087 then obs.value
                else null
                end as pulse

    from nested_df
                     """) .createOrReplaceTempView("vitals1")


CPU times: user 1.08 ms, sys: 1.39 ms, total: 2.47 ms
Wall time: 109 ms


In [5]:
%%time
spark.sql("""select

                first(patient_id, true) as patient_id,
                encounter_id,
                first(location_id) as location_id,
                first(encounter_datetime) as encounter_datetime,
                first(encounter_type) as encounter_type,
                first(gender) as gender,
                first(birthdate) as birthdate,
                first(weight, true) as weight_1,
                first(height, true) as height_1,
                first(temp, true) as temp_1,
                first(oxygen_sat, true) as oxygen_sat_1,
                first(systolic_bp, true) as systolic_bp_1,
                first(diastolic_bp, true) as diastolic_bp_1,
                first(pulse, true) as pulse_1
                from vitals1
            group by encounter_id
        """).createOrReplaceTempView("vitals2")

CPU times: user 1.16 ms, sys: 1.44 ms, total: 2.6 ms
Wall time: 67.3 ms


In [6]:
# Fill each encounter's vitals from the same patient's other encounters on the
# same calendar day.
#
# Window `p` partitions `vitals2` by (patient_id, date of encounter_datetime)
# and orders rows by encounter_datetime. For every vital `<v>`:
#   * `first(<v>_1, true) over p` is the first non-null value seen in the
#     partition up to the current row (the window has an ORDER BY and no
#     explicit frame, so the default frame ends at the current row);
#   * if the current row's own `<v>_1` is NULL and the next row's running
#     value is non-null, the next row's value is used instead (a one-row
#     backward fill); otherwise the running value is used.
# NOTE(review): the lookahead is a single row, so with three or more
# encounters on one day an early encounter can stay NULL even though a later
# one has a value -- confirm whether a whole-day fill was intended.
#
# The seven per-vital expressions used to be copy-pasted; they are generated
# from one template so the rule is stated once.
VITALS_FILL_COLUMNS = [
    "weight",
    "height",
    "temp",
    "oxygen_sat",
    "systolic_bp",
    "diastolic_bp",
    "pulse",
]

VITALS_FILL_TEMPLATE = """
        case
            when {v}_1 is null and lead(first({v}_1, true) over p) over p is not null
            then lead(first({v}_1, true) over p) over p else first({v}_1, true) over p
            end as {v}"""

vitals_fill_projection = ",".join(
    VITALS_FILL_TEMPLATE.format(v=vital_name) for vital_name in VITALS_FILL_COLUMNS
)

vitals = spark.sql("""select patient_id,encounter_id,encounter_datetime,{projection}
     from vitals2
     window p as (partition by patient_id, to_date(encounter_datetime)  order by encounter_datetime)
""".format(projection=vitals_fill_projection))
vitals

patient_id,encounter_id,encounter_datetime,weight,height,temp,oxygen_sat,systolic_bp,diastolic_bp,pulse
69,763,2016-09-06 09:33:37,248.0,97.0,41.0,21.0,227.0,19.0,190.0
69,764,2016-09-06 10:13:37,248.0,97.0,41.0,21.0,227.0,19.0,190.0
94,1071,2016-03-14 09:50:16,14.0,91.0,42.0,75.0,170.0,76.0,0.0
94,1072,2016-03-14 10:50:16,14.0,91.0,42.0,75.0,170.0,76.0,0.0
157,1721,2015-03-28 09:32:34,65.0,119.0,27.0,10.0,163.0,129.0,14.0
157,1722,2015-03-28 10:38:34,65.0,119.0,27.0,10.0,163.0,129.0,14.0
236,2517,2015-09-28 10:02:12,182.0,204.0,37.0,49.0,108.0,83.0,220.0
236,2518,2015-09-28 10:40:12,182.0,204.0,37.0,49.0,108.0,83.0,220.0
245,2611,2015-07-05 09:24:59,4.0,76.0,29.0,65.0,192.0,60.0,197.0
245,2612,2015-07-05 10:28:59,4.0,76.0,29.0,65.0,192.0,60.0,197.0


In [11]:
from pyspark.sql.functions import col

# Show the encounters with the highest temperature first.
# `temp` originates from `obs.value`, which the schema declares as a string, so
# sorting the raw column would compare text ("9.0" would rank above "43.0").
# Casting the sort key to double gives a numeric ordering while leaving the
# displayed column unchanged.
# NOTE(review): a value that cannot be parsed as a number becomes NULL under
# Spark's default (non-ANSI) cast and sorts last -- confirm the session is not
# running with ANSI mode enabled, where such a cast would raise instead.
vitals.sort(col("temp").cast("double").desc())

patient_id,encounter_id,encounter_datetime,weight,height,temp,oxygen_sat,systolic_bp,diastolic_bp,pulse
25,266,2016-05-09 10:25:31,152.0,199.0,43.0,51.0,4.0,143.0,34.0
411,4578,2016-12-14 11:25:59,170.0,23.0,43.0,53.0,35.0,139.0,164.0
25,265,2016-05-09 09:47:31,152.0,199.0,43.0,51.0,4.0,143.0,34.0
403,4453,2015-07-11 10:34:00,73.0,155.0,43.0,21.0,153.0,145.0,32.0
411,4577,2016-12-14 10:05:59,170.0,23.0,43.0,53.0,35.0,139.0,164.0
77,876,2016-08-08 10:33:34,139.0,180.0,43.0,9.0,209.0,87.0,92.0
436,4839,2015-10-27 10:30:04,226.0,47.0,43.0,28.0,54.0,129.0,24.0
403,4454,2015-07-11 11:20:00,73.0,155.0,43.0,21.0,153.0,145.0,32.0
76,857,2016-08-15 09:43:25,141.0,79.0,43.0,20.0,54.0,133.0,57.0
436,4840,2015-10-27 10:41:04,226.0,47.0,43.0,28.0,54.0,129.0,24.0


Check the status: http://localhost:4040/streaming/