In [1]:

# Install the packages listed in requirements.txt.
# pip is invoked through sys.executable so that it runs under the same Python
# interpreter as this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt

# NOTE(review): `sys` is already imported above; the second import below is
# redundant but harmless.
import os
import sys
from pyspark import SparkContext, SparkConf
from streaming.encounter_job import EncounterJob
from common.utils import PipelineUtils
# `spark` is the handle every later cell calls `spark.sql(...)` on.
# PipelineUtils.getSpark() is a project helper; presumably it returns a
# configured SparkSession -- confirm in common/utils.
spark=PipelineUtils.getSpark()
# Debug aid: uncomment to print the key/value pairs of a freshly built SparkConf.
#print(SparkConf().getAll())

In [2]:
# Register the Delta table and load it into a DataFrame.
# The table definition is dropped first so the cell can be re-run; it is then
# recreated on top of the Delta files at 'flat_obs_orders.delta'.
# NOTE(review): the original comment said "import and optimize", but no
# optimize step is issued here.
for _ddl in (
    "DROP TABLE IF EXISTS flat_obs",
    "CREATE TABLE flat_obs USING DELTA LOCATION 'flat_obs_orders.delta'",
):
    spark.sql(_ddl)

# `df` is consumed by the next cell.
df = spark.sql("select * from flat_obs")
df.printSchema()

In [3]:

%%time
import pyspark.sql.functions as f
nested_df = df.withColumn("obs", f.explode_outer("obs"))\
                .withColumn("orders", f.explode_outer("orders"))

nested_df.createOrReplaceTempView("obs")

nested_df.printSchema()

In [8]:
# Build the per-encounter HIV summary from the `obs` temp view and hand it to
# load_to_cassandra().
#
# The pipeline is a chain of temp views:
#   obs -> hiv_summary_stage_0 -> hiv_summary_stage_1 -> hiv_summary_stage_2
#       -> hiv_summary_stage_3 -> hiv_sum
#
# Relies on names defined outside this cell: `spark`, `load_to_cassandra`, and
# the `obs` temp view.
# NOTE(review): the SQL reads `flattened_obs.*` and `flattened_orders.*`; no
# visible cell creates columns with those names -- confirm they exist in the
# `obs` view on a fresh kernel.

# Imported here because no other visible cell imports them; without these the
# timing code below raises NameError after Restart & Run All.
import datetime
import time

start_time = datetime.datetime.utcnow()
print("started app")

# Stage 0: keep every column of `obs` and add three derived columns:
#   patient_care_status, child_hiv_status (string labels keyed on
#   concept_id/value pairs) and is_clinical_encounter (1 when encounter_type is
#   in the listed set, otherwise null).
spark.sql("""select
            *,
            case 
                when flattened_obs.concept_id = 9082 and flattened_obs.value = '9036' then 'negative'
                else null end as patient_care_status,
            case 
                when flattened_obs.concept_id = 5303 and flattened_obs.value= '822' then 'exposed'
                when flattened_obs.concept_id = 5303 and flattened_obs.value = '664' then 'negative'
                when flattened_obs.concept_id = 5303 and flattened_obs.value = '1067' then 'unknown'
                else null end as child_hiv_status,
            case
              when encounter_type in (1,2,3,4,10,14,15,17,19,26,32,
              33,34,47,105,106,112,113,114,117,120,127,128,129,138,153,154,158) then 1
            else null end as is_clinical_encounter
            from obs
        """).createOrReplaceTempView("hiv_summary_stage_0")

# Stage 1: keep only rows of clinical encounters where neither status label
# from stage 0 was set, and derive enrollment_date, out_of_care,
# viral_load_ordered and arv_location_id per row.
# String literals use single quotes (as in stage 0): double-quoted tokens are
# parsed as identifiers when spark.sql.ansi.doubleQuotedIdentifiers is enabled.
# NOTE(review): rows with concept 9082 / value '9036' get a non-null
# patient_care_status in stage 0 and are removed by the WHERE clause below, so
# the '9036' entry in the out_of_care list can never match -- confirm whether
# the filter or the list is what was intended.
spark.sql("""select 
             *,
             case 
                 when flattened_obs.concept_id = 7013 and flattened_obs.value is not null then to_date(flattened_obs.value)
                 when flattened_obs.concept_id = 7015 and flattened_obs.value is not null then to_date(flattened_obs.value)
                 when encounter_type not in (21,99999) then encounter_datetime
                 else null end as enrollment_date,
             case 
                 when flattened_obs.concept_id = 1946 and flattened_obs.value = '1065' then 1
                 when flattened_obs.concept_id = 1285 and flattened_obs.value in ('1287','9068') then 1
                 when flattened_obs.concept_id = 1596 then 1
                 when flattened_obs.concept_id = 9082 and flattened_obs.value in ('159','9036','9083','1287','9068','9079', '9504', '1285') then 1
                 when encounter_type = 31 then 1
                 else null end as out_of_care,
             case 
                 when flattened_orders.order_concept_id = 856 then 1
                 else null end as viral_load_ordered,
             case 
                 when flattened_obs.concept_id = 1255 and flattened_obs.value is not null then location_id
                 when flattened_obs.concept_id IN (1250, 1088, 2154) then location_id
                 else null end as arv_location_id
             from hiv_summary_stage_0
             where 
                 child_hiv_status is null 
                 and patient_care_status is null 
                 and is_clinical_encounter = 1 
             """).createOrReplaceTempView("hiv_summary_stage_1")

# Stage 2: collapse to one row per (patient_id, encounter_id).
# first(col, true) skips nulls, so a derived flag set on any row of the
# encounter survives the aggregation; plain first(col) takes whichever row
# comes first in the group.
# NOTE(review): the `enrollment_location_id` produced here is not read by
# stage 3, which computes its own column under the same alias.
spark.sql("""select patient_id,
                    encounter_id,
                    first(location_id) as location_id,
                    first(encounter_datetime) as encounter_datetime,
                    first(encounter_type) as encounter_type,
                    first(enrollment_date, true) as enrollment_date,
                    first(out_of_care, true) as out_of_care,
                    first(location_id) as enrollment_location_id,
                    first(viral_load_ordered, true) as viral_load_ordered,
                    first(arv_location_id, true) as arv_location_id,
                    first(gender) as gender,
                    first(birthdate) as birthdate,
                    first(death_date) as death_date,
                    first(is_clinical_encounter) as is_clinical_encounter
                    from hiv_summary_stage_1
                    group by patient_id, encounter_id
        """).createOrReplaceTempView("hiv_summary_stage_2")

# Stage 3: per patient, ordered by encounter_datetime, take the first non-null
# out_of_care, enrollment_date and arv_location_id within the window.
# enrollment_location_id is this encounter's location_id when the encounter's
# own (stage 2) enrollment_date is non-null, otherwise null.
hiv_summary_3 = spark.sql("""select patient_id,
                    null as patient_uuid,
                    encounter_id,
                    encounter_datetime,
                    location_id,
                    gender,
                    birthdate,
                    death_date,
                    is_clinical_encounter,
                    current_timestamp() as analysis_date,
                    first(out_of_care, true) over p as out_of_care,
                    first(enrollment_date, true) over p as enrollment_date,
                    case when enrollment_date is not null then location_id
                    else null end as enrollment_location_id,
                    first(arv_location_id, true) over p as arv_start_location_id,
                    viral_load_ordered
                    from hiv_summary_stage_2
                    window p as (partition by patient_id order by encounter_datetime)
        """)

hiv_summary_3.createOrReplaceTempView("hiv_summary_stage_3")

# Final stage: same columns as stage 3, except enrollment_location_id becomes
# the first non-null value within the same per-patient window.
hiv_summary = spark.sql("""select patient_id,
                    null as patient_uuid,
                    encounter_id,
                    encounter_datetime,
                    location_id,
                    gender,
                    birthdate,
                    death_date,
                    is_clinical_encounter,
                    analysis_date,
                    out_of_care,
                    enrollment_date,
                    first(enrollment_location_id, true) over p as enrollment_location_id,
                    arv_start_location_id,
                    viral_load_ordered
                    from hiv_summary_stage_3
                    window p as (partition by patient_id order by encounter_datetime)
        """)

hiv_summary.createOrReplaceTempView("hiv_sum")

# Spot check: print up to 50 summary rows for a single patient.
spark.sql("select * from hiv_sum where patient_id = 13460").show(50)

# Persist the summary; load_to_cassandra is defined outside this cell.
load_to_cassandra(hiv_summary)

end_time = datetime.datetime.utcnow()
print("Finished: " + time.ctime()) 
print("Took {0} seconds".format((end_time - start_time).total_seconds()))

started app
+----------+------------+------------+-------------------+-----------+------+---------+----------+---------------------+--------------------+-----------+-------------------+----------------------+---------------------+------------------+
|patient_id|patient_uuid|encounter_id| encounter_datetime|location_id|gender|birthdate|death_date|is_clinical_encounter|       analysis_date|out_of_care|    enrollment_date|enrollment_location_id|arv_start_location_id|viral_load_ordered|
+----------+------------+------------+-------------------+-----------+------+---------+----------+---------------------+--------------------+-----------+-------------------+----------------------+---------------------+------------------+
|     13460|        null|     6932376|2017-06-07 10:48:43|         13|     M|     null|      null|                    1|2018-06-27 13:22:...|       null|2017-06-07 10:48:43|                    13|                   13|                 1|
|     13460|        null|     713107