In [23]:
import getpass
import sys
import os

# Make the parent of the working directory importable (the local `utils`
# package imported below is presumably located there — TODO confirm).
# Guarded so that re-running this cell does not append duplicates.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# JVM options are exported before the SparkSession is created so the JVM that
# Spark starts can pick them up. `user.name` is taken from the current OS user
# instead of a hard-coded login so the notebook runs on any machine.
os.environ['_JAVA_OPTIONS'] = (
    f'-Djava.security.manager=allow -Duser.name={getpass.getuser()}'
)

from pyspark.sql import SparkSession
import pyspark.pandas as ps
spark = SparkSession.builder.appName("SparkSession").getOrCreate()

from utils.sql_manager import SQLManager
# Helper used by later cells to run the query files under ../queries.
sql_manager = SQLManager(queries_dir='../queries')

In [24]:
def _read_csv(path):
    """Load a CSV with a header row into a Spark DataFrame, inferring column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


# Raw source tables; later cells register these frames as SQL temp views.
icu_ditems = _read_csv("../data/nw_icu/d_items.csv")
lab_items = _read_csv("../data/nw_hosp/d_labitems.csv")
lab_events = _read_csv("../data/nw_hosp/labevents.csv")
icu_stays = _read_csv("../data/nw_icu/icustays.csv")
chart_events = _read_csv("../data/nw_icu/chartevents.csv")
procedure_events = _read_csv("../data/nw_icu/procedureevents.csv")
procedure_events_cat = _read_csv("../data/procedure_events_cat.csv")



                                                                                

In [25]:
# Register the ICU item dictionary so it can be queried with Spark SQL.
icu_ditems.createOrReplaceTempView("icu_ditems")

# Project the dictionary onto shorter column names.
ditems = spark.sql(
    "select itemid as id, label, abbreviation, category, "
    "unitname as unit, param_type as type "
    "from icu_ditems"
)

# Collect to pandas for rich display in the notebook.
ditems_df = ditems.toPandas()
ditems_df


Unnamed: 0,id,label,abbreviation,category,unit,type
0,300001,R BMI,BMI,General,Kg/m^2,Numeric
1,320045,PULSE,HR,Routine Vital Signs,bpm,Numeric
2,320050,R ARTERIAL LINE BLOOD PRESSURE,ABPs,Routine Vital Signs,mmHg,Numeric
3,320050,R ARTERIAL LINE BLOOD PRESSURE 2,ABPs,Routine Vital Signs,mmHg,Numeric
4,320051,R ARTERIAL LINE BLOOD PRESSURE,ABPd,Routine Vital Signs,mmHg,Numeric
...,...,...,...,...,...,...
339,798762,XR ABDOMEN AP PORTABLE,,,,Processes
340,798810,TUBE FEEDING,,,,Processes
341,799098,BEDREST,,,,Processes
342,799609,PHYSICAL THERAPY REFERRAL,,,,Processes


In [26]:
# Register the lab item dictionary as a SQL view.
lab_items.createOrReplaceTempView("lab_items")

# NOTE: despite the `_df` suffix this stays a Spark DataFrame (no toPandas()).
lab_items_df = spark.sql(
    "select itemid as id, label, fluid, category "
    "from lab_items"
)


In [27]:
# Register the raw frames as temp views. Note that the chart events frame is
# exposed under the view name "chart_events_data", not "chart_events".
for _frame, _view_name in (
    (lab_events, "lab_events"),
    (icu_stays, "icu_stays"),
    (chart_events, "chart_events_data"),
    (procedure_events, "procedure_events"),
    (procedure_events_cat, "procedure_events_cat"),
):
    _frame.createOrReplaceTempView(_view_name)



In [28]:
# Run the "initial_data" query files and publish each result as a temp view
# named after the variable holding it.
icu_stays_data = sql_manager.execute(
    spark,
    'initial_data/icu_stays.sql',
)
icu_stays_data.createOrReplaceTempView("icu_stays_data")

lab_events_data = sql_manager.execute(
    spark,
    'initial_data/lab_events.sql',
)
lab_events_data.createOrReplaceTempView("lab_events_data")

In [29]:
# Result of the admission/lab-events join query, published as "lab_events_adm".
lab_events_adm = sql_manager.execute(
    spark,
    'joins/admission_labevents.sql',
)
lab_events_adm.createOrReplaceTempView("lab_events_adm")

In [30]:
# Result of the ICU-stays/lab-events join query. It is bound to its own name
# rather than rebinding `lab_events`, so the raw frame loaded from
# labevents.csv stays intact and the view-registration cell can be re-run
# safely (it registers `lab_events` as the "lab_events" view).
lab_events_icu = sql_manager.execute(spark, 'joins/icu_stays_labevents.sql')
lab_events_icu.createOrReplaceTempView("lab_events_icu")

# Number of joined lab events per item, most frequent first.
count_item_id = spark.sql(
    "select itemid, count(*) as count "
    "from lab_events_icu "
    "group by itemid "
    "order by count desc"
)
count_item_id.createOrReplaceTempView("lab_events_icu_count")

# Export the table produced by the lab_events_table_data query
# (presumably built on the count view above — verify against the .sql file).
lab_events_table_data = sql_manager.execute(spark, 'joins/lab_events_table_data.sql')
lab_events_table_data.toPandas().to_csv("../data/lab_items_table_data.csv", index=False)

                                                                                

In [31]:
# Result of the ICU-stays/chart-events join query. It is bound to its own name
# rather than rebinding `chart_events`, so the raw frame loaded from
# chartevents.csv stays intact; that raw frame backs the "chart_events_data"
# view, which would otherwise be replaced by the joined result if the
# view-registration cell were re-run.
chart_events_icu = sql_manager.execute(spark, 'joins/icu_stays_chartevents.sql')
chart_events_icu.createOrReplaceTempView("chart_events_icu")

# Number of joined chart events per item, most frequent first.
count_item_id_chart = spark.sql(
    "select itemid, count(*) as count "
    "from chart_events_icu "
    "group by itemid "
    "order by count desc"
)
count_item_id_chart_df = count_item_id_chart.toPandas()
count_item_id_chart.createOrReplaceTempView("chart_events_icu_count")

# NOTE(review): this result is not used anywhere in the visible notebook;
# the inline query below builds the table that is actually registered.
chart_events_count_data = sql_manager.execute(spark, 'joins/chart_events_table_data.sql')

# Item dictionary entries joined with their event counts.
chart_events_table_data = spark.sql(
    "select d.itemid as id, d.label, d.abbreviation, d.category, "
    "d.unitname as unit, d.param_type as type, c.count "
    "from icu_ditems d "
    "join chart_events_icu_count c on d.itemid = c.itemid "
    "order by d.itemid"
)
chart_events_table_data.createOrReplaceTempView("chart_events_table_data")
# CSV export currently disabled:
#chart_events_table_data.toPandas().to_csv("../data/chart_items_table_data.csv", index=False)

# Collects every joined chart event to the driver; this can be large.
chart_events_df = chart_events_icu.toPandas()

# Only the last expression of a cell is displayed: show the per-item counts.
count_item_id_chart_df



                                                                                

Unnamed: 0,itemid,count
0,320045,311904
1,320277,300912
2,320210,269071
3,320179,233411
4,320180,233411
5,323761,102953
6,320050,98946
7,320051,98817
8,326531,10389
9,300001,8766


In [32]:
# Result of the ICU-stays/procedure-events join query, published as a view.
procedure_events_data = sql_manager.execute(
    spark,
    'joins/icu_stays_procedure_events.sql',
)
procedure_events_data.createOrReplaceTempView("procedure_events_icu")

# Number of joined procedure events per item, most frequent first.
count_item_id_procedure = spark.sql(
    "select itemid, count(*) as count "
    "from procedure_events_icu "
    "group by itemid "
    "order by count desc"
)
count_item_id_procedure.createOrReplaceTempView("procedure_events_icu_count")

# Run the table-data query and write its result to CSV via pandas.
procedure_events_table_data = sql_manager.execute(
    spark,
    'joins/procedure_events_table_data.sql',
)
_procedure_table_pdf = procedure_events_table_data.toPandas()
_procedure_table_pdf.to_csv("../data/procedure_items_table_data.csv", index=False)

In [33]:
# Item dictionary entries joined with their procedure-event counts.
# NOTE: the last cell of this notebook writes to the same CSV path with a
# different column set, so this export is overwritten when that cell runs.
procedure_events_final_table_data = spark.sql(
    "select d.itemid as id, d.label, d.abbreviation, d.category, "
    "d.unitname as unit, d.param_type as type, c.count "
    "from icu_ditems d "
    "join procedure_events_icu_count c on d.itemid = c.itemid "
    "order by d.itemid"
)
_procedure_final_pdf = procedure_events_final_table_data.toPandas()
_procedure_final_pdf.to_csv("../data/procedure_items_final_table_data.csv", index=False)

In [34]:
# Chart events for the two arterial-line blood pressure items. In the item
# dictionary shown above, 320050 is abbreviated ABPs and 320051 ABPd.
# The spelling `dystolic` is kept because later cells reference that name.
_ARTERIAL_BP_QUERY = "select * from chart_events_data where itemid = {}"

systolic_arterial_bp = spark.sql(_ARTERIAL_BP_QUERY.format(320050))
dystolic_arterial_bp = spark.sql(_ARTERIAL_BP_QUERY.format(320051))



In [35]:
# Collect both blood-pressure selections to pandas, systolic first.
systolic_arterial_bp_df, dystolic_arterial_bp_df = (
    _bp_frame.toPandas()
    for _bp_frame in (systolic_arterial_bp, dystolic_arterial_bp)
)

                                                                                

In [36]:
# Preview the first five rows of the 320050 (ABPs) measurements.
systolic_arterial_bp_df.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,30000590,44608425,50707570,,2177-12-09 12:15:00,2177-12-09 12:26:00,320050,140.0,140.0,mmHg,
1,30000590,44608425,50707570,,2177-12-09 12:30:00,2177-12-09 12:46:00,320050,140.0,140.0,mmHg,
2,30000590,44608425,50707570,,2177-12-09 12:45:00,2177-12-09 12:46:00,320050,143.0,143.0,mmHg,
3,30000590,44608425,50707570,,2177-12-09 13:00:00,2177-12-09 13:36:00,320050,142.0,142.0,mmHg,
4,30000590,44608425,50707570,,2177-12-09 14:00:00,2177-12-09 14:23:00,320050,158.0,158.0,mmHg,


In [37]:
# Preview the first five rows of the 320051 (ABPd) measurements.
dystolic_arterial_bp_df.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,30000590,44608425,50707570,,2177-12-09 12:15:00,2177-12-09 12:26:00,320051,73.0,73.0,mmHg,
1,30000590,44608425,50707570,,2177-12-09 12:30:00,2177-12-09 12:46:00,320051,74.0,74.0,mmHg,
2,30000590,44608425,50707570,,2177-12-09 12:45:00,2177-12-09 12:46:00,320051,74.0,74.0,mmHg,
3,30000590,44608425,50707570,,2177-12-09 13:00:00,2177-12-09 13:36:00,320051,71.0,71.0,mmHg,
4,30000590,44608425,50707570,,2177-12-09 14:00:00,2177-12-09 14:23:00,320051,86.0,86.0,mmHg,


In [38]:
# Re-publish the procedure category table, ordered by id, under the view name
# used by the categorised export query.
df = spark.sql(
    "select * from procedure_events_cat "
    "order by id"
)
df.createOrReplaceTempView("procedure_events_with_categories")

In [None]:
# Procedure items with their event counts and the category taken from the
# `categoria` column of procedure_events_with_categories. Both joins are inner
# joins, so only items present in all three views are kept.
# NOTE: this rebinds `procedure_events_final_table_data` and writes to the same
# CSV path as the earlier uncategorised export.
procedure_events_final_table_data = spark.sql(
    "select d.itemid as id, d.label, d.param_type as type, "
    "t.categoria as category, c.count "
    "from icu_ditems d "
    "join procedure_events_icu_count c on d.itemid = c.itemid "
    "join procedure_events_with_categories t on d.itemid = t.id "
    "order by d.itemid"
)
_categorised_pdf = procedure_events_final_table_data.toPandas()
_categorised_pdf.to_csv("../data/procedure_items_final_table_data.csv", index=False)