In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import gc

from pyspark import SparkConf

# Extra settings are gathered in a SparkConf and handed to the builder below.
# A SparkConf that is only constructed (and never passed to the session) has no
# effect, so the console-progress option has to travel through `.config(conf=...)`.
conf = SparkConf().set("spark.ui.showConsoleProgress", "true")

# Build the session for this notebook (or reuse one that is already active)
# with a 20g driver heap plus everything set on `conf`.
spark = (
    SparkSession.builder
    .appName("worst vitals")
    .config("spark.driver.memory", "20g")
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Load the raw vitals table and keep only the four columns used downstream.
# `recorded_time` is parsed into a proper timestamp exposed as `measured_time`.
vitals = (
    spark.read
    .parquet("/project2/wparker/SIPA_data/compressed/RCLIF_vitals_10102023.parquet")
    .select(
        'C19_HAR_ID',
        f.to_timestamp('recorded_time', 'yyyy-MM-dd HH:mm:ss').alias('measured_time'),
        'vital_name',
        'vital_value',
    )
)

In [None]:
# Load the cohort CSV (first row is the header) and keep the encounter id plus
# the life-support start, parsed from text into a timestamp column.
cohort = (
    spark.read
    .csv('/project2/wparker/cohort.csv', header=True)
    .select(
        'C19_HAR_ID',
        f.to_timestamp('life_support_start', 'yyyy-MM-dd HH:mm:ss').alias('life_support_start_time'),
    )
)

In [None]:
# Attach every vital measurement to its cohort row and express the measurement
# time as (fractional) hours relative to the start of life support.
cohort_vitals = (
    cohort
    .join(vitals, 'C19_HAR_ID', 'left')
    .withColumn(
        "hour_diff",
        (
            f.col("measured_time").cast("long")
            - f.col("life_support_start_time").cast("long")
        ) / (60 * 60),
    )
)

# Keep the 48-hour window (-42h, +6h] around life-support start. Rows with a
# null hour_diff (cohort rows without any vitals) fail both comparisons, so the
# left join above effectively behaves like an inner join after this filter.
cohort_vitals_48 = cohort_vitals.filter(
    (f.col('hour_diff') > -42) & (f.col('hour_diff') <= 6)
)

# Drop the Python references that are no longer needed.
del vitals, cohort_vitals
#del cohort
gc.collect()

In [None]:
# Split each measurement timestamp into a calendar date and an hour-of-day;
# together they identify the one-hour block a measurement belongs to.
cohort_vitals_48 = (
    cohort_vitals_48
    .withColumn('meas_hour', f.hour(f.col('measured_time')))
    .withColumn('meas_date', f.to_date(f.col('measured_time')))
)

In [None]:
# Keys that identify one patient-hour block.
# `hour_diff` is deliberately NOT a grouping key: it is a fractional number of
# hours, so grouping on it yields one group per distinct measurement time
# instead of one per hour. The resulting duplicate rows per
# (id, start time, date, hour) would then be multiplied by the full outer joins
# performed on these four keys downstream.
group_cols = ["C19_HAR_ID", "life_support_start_time", "meas_date", "meas_hour"]


def _hourly_min_max(vitals_df, vital_name, label):
    """Return the per-hour minimum and maximum of a single vital sign.

    Parameters
    ----------
    vitals_df : pyspark.sql.DataFrame
        Long-format vitals carrying the `group_cols` columns plus
        `vital_name` and `vital_value`.
    vital_name : str
        Value of the `vital_name` column to keep (e.g. ``"heart_rate"``).
    label : str
        Suffix for the output columns, which are named ``min_<label>`` and
        ``max_<label>``.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per combination of `group_cols`, with columns
        `group_cols` + ``min_<label>``, ``max_<label>`` (in that order).
    """
    # NOTE(review): if `vital_value` is stored as a string, min/max compare
    # lexicographically rather than numerically -- confirm the parquet schema.
    return (
        vitals_df
        .filter(f.col("vital_name") == vital_name)
        .groupBy(*group_cols)
        .agg(
            f.min("vital_value").alias("min_" + label),
            f.max("vital_value").alias("max_" + label),
        )
    )


# One hourly min/max frame per vital sign; the variable names and output column
# names are the ones the join cell further down expects.
cohort_temp_grouped = _hourly_min_max(cohort_vitals_48, "temp", "temp")
cohort_pulse_grouped = _hourly_min_max(cohort_vitals_48, "heart_rate", "heart_rate")
cohort_resp_grouped = _hourly_min_max(cohort_vitals_48, "respiratory_rate", "resp_rate")
cohort_spO2_grouped = _hourly_min_max(cohort_vitals_48, "spO2", "spO2")
cohort_sbp_grouped = _hourly_min_max(cohort_vitals_48, "sbp", "sbp")
cohort_dbp_grouped = _hourly_min_max(cohort_vitals_48, "dbp", "dbp")
cohort_map_grouped = _hourly_min_max(cohort_vitals_48, "MAP", "MAP")

# The per-vital frames hold their own reference to the query plan, so the
# windowed frame's name can be released here.
del cohort_vitals_48
gc.collect()

In [None]:
# Stitch the per-vital hourly frames into one wide table. Full outer joins keep
# an hour block even when only some of the vitals were recorded in it; the
# frames are folded in the order pulse, resp, spO2, sbp, dbp, MAP onto temp.
join_keys = ['C19_HAR_ID', 'life_support_start_time', 'meas_date', 'meas_hour']

cohort_vitals_blocked = cohort_temp_grouped
for per_vital in (
    cohort_pulse_grouped,
    cohort_resp_grouped,
    cohort_spO2_grouped,
    cohort_sbp_grouped,
    cohort_dbp_grouped,
    cohort_map_grouped,
):
    cohort_vitals_blocked = cohort_vitals_blocked.join(per_vital, join_keys, how='full')

# Release the loop temporaries and the per-vital frame names; only the joined
# table is used from here on.
del per_vital, join_keys
del (
    cohort_temp_grouped,
    cohort_pulse_grouped,
    cohort_resp_grouped,
    cohort_spO2_grouped,
    cohort_sbp_grouped,
    cohort_dbp_grouped,
    cohort_map_grouped,
)
gc.collect()

In [None]:
# Persist the hourly-blocked vitals as parquet. No save mode is set, so Spark's
# default applies and the write fails if the target path already exists.
(
    cohort_vitals_blocked.write
    .format("parquet")
    .save("/project2/wparker/SIPA_data/cohort_vitals_blocked.parquet")
)

In [None]:
# Use the public IPython.display API (rather than the internal
# IPython.core.display module) and import `display` explicitly instead of
# relying on the name being injected by the notebook kernel.
from IPython.display import HTML, display

# Stop <pre> output from wrapping so wide `show()` tables stay on one line each.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Preview the first rows of the blocked table, then shut the Spark session down.
cohort_vitals_blocked.show()
spark.stop()