In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark import SparkConf
import gc


import os

print("loaded libraries")

# Driver heap size handed to the JVM that PySpark launches.
memory = '20g'

# PYSPARK_SUBMIT_ARGS is read once, when PySpark launches the driver JVM.
# It must therefore be exported BEFORE the first SparkSession is created;
# setting it afterwards (as this cell previously did) leaves the driver on
# its default memory. If a session already exists in this kernel,
# getOrCreate() reuses it, so restart the kernel for a new value to apply.
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

spark = SparkSession.builder \
        .appName("worst vitals") \
        .getOrCreate()

loaded libraries


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/20 13:01:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [50]:
# Load the vitals table, parse `recorded_time` into a timestamp column named
# `measured_time`, and keep only the four columns used by the later cells.
vitals = (
    spark.read.parquet("/project2/wparker/SIPA_data/compressed/RCLIF_vitals_10102023.parquet")
    .withColumn('measured_time', f.to_timestamp('recorded_time', 'yyyy-MM-dd HH:mm:ss'))
    .select('C19_HAR_ID', 'measured_time', 'vital_name', 'vital_value')
)

In [55]:
# Read the cohort CSV (first row is the header), parse the life-support start
# time, and keep encounters whose start falls inside the study period.
# `between` is inclusive on both ends, i.e. (col >= lower) & (col <= upper).
# NOTE(review): the upper bound is 11:59:59 (just before noon) on 2022-03-31;
# if the whole day was meant to be included this should probably be 23:59:59.
# Left unchanged here because it alters the cohort -- confirm with the author.
cohort = (
    spark.read.option("header", True)
    .csv('/project2/wparker/cohort.csv')
    .withColumn('life_support_start_time',
                f.to_timestamp('life_support_start', 'yyyy-MM-dd HH:mm:ss'))
    .select('C19_HAR_ID', 'life_support_start_time')
    .filter(f.col('life_support_start_time').between('2020-03-01 00:00:00',
                                                     '2022-03-31 11:59:59'))
)

rows = cohort.count()
print(f"DataFrame Rows count : {rows}")

DataFrame Rows count : 11725


In [51]:
# Pair every cohort encounter with its vitals rows and express each measurement
# time as hours relative to the start of life support (epoch seconds / 3600).
# NOTE(review): this is a left join, but encounters with no matching vitals get
# a null `hour_diff` and are dropped by the window filter below, so the result
# is the same as an inner join. Confirm that dropping them is intended.
cohort_vitals = (
    cohort.join(vitals, on='C19_HAR_ID', how='left')
    .withColumn(
        "hour_diff",
        (f.col("measured_time").cast("long")
         - f.col("life_support_start_time").cast("long")) / (60*60),
    )
)

# Keep measurements in the window (-42, 5] hours around life-support start...
# NOTE(review): that window spans 47 hours although the variable is named `_48`.
in_hour_window = (f.col('hour_diff') > -42) & (f.col('hour_diff') <= 5)
# ...and inside the study period (same bounds as the cohort filter, including
# the 11:59:59 upper bound).
in_study_period = ((f.col('measured_time') >= '2020-03-01 00:00:00') &
                   (f.col('measured_time') <= '2022-03-31 11:59:59'))
cohort_vitals_48 = cohort_vitals.filter(in_hour_window).filter(in_study_period)

# Drop the Python references to the intermediate frames. Nothing above is
# cached, so this only removes the names; re-run the loading cells before
# re-running this cell.
del vitals, cohort_vitals, cohort
gc.collect()

932

In [52]:
# Replace the measurement timestamp with its calendar date and hour-of-day so
# that measurements can be grouped into one-hour blocks; `hour_diff` and
# `measured_time` are not carried forward.
cohort_vitals_48 = cohort_vitals_48.select(
    'C19_HAR_ID',
    'life_support_start_time',
    f.to_date(f.col('measured_time')).alias('meas_date'),
    f.hour(f.col('measured_time')).alias('meas_hour'),
    'vital_name',
    'vital_value',
)

In [53]:
# One output row per encounter and hour block; one pair of columns per vital.
group_cols = ["C19_HAR_ID", "life_support_start_time", "meas_date", "meas_hour"]

# With two aliased aggregations the pivot yields columns named
# "<vital_name>_min" and "<vital_name>_max", which the summary cell relies on.
# NOTE(review): min/max compare `vital_value` in its stored type; if the parquet
# column is a string the ordering is lexicographic -- confirm it is numeric.
hourly_extremes = [
    f.min('vital_value').alias("min"),
    f.max('vital_value').alias("max"),
]

cohort_vitals_wide = (
    cohort_vitals_48.groupBy(group_cols)
    .pivot("vital_name")
    .agg(*hourly_extremes)
    .orderBy(group_cols)
)

                                                                                

In [46]:
# Persist the hourly-blocked wide table, replacing any earlier output at this path.
blocked_writer = cohort_vitals_wide.write.mode("overwrite")
blocked_writer.parquet("/project2/wparker/SIPA_data/cohort_vitals_48_blocked.parquet")

                                                                                

In [59]:
# Collapse the hourly blocks to one row per encounter: the lowest hourly minimum
# and the highest hourly maximum of each vital across the whole window.
group_cols = ["C19_HAR_ID", "life_support_start_time"]

# Vitals to summarise; each must exist as "<name>_min" / "<name>_max" columns
# in `cohort_vitals_wide`.
vital_names = ["MAP", "dbp", "sbp", "heart_rate", "respiratory_rate", "spO2", "temp"]

# Build min-of-mins and max-of-maxes for every vital, in the order
# <vital>_min, <vital>_max, so the output column order matches the previous
# hand-written list.
summary_aggs = [
    agg_fn(f"{vital}_{suffix}").alias(f"{vital}_{suffix}")
    for vital in vital_names
    for suffix, agg_fn in (("min", f.min), ("max", f.max))
]

# Both actions below (count and write) would otherwise re-run the entire
# join -> pivot -> aggregate lineage. The summary has one row per encounter,
# so caching it is cheap and lets the write reuse the rows produced by count().
cohort_vitals_wide_summary = (
    cohort_vitals_wide.groupBy(group_cols)
    .agg(*summary_aggs)
    .orderBy("life_support_start_time")
    .cache()
)

rows = cohort_vitals_wide_summary.count()
print(f"DataFrame Rows count : {rows}")

cohort_vitals_wide_summary.write.parquet("/project2/wparker/SIPA_data/cohort_vitals_48_summary.parquet", mode="overwrite")

                                                                                

DataFrame Rows count : 11624


                                                                                