This file converts the Research Identified Files into Limited Dataset Files.

Author: John Lawrence, Date 3/7/2023

In [None]:
from pyspark.sql.functions import monotonically_increasing_id,rand,col,round,to_timestamp,date_sub,expr,date_format,lit

In [2]:
#Load the raw Carescape extracts: one parquet per hospital (Ross / James), year, and table type.
#Naming: tdf<a|w|m><r|j><yy> = temp dataframe, alarm/wave/measurement, ross/james, 2-digit year.

#Alarm tables
tdfar19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2019Alarm.parquet')
tdfar20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2020Alarm.parquet')
tdfaj19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2019Alarm.parquet')
tdfaj20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2020Alarm.parquet')

#Waveform tables
tdfwr19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2019Wave.parquet')
tdfwr20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2020Wave.parquet')
tdfwj19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2019Wave.parquet')
tdfwj20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2020Wave.parquet')

#Measurement tables
tdfmr19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2019Measurement.parquet')
tdfmr20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2020Measurement.parquet')
tdfmj19 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2019Measurement.parquet')
tdfmj20 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2020Measurement.parquet')

#The 2021 extracts are currently not loaded; un-comment to include them.
#tdfar21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2021Alarm.parquet')
#tdfwr21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2021Wave.parquet')
#tdfmr21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/Ross2021Measurement.parquet')
#tdfaj21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2021Alarm.parquet')
#tdfwj21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2021Wave.parquet')
#tdfmj21 = spark.read.parquet('/fs/ess/scratch/PAS2164/CarescapeComb/James2021Measurement.parquet')

#If this is not the first time running this notebook, load the existing bed and patient maps
#so that the identifiers and date offsets assigned previously are preserved.
#NOTE(review): the cells further down that build dfBed / dfcsid from scratch rebind these two
#names, so on a re-run those cells must be skipped for the saved codes to survive. On a first
#run these two reads presumably fail because the map files do not exist yet - confirm.
dfcsid = spark.read.parquet('/fs/ess/scratch/PAS2164/PatientMap.parquet')
dfBed = spark.read.parquet('/fs/ess/scratch/PAS2164/LocationMap.parquet')

In [3]:
#Stack the yearly extracts per hospital, then stack the two hospitals, so that we end up with
#one dataframe per table type. unionByName lines columns up by name rather than by position.
#Naming: tdf<a|w|m><r|j>2 = per-hospital frame (r = Ross, j = James); tempdf<a|w|m> = both hospitals.
#To add 2021, union the tdf*21 frames into the matching tdf*2 frame once they are loaded.

#Alarms
tdfar2 = tdfar19.unionByName(tdfar20)
tdfaj2 = tdfaj19.unionByName(tdfaj20)
tempdfa = tdfar2.unionByName(tdfaj2)

#Waveforms
tdfwr2 = tdfwr19.unionByName(tdfwr20)
tdfwj2 = tdfwj19.unionByName(tdfwj20)
tempdfw = tdfwr2.unionByName(tdfwj2)

#Measurements
tdfmr2 = tdfmr19.unionByName(tdfmr20)
tdfmj2 = tdfmj19.unionByName(tdfmj20)
tempdfm = tdfmr2.unionByName(tdfmj2)

#At this point there are 3 dataframes (tempdfa, tempdfw, tempdfm), one per data table.


In [4]:
#Build the bed (location) map: every distinct care area / room / bed combination that appears
#in any of the three tables gets a surrogate csBedID.
bedCols=["assignedLocationCareArea","assignedLocationRoom","assignedLocationBed"]
bedA=tempdfa.select(*bedCols).distinct()
bedW=tempdfw.select(*bedCols).distinct()
bedM=tempdfm.select(*bedCols).distinct()
#Union the three bed lists into 1 dataset
dfBed1=bedA.unionByName(bedW)
dfBed2=dfBed1.unionByName(bedM)
#Then reduce it to the unique beds
dfBed3=dfBed2.distinct()
#And generate a unique ID for each bed.
#monotonically_increasing_id() depends on partition IDs and row order, which are not guaranteed
#to be the same every time Spark re-evaluates this lazy plan after the distinct() shuffle. dfBed
#is used by several separate actions below (three joined outputs plus the saved map), so it is
#persisted to keep the same csBedID on the same bed in all of them.
#NOTE(review): persist() is best effort (lost partitions are recomputed); writing the map out
#and reading it back before joining would be a stronger guarantee.
#NOTE(review): running this cell replaces any dfBed loaded from LocationMap.parquet earlier.
dfBed=dfBed3.withColumn("csBedID", monotonically_increasing_id()).persist()
#Note, while I planned to mask beds, the beds in this list and the hospital beds didn't actually match so it ended up being moot.

In [5]:
#Build the patient map the same way as the bed map: collect the distinct patient identifiers
#(id + type) from each table, union them, and reduce to the distinct set.
csidCols=['patientIdPrimary-id','patientIdPrimary-type']
csidA=tempdfa.select(*csidCols).distinct()
csidW=tempdfw.select(*csidCols).distinct()
csidM=tempdfm.select(*csidCols).distinct()
dfcsid1=csidA.unionByName(csidW)
dfcsid2=dfcsid1.unionByName(csidM)
dfcsid3=dfcsid2.distinct()
#In addition to a unique patient identifier, each patient gets a random whole-day date offset.
#rand() is uniform on [0,1), so round(rand()*28-14) is an integer from -14 to 14 days.
#NOTE(review): an Offset of 0 is possible, which leaves that patient's dates unshifted.
#NOTE(review): rand() is unseeded, so a rebuilt map never reproduces a previous one.
dfcsid4=dfcsid3.withColumn("Offset",round((rand()*28)-14).cast("Integer"))
#rand() and monotonically_increasing_id() depend on partitioning and row order, which are not
#guaranteed to be the same every time Spark re-evaluates this lazy plan after the distinct()
#shuffle. dfcsid feeds several separate actions below (three joined outputs plus the saved map),
#so it is persisted to keep one csID and one Offset per patient across all of them.
#NOTE(review): persist() is best effort (lost partitions are recomputed); writing the map out
#and reading it back before joining would be a stronger guarantee.
#NOTE(review): running this cell replaces any dfcsid loaded from PatientMap.parquet earlier.
dfcsid=dfcsid4.withColumn("csID", monotonically_increasing_id()).persist()


In [4]:
def _with_patient_codes(df, patient_map):
    """Inner-join df to the patient map so each row picks up its csID and Offset.

    The map holds one row per distinct (patientIdPrimary-id, patientIdPrimary-type) pair, so
    the join has to use both columns. Joining on the id alone would duplicate every row of a
    patient whose id appears under more than one type, once per csID / Offset. The type is
    compared null-safely so rows whose type is null still match their own map row. The id is
    compared with plain equality, so rows with a null id are dropped.
    """
    same_patient = (
        (df["patientIdPrimary-id"] == patient_map["patientIdPrimary-id"])
        & df["patientIdPrimary-type"].eqNullSafe(patient_map["patientIdPrimary-type"])
    )
    return df.join(patient_map, same_patient, "Inner")


def _with_bed_codes(df, bed_map):
    """Inner-join df to the location map on care area, room and bed to pick up csBedID.

    NOTE(review): plain equality never matches nulls, so rows where any of the three location
    columns is null are dropped by this join - confirm that is intended.
    """
    same_bed = [
        df[name] == bed_map[name]
        for name in ("assignedLocationCareArea", "assignedLocationRoom", "assignedLocationBed")
    ]
    return df.join(bed_map, same_bed, "Inner")


#Map the original 3 tables to the patient map
tempdfa1=_with_patient_codes(tempdfa, dfcsid)
tempdfw1=_with_patient_codes(tempdfw, dfcsid)
tempdfm1=_with_patient_codes(tempdfm, dfcsid)
#And to the location map.
tempdfa2=_with_bed_codes(tempdfa1, dfBed)
tempdfw2=_with_bed_codes(tempdfw1, dfBed)
tempdfm2=_with_bed_codes(tempdfm1, dfBed)
#Keep only the codes, the offset, the poll time and the payload columns of each table;
#the original patient and location identifiers are dropped here.
tempdfa3=tempdfa2.select('csID','csBedID',"Offset",'polltime','alarmName','abnormalFlags','inactivationState','sil','setLow','setHigh','chanValue')
tempdfm3=tempdfm2.select('csID','csBedID',"Offset",'polltime','mesname','msite','muom','mtext')
tempdfw3=tempdfw2.select('csID','csBedID',"Offset",'polltime','mgname','mgGain','mgHZ','mgwave','mguom','mgsite','mgscale','mginvalid','mgmissing','mgPoints','mgPointsBytes','mgMin','mgMax','mgOffset')

At this point there are 5 new datasets

tempdfa3, tempdfm3, and tempdfw3, which are the coded limited datasets (alarms, measurements, and waveforms)


dfBed and dfcsid which are the maps to the codes (one to their location, one to their identity)


For this to become a deidentified dataset, the next step is to shift the dates in polltime. 

In [5]:
#Derive the date/time columns for each table. The same four steps are applied to every table:
#  polltimestamp - polltime parsed as a timestamp
#  offsetDate    - the poll date moved back by the patient's Offset (in days)
#  PollDate      - the real (unshifted) poll date; date_sub(...,0) just reduces the timestamp to a date
#  offsetTime    - the time of day as HH:mm:ss, taken unchanged from polltimestamp
#NOTE(review): HH:mm:ss discards any sub-second part of polltime - confirm that is acceptable.

#Alarms
tempdfa4 = tempdfa3.withColumn("polltimestamp", to_timestamp("polltime"))
tempdfa5 = tempdfa4.withColumn("offsetDate", expr("date_sub(polltimestamp,Offset)"))
tempdfa6 = tempdfa5.withColumn("PollDate", expr("date_sub(polltimestamp,0)"))
tempdfa7 = tempdfa6.withColumn('offsetTime', date_format('polltimestamp', 'HH:mm:ss'))

#Waveforms
tempdfw4 = tempdfw3.withColumn("polltimestamp", to_timestamp("polltime"))
tempdfw5 = tempdfw4.withColumn("offsetDate", expr("date_sub(polltimestamp,Offset)"))
tempdfw6 = tempdfw5.withColumn("PollDate", expr("date_sub(polltimestamp,0)"))
tempdfw7 = tempdfw6.withColumn('offsetTime', date_format('polltimestamp', 'HH:mm:ss'))

#Measurements
tempdfm4 = tempdfm3.withColumn("polltimestamp", to_timestamp("polltime"))
tempdfm5 = tempdfm4.withColumn("offsetDate", expr("date_sub(polltimestamp,Offset)"))
tempdfm6 = tempdfm5.withColumn("PollDate", expr("date_sub(polltimestamp,0)"))
tempdfm7 = tempdfm6.withColumn('offsetTime', date_format('polltimestamp', 'HH:mm:ss'))


At this point we have a completed coded dataset for all of the carescape data; however, I only have EMR data for a 9 month window, and the dataset is already enormous, so I filter it to only have that 9 month window. 

In [6]:
#Keep only rows whose real (unshifted) poll date falls inside the window covered by the EMR
#data. Both ends are inclusive; between(lo, hi) is the same test as (x >= lo) & (x <= hi).
windowStart = "2019-11-23"
windowEnd = "2020-08-23"

filterLogic = tempdfm7['PollDate'].between(windowStart, windowEnd)
tempdfm8 = tempdfm7.filter(filterLogic)

filterLogic = tempdfa7['PollDate'].between(windowStart, windowEnd)
tempdfa8 = tempdfa7.filter(filterLogic)

filterLogic = tempdfw7['PollDate'].between(windowStart, windowEnd)
tempdfw8 = tempdfw7.filter(filterLogic)

Finally, I remove the identified variables (the original poll time, the real poll date, and the offset), making this a coded limited dataset.

In [7]:
#Final column sets: the two codes, the shifted date, the time of day, and each table's payload.
#polltime, polltimestamp, PollDate and Offset are left out on purpose.
tempdfa9 = tempdfa8.select(
    'csID', 'csBedID', 'offsetDate', 'offsetTime',
    'alarmName', 'abnormalFlags', 'inactivationState', 'sil', 'setLow', 'setHigh', 'chanValue',
)
tempdfw9 = tempdfw8.select(
    'csID', 'csBedID', 'offsetDate', 'offsetTime',
    'mgname', 'mgGain', 'mgHZ', 'mgwave', 'mguom', 'mgsite', 'mgscale', 'mginvalid',
    'mgmissing', 'mgPoints', 'mgPointsBytes', 'mgMin', 'mgMax', 'mgOffset',
)
tempdfm9 = tempdfm8.select(
    'csID', 'csBedID', 'offsetDate', 'offsetTime',
    'mesname', 'msite', 'muom', 'mtext',
)

tempdfa9, tempdfw9, and tempdfm9 are the 3 final datasets; together with dfBed and dfcsid they make up the coded limited dataset.

All that's left now is to write these out to their final files.

In [8]:
#Write the coded alarm table, replacing any previous output at this path.
(
    tempdfa9.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .parquet('/fs/ess/scratch/PAS2164/Alarms.parquet')
)

In [9]:
#Write the coded waveform table, replacing any previous output at this path.
(
    tempdfw9.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .parquet('/fs/ess/scratch/PAS2164/Waveforms.parquet')
)

In [10]:
#Write the coded measurement table, replacing any previous output at this path.
#NOTE(review): the measurement table is saved as Messages.parquet - confirm the file name is intended.
(
    tempdfm9.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .parquet('/fs/ess/scratch/PAS2164/Messages.parquet')
)

In [24]:
#Save the two code maps (location and patient) so that a later run can reuse the same codes and offsets.
#NOTE(review): if dfBed / dfcsid were loaded from these same paths earlier in the notebook rather
#than rebuilt, Spark may refuse to overwrite a path it is still reading from - confirm before re-running.
(
    dfBed.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .parquet('/fs/ess/scratch/PAS2164/LocationMap.parquet')
)
(
    dfcsid.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .parquet('/fs/ess/scratch/PAS2164/PatientMap.parquet')
)

Before the release of the dataset, dfBed and dfcsid (and their saved map files) will be deleted, converting this dataset from a coded limited dataset to a limited dataset.

With the Coded limited dataset created, the next code to run is the EMR Mapping program.