The paper reports several statistics and figures; the goal of this notebook is to walk through how those statistics were generated.

In [1]:
# `os.path.join` is used by the data-loading cells below, so import `os` here
# rather than relying on the kernel to have pre-imported it.
import os

# NOTE(review): `spark` is used by later cells but is never created in this
# notebook — presumably it is injected by a PySpark kernel; confirm.

# Root directory holding the EMR parquet tables (Patient, Demographics, ICU, ...).
pathtoEMR='/fs/ess/scratch/PAS2164/'
# Directory holding the parquet tables read via pathtoCS (Alarm, Measurement, Waveform).
pathtoCS='/fs/ess/scratch/PAS2164/CarescapeComb/Temp2/'

How many Hospital Admissions?
Every row of Patient.parquet is a hospital visit.

In [27]:
# Load the visit table: each row of Patient.parquet is one hospital visit.
patientParquet = os.path.join(pathtoEMR, "Patient.parquet")
dfPatient = spark.read.parquet(patientParquet)

# The row count is therefore the number of hospital admissions.
dfPatient.count()

15416

What is the Date Range?

In [3]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import min, max`, which would replace Python's
# built-in min()/max() for every cell that runs afterwards.
from pyspark.sql import functions as F

# NOTE(review): this reports the earliest *discharge* date and the latest
# *admission* date. If the intended range is first admission -> last discharge,
# the two column names should be swapped — confirm against the paper.
dfPatient.select(F.min('offsetHospDischDate')).show()
dfPatient.select(F.max('offsetHospAdmsnDate')).show()

+------------------------+
|min(offsetHospDischDate)|
+------------------------+
|              2019-04-02|
+------------------------+

+------------------------+
|max(offsetHospAdmsnDate)|
+------------------------+
|              2020-09-05|
+------------------------+



Number of unique patients?

In [4]:
# Collapse the visit table to one row per csID, then count those rows.
uniquePatientIds = dfPatient.select("csID").distinct()
uniquePatientIds.count()

8674

% Male Population (reported below as a fraction of unique patients)

In [5]:
# Attach demographics to every visit row, matching on csID, and drop the
# duplicate csID column that the join brings in from the demographics side.
dfDemographics = spark.read.parquet(os.path.join(pathtoEMR, "Demographics.parquet"))
keyConditions = [dfDemographics["csID"] == dfPatient["csID"]]
dfPatDemo = (
    dfPatient
    .join(dfDemographics, on=keyConditions, how="inner")
    .drop(dfDemographics.csID)
)

# One row per distinct (csID, Gender) pair, so repeat visits by the same
# patient are not counted more than once.
patientGender = dfPatDemo.select('csID', 'Gender').distinct()
totalDemo = patientGender.count()
maleDemo = patientGender.filter(patientGender['Gender'] == "M").count()

# Fraction (not percentage) of those pairs whose Gender is "M".
maleDemo / totalDemo

0.5782854505595939

Median Hospital Length of Stay (LOS), in hours

In [6]:
import pandas as pd
import numpy as np

# Pull the visit table into pandas and clean LOS_Hours in one chain:
#   1. coerce to numeric (unparseable values become NaN),
#   2. drop the rows where that left LOS_Hours missing,
#   3. cast to int (any fractional part is discarded).
pddfPatient = (
    dfPatient.toPandas()
    .assign(LOS_Hours=lambda frame: pd.to_numeric(frame['LOS_Hours'], errors='coerce'))
    .dropna(subset=['LOS_Hours'])
    .astype({'LOS_Hours': 'int'})
)

# Summary statistics; the 50% row is the median length of stay in hours.
pddfPatient['LOS_Hours'].describe()

count    15330.000000
mean       199.497391
std        244.791483
min          0.000000
25%         75.000000
50%        128.000000
75%        235.000000
max       8931.000000
Name: LOS_Hours, dtype: float64

Median ICU LOS

In [7]:
# `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, so use
# the standard-library class directly. Aliased so no existing `datetime` name
# in the kernel is rebound.
from datetime import datetime as py_datetime

dfICU = spark.read.parquet(os.path.join(pathtoEMR,"ICU.parquet"))
pddfICU = dfICU.toPandas()

# Parse the 'HH:MM:SS' values into datetime.time objects.
for timeCol in ('offsetStartTime', 'offsetEndTime'):
    pddfICU[timeCol] = pd.to_datetime(pddfICU[timeCol], format='%H:%M:%S').dt.time

# Combine each date column with its time-of-day column into one timestamp.
# assumes offsetStartDate / offsetEndDate hold datetime.date values — TODO confirm
pddfICU['offsetStartDatetime'] = pddfICU.apply(
    lambda r: py_datetime.combine(r['offsetStartDate'], r['offsetStartTime']), axis=1)
pddfICU['offsetEndDatetime'] = pddfICU.apply(
    lambda r: py_datetime.combine(r['offsetEndDate'], r['offsetEndTime']), axis=1)

# Distribution of ICU stay durations; the 50% row is the median ICU LOS.
(pddfICU['offsetEndDatetime']-pddfICU['offsetStartDatetime']).describe()



count                        17750
mean     4 days 07:28:34.904788732
std      5 days 06:47:46.399328600
min                0 days 00:01:00
25%                1 days 09:36:15
50%                2 days 21:08:30
75%                5 days 11:00:45
max              187 days 21:09:00
dtype: object

Charted Events per Hospital Visit

In [8]:
# Average number of Flowsheet rows per hospital visit (row of dfPatient).
flowsheetParquet = os.path.join(pathtoEMR, "Flowsheet.parquet")
dfFlowsheet = spark.read.parquet(flowsheetParquet)
flowsheetRows, visitRows = dfFlowsheet.count(), dfPatient.count()
flowsheetRows / visitRows

779.5716139076285

Lab Measurements per Hospital Visit

In [9]:
# Average number of Labs rows per hospital visit (row of dfPatient).
labsParquet = os.path.join(pathtoEMR, "Labs.parquet")
dfLabs = spark.read.parquet(labsParquet)
labRows = dfLabs.count()
labRows / dfPatient.count()

266.5773871302543

Number of Alarms per Hospital Visit

In [10]:
# The distinct csID values act as a filter: the inner join keeps only alarm
# rows whose csID also appears in the Patient table.
distPat = dfPatient.select("csID").distinct()
dfAlarm = spark.read.parquet(os.path.join(pathtoCS, "Alarm.parquet"))
keyConditions = [distPat["csID"] == dfAlarm["csID"]]
dfPatAlarm = distPat.join(dfAlarm, on=keyConditions, how="inner")

# Matched alarm rows divided by the number of hospital visits.
dfPatAlarm.count() / dfPatient.count()

28193.577062791905

Number of Measurements / Visit

In [11]:
# Keep only measurement rows whose csID is among the distinct patient IDs
# (distPat comes from the alarm cell above), then average over visits.
measurementParquet = os.path.join(pathtoCS, "Measurement.parquet")
dfMeasurements = spark.read.parquet(measurementParquet)
keyConditions = [distPat["csID"] == dfMeasurements["csID"]]
dfPatMeas = distPat.join(dfMeasurements, on=keyConditions, how="inner")

# Matched measurement rows divided by the number of hospital visits.
dfPatMeas.count() / dfPatient.count()

1020071.9074987026

Number of Waveforms / visit

In [12]:
# Keep only waveform rows whose csID is among the distinct patient IDs
# (distPat comes from the alarm cell above), then average over visits.
waveformParquet = os.path.join(pathtoCS, "Waveform.parquet")
dfWave = spark.read.parquet(waveformParquet)
keyConditions = [distPat["csID"] == dfWave["csID"]]
dfPatWave = distPat.join(dfWave, on=keyConditions, how="inner")

# Matched waveform rows divided by the number of hospital visits.
dfPatWave.count() / dfPatient.count()

524744.182991697

Primary and Secondary Diagnosis

In [13]:
from pyspark.sql.functions import desc

# Visit count is the denominator for every share printed below; compute it
# once instead of launching a separate Spark count for each print.
totalVisits = dfPatient.count()

# (printed label, diagnosis column) pairs, in the order they are reported.
diagnosisColumns = (("primary", "Primary_ICD10"), ("Secondary", "Secondary_ICD10"))

# Frequency table per diagnosis column, most common value first.
rankedCodes = {
    icdCol: dfPatient.groupBy(icdCol).count().sort(desc("count"))
    for label, icdCol in diagnosisColumns
}

for label, icdCol in diagnosisColumns:
    rankedCodes[icdCol].show(10)

# Share of visits for the three most frequent non-null codes. The codes and
# counts are read from the frequency tables rather than typed in by hand, so
# they cannot drift out of sync with the data.
for label, icdCol in diagnosisColumns:
    print(label)
    ranked = rankedCodes[icdCol]
    for row in ranked.where(ranked[icdCol].isNotNull()).take(3):
        print(row[icdCol], row["count"] / totalVisits)

+-------------+-----+
|Primary_ICD10|count|
+-------------+-----+
|         null| 3362|
|      Z98.890|  339|
|       I50.23|  303|
|       G89.18|  276|
|        G89.3|  204|
|        Z95.1|  198|
|       J96.01|  197|
|       R53.81|  145|
|        N17.9|  144|
|          J90|  143|
+-------------+-----+
only showing top 10 rows

+-----------------+-----+
|  Secondary_ICD10|count|
+-----------------+-----+
|             null| 2435|
|            Z51.5|  610|
|            Z95.2|  353|
|          Z98.890|  342|
|           Z51.11|  178|
|           R78.81|  170|
|           Z71.89|  163|
| unspecified type|  150|
|           G89.18|  149|
|           R53.81|  140|
+-----------------+-----+
only showing top 10 rows

primary
Z98.890 0.021990140114167098
I50.23 0.019654903995848468
G89.18 0.017903476907109497
Secondary
Z51.5 0.03956927867151012
Z95.2 0.022898287493513234
Z98.890 0.022184743124026986


In [30]:
# pdx1 = first character of the primary ICD-10 code.
dfPatient2 = dfPatient.withColumn("pdx1", dfPatient["Primary_ICD10"].substr(1, 1))

# Visits per leading character, most frequent first.
pdx1Counts = dfPatient2.groupBy("pdx1").count()
pdx1Counts.orderBy(desc("count")).show(100)

+----+-----+
|pdx1|count|
+----+-----+
|null| 3362|
|   I| 2534|
|   C| 2428|
|   R| 1426|
|   Z| 1245|
|   J|  863|
|   G|  788|
|   K|  487|
|   N|  456|
|   D|  446|
|   E|  397|
|   T|  215|
|   A|  172|
|   S|  149|
|   M|  147|
|   F|   73|
|   L|   60|
|   H|   42|
|   B|   42|
|   Q|   41|
|   O|   17|
|   W|   16|
|   U|   10|
+----+-----+



In [31]:
# sdx1 = first character of the secondary ICD-10 code.
# The column is taken from dfPatient, the frame being extended, not from
# dfPatient2, so this cell no longer depends on the pdx1 cell having run.
dfPatient2=dfPatient.withColumn("sdx1", dfPatient.Secondary_ICD10.substr(1,1))

# NOTE(review): substr(1,1) does not trim whitespace, so values that begin
# with a space (e.g. " unspecified type") are grouped under a blank key.
dfPatient2.groupBy("sdx1").count().sort(desc("count")).show(100)

+----+-----+
|sdx1|count|
+----+-----+
|   Z| 3526|
|   R| 2569|
|null| 2435|
|    | 1826|
|   I| 1268|
|   C|  601|
|   N|  423|
|   G|  418|
|   J|  363|
|   K|  340|
|   T|  258|
|   M|  249|
|   D|  248|
|   S|  238|
|   E|  164|
|   A|  117|
|   L|   80|
|   P|   53|
|   Q|   50|
|   H|   46|
|   W|   46|
|   F|   34|
|   B|   29|
|   O|   17|
|   U|   15|
|   V|    2|
|   2|    1|
+----+-----+



In [34]:
# Row count per measurement name (mesname), most frequent first.
mesnameCounts = dfMeasurements.groupBy("mesname").count()
mesnameCounts.orderBy(desc("count")).show(100)

+--------------------+----------+
|             mesname|     count|
+--------------------+----------+
|           TZ_Offset|1425815659|
|     ecg-heartRate#1|1425610338|
|     ecg-v_p_cRate#1|1422869369|
|ecg-stDeviation-II#1|1412328927|
|ecg-stDeviation-I...|1412157993|
|ecg-stDeviation-A...|1411948050|
| ecg-stDeviation-I#1|1411876703|
|ecg-stDeviation-A...|1411811864|
|ecg-stDeviation-A...|1411643763|
|  nbp-cuffPressure#1|1407023589|
|ecg-stDeviation-V1#1|1406068436|
|    spO2-pulseRate#1|1197963803|
|        spO2-satO2#1|1195115573|
|spO2-signalStreng...|1194002531|
|  ecgResp-respRate#1| 784612386|
|ecg-stDeviation-V2#1| 630821165|
|ecg-stDeviation-V3#1| 628077023|
|ecg-stDeviation-V5#1| 627089752|
|ecg-stDeviation-V4#1| 626964432|
|ecg-stDeviation-V6#1| 626720818|
|         temp-temp#1| 153730617|
|         temp-temp#2| 153698896|
|           ip-mean#1| 152520584|
|       ip-systolic#1| 147595765|
|      ip-diastolic#1| 147164516|
|      ip-pulseRate#1| 123225084|
|           ip

In [36]:
# Row count per waveform name (mgname), most frequent first.
mgnameCounts = dfWave.groupBy("mgname").count()
mgnameCounts.orderBy(desc("count")).show(100)

+--------------------+----------+
|              mgname|     count|
+--------------------+----------+
|            ecg-II#1|1426711442|
|           ecg-III#1|1426651454|
|             ecg-I#1|1426640077|
|           ecg-AVF#1|1426623302|
|           ecg-AVL#1|1426623283|
|           ecg-AVR#1|1426623249|
|            ecg-V1#1|1413570365|
|              spO2#1|1194563771|
|           ecgResp#1| 766205434|
|                ip#1| 152494410|
|                ip#2| 105883629|
|                ip#3|  22383020|
|                ip#4|  16711772|
|            ecg-V2#1|   8957818|
|            ecg-V3#1|   2453665|
|gasMon-respGas-CO2#1|   1368902|
|            ecg-V5#1|   1019466|
|            ecg-V4#1|    454206|
|            ecg-V6#1|    152917|
+--------------------+----------+

