In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'
    
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        if (conn_info["sqlhost"] == '192.168.60.144') & (conn_info["sqlport"]=='6432'):
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])            
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except:
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
from sqlalchemy import create_engine,text
con= create_engine('postgresql://eicu@192.168.60.144:6432/eicu')

In [4]:
datadir = '/home/mei/nas/docker/dataset/EICU/eicu-collaborative-research-database-2.0/'
porcesseddir = '/home/mei/nas/docker/processedData/'

## lab

In [22]:
create_table_query =  query_schema +"""
CREATE TABLE pivoted_bg as
-- get blood gas measures
with vw0 as
(
  select
      patientunitstayid
    , labname
    , labresultoffset
    , labresultrevisedoffset
  from lab
  where labname in
  (
        'paO2'
      , 'paCO2'
      , 'pH'
      , 'FiO2'
      , 'anion gap'
      , 'Base Deficit'
      , 'Base Excess'
      , 'PEEP'
  )
  group by patientunitstayid, labname, labresultoffset, labresultrevisedoffset
  having count(distinct labresult)<=1
)
-- get the last lab to be revised
, vw1 as
(
  select
      lab.patientunitstayid
    , lab.labname
    , lab.labresultoffset
    , lab.labresultrevisedoffset
    , lab.labresult
    , ROW_NUMBER() OVER
        (
          PARTITION BY lab.patientunitstayid, lab.labname, lab.labresultoffset
          ORDER BY lab.labresultrevisedoffset DESC
        ) as rn
  from lab
  inner join vw0
    ON  lab.patientunitstayid = vw0.patientunitstayid
    AND lab.labname = vw0.labname
    AND lab.labresultoffset = vw0.labresultoffset
    AND lab.labresultrevisedoffset = vw0.labresultrevisedoffset
  WHERE
     (lab.labname = 'paO2' and lab.labresult >= 15 and lab.labresult <= 720)
  OR (lab.labname = 'paCO2' and lab.labresult >= 5 and lab.labresult <= 250)
  OR (lab.labname = 'pH' and lab.labresult >= 6.5 and lab.labresult <= 8.5)
  OR (lab.labname = 'FiO2' and lab.labresult >= 0.2 and lab.labresult <= 1.0)
  -- we will fix fio2 units later
  OR (lab.labname = 'FiO2' and lab.labresult >= 20 and lab.labresult <= 100)
  OR (lab.labname = 'anion gap' and lab.labresult >= 0 and lab.labresult <= 300)
  OR (lab.labname = 'Base Deficit' and lab.labresult >= -100 and lab.labresult <= 100)
  OR (lab.labname = 'Base Excess' and lab.labresult >= -100 and lab.labresult <= 100)
  OR (lab.labname = 'PEEP' and lab.labresult >= 0 and lab.labresult <= 60)
)
select
    patientunitstayid
  , labresultoffset as chartoffset
  -- the aggregate (max()) only ever applies to 1 value due to the where clause
  , MAX(case
        when labname != 'FiO2' then null
        when labresult >= 20 then labresult/100.0
      else labresult end) as fio2
  , MAX(case when labname = 'paO2' then labresult else null end) as pao2
  , MAX(case when labname = 'paCO2' then labresult else null end) as paco2
  , MAX(case when labname = 'pH' then labresult else null end) as pH
  , MAX(case when labname = 'anion gap' then labresult else null end) as aniongap
  , MAX(case when labname = 'Base Deficit' then labresult else null end) as basedeficit
  , MAX(case when labname = 'Base Excess' then labresult else null end) as baseexcess
  , MAX(case when labname = 'PEEP' then labresult else null end) as peep
from vw1
where rn = 1
group by patientunitstayid, labresultoffset
order by patientunitstayid, labresultoffset;
"""

In [None]:
with con.begin() as connection:
    connection.execute(text(create_table_query))  # Use text() to wrap the raw SQL

In [24]:

select_query = "SELECT * FROM pivoted_bg;"
df_bg = pd.read_sql_query(select_query, con)

print(df_bg)

         patientunitstayid  chartoffset  fio2  pao2  paco2     ph  aniongap  \
0                   141168          516   NaN   NaN    NaN    NaN      15.0   
1                   141168         1133   NaN   NaN    NaN    NaN      20.0   
2                   141168         1805  0.28  41.0   46.0  7.140       NaN   
3                   141168         2010  1.00  42.0   44.0  7.140       NaN   
4                   141168         2026   NaN  68.0   31.0  7.160      25.0   
...                    ...          ...   ...   ...    ...    ...       ...   
1464007            3353254         2610   NaN   NaN    NaN    NaN       3.0   
1464008            3353254         4144   NaN   NaN    NaN    NaN       5.0   
1464009            3353254         4237   NaN  57.0   34.3  7.412       NaN   
1464010            3353254         5558   NaN   NaN    NaN    NaN       6.0   
1464011            3353263           -7   NaN   NaN    NaN    NaN       3.0   

         basedeficit  baseexcess  peep  
0         

In [26]:
# df_bg.to_csv(porcesseddir + 'pivoted_bg.csv',  index=True)
# print("Data exported successfully to 'pivoted_bg'.")

Data exported successfully to 'pivoted_bg'.


## nursecharting

In [37]:
gcs_query =  query_schema +"""
WITH nc AS
(
    SELECT
        patientunitstayid,
        nursingchartoffset AS chartoffset,
        MIN(CASE
            WHEN nursingchartcelltypevallabel = 'Glasgow coma score'
             AND nursingchartcelltypevalname = 'GCS Total'
             AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
             AND nursingchartvalue NOT IN ('-', '.')
                THEN CAST(nursingchartvalue AS numeric)
            WHEN nursingchartcelltypevallabel = 'Score (Glasgow Coma Scale)'
             AND nursingchartcelltypevalname = 'Value'
             AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
             AND nursingchartvalue NOT IN ('-', '.')
                THEN CAST(nursingchartvalue AS numeric)
            ELSE NULL END) AS gcs,
        MIN(CASE
            WHEN nursingchartcelltypevallabel = 'Glasgow coma score'
             AND nursingchartcelltypevalname = 'Motor'
             AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
             AND nursingchartvalue NOT IN ('-', '.')
                THEN CAST(nursingchartvalue AS numeric)
            ELSE NULL END) AS gcsmotor,
        MIN(CASE
            WHEN nursingchartcelltypevallabel = 'Glasgow coma score'
             AND nursingchartcelltypevalname = 'Verbal'
             AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
             AND nursingchartvalue NOT IN ('-', '.')
                THEN CAST(nursingchartvalue AS numeric)
            ELSE NULL END) AS gcsverbal,
        MIN(CASE
            WHEN nursingchartcelltypevallabel = 'Glasgow coma score'
             AND nursingchartcelltypevalname = 'Eyes'
             AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
             AND nursingchartvalue NOT IN ('-', '.')
                THEN CAST(nursingchartvalue AS numeric)
            ELSE NULL END) AS gcseyes
    FROM nursecharting
    WHERE nursingchartcelltypecat IN ('Scores', 'Other Vital Signs and Infusions')
    GROUP BY patientunitstayid, nursingchartoffset
),
ncproc AS
(
    SELECT
        patientunitstayid,
        chartoffset,
        CASE WHEN gcs > 2 AND gcs < 16 THEN gcs ELSE NULL END AS gcs,
        gcsmotor, gcsverbal, gcseyes
    FROM nc
)
SELECT
    patientunitstayid,
    chartoffset,
    gcs,
    gcsmotor, gcsverbal, gcseyes
FROM ncproc
WHERE gcs IS NOT NULL
   OR gcsmotor IS NOT NULL
   OR gcsverbal IS NOT NULL
   OR gcseyes IS NOT NULL
ORDER BY patientunitstayid;

"""

In [38]:
df_gcs = pd.read_sql_query(gcs_query, con)


In [39]:
print(df_gcs)

         patientunitstayid  chartoffset   gcs  gcsmotor  gcsverbal  gcseyes
0                   141178         -277  12.0       NaN        NaN      NaN
1                   141179           12   9.0       NaN        NaN      NaN
2                   141179          216  12.0       NaN        NaN      NaN
3                   141179          522  12.0       NaN        NaN      NaN
4                   141179          702  12.0       NaN        NaN      NaN
...                    ...          ...   ...       ...        ...      ...
3451783            3353263         4306  15.0       6.0        5.0      4.0
3451784            3353263         4837  15.0       6.0        5.0      4.0
3451785            3353263         5651  15.0       6.0        5.0      4.0
3451786            3353263         7100  15.0       6.0        5.0      4.0
3451787            3353263         8525  15.0       6.0        5.0      4.0

[3451788 rows x 6 columns]


In [40]:
# df_gcs.to_csv(porcesseddir + 'pivoted_gcs.csv',  index=True)
# print("Data exported successfully to 'pivoted_gcs'.")

Data exported successfully to 'pivoted_gcs'.


## infusiondrug
- 从 infusiondrug 表中提取和处理多种药物（主要是一些用于支持循环系统的药物，如多巴胺、去甲肾上腺素、肾上腺素等）的输注记录，并将这些数据按患者ID和时间进行汇总处理，最终结果存储在一个名为 pivoted_infusion 的表中
- max(case ...) 判断药物是否存在
- 过滤掉那些没有任何药物使用的记录，最终输出包含至少一种药物使用的记录

In [7]:
create_table_infusion =  query_schema +"""
CREATE TABLE pivoted_infusion as
with vw0 as
(
  select
    patientunitstayid
    , infusionoffset
    -- TODO: need dopamine rate
    , max(case when drugname in
              (
                   'Dopamine'
                 , 'Dopamine ()'
                 , 'DOPamine MAX 800 mg Dextrose 5% 250 ml  Premix (mcg/kg/min)'
                 , 'Dopamine (mcg/hr)'
                 , 'Dopamine (mcg/kg/hr)'
                 , 'dopamine (mcg/kg/min)'
                 , 'Dopamine (mcg/kg/min)'
                 , 'Dopamine (mcg/min)'
                 , 'Dopamine (mg/hr)'
                 , 'Dopamine (ml/hr)'
                 , 'Dopamine (nanograms/kg/min)'
                 , 'DOPamine STD 15 mg Dextrose 5% 250 ml  Premix (mcg/kg/min)'
                 , 'DOPamine STD 400 mg Dextrose 5% 250 ml  Premix (mcg/kg/min)'
                 , 'DOPamine STD 400 mg Dextrose 5% 500 ml  Premix (mcg/kg/min)'
                 , 'Dopamine (Unknown)'
              )
              -- note: no rows found for inotropin
                then 1
              else null end
            ) as dopamine

    -- this like statement is pretty reliable - no false positives when I checked
    -- also catches the brand name dobutrex
    , max(case when lower(drugname) like '%dobu%' then 1 else null end) as dobutamine
    , max(case
              when drugname in
              (
                 'Norepinephrine'
               , 'Norepinephrine ()'
               , 'Norepinephrine MAX 32 mg Dextrose 5% 250 ml (mcg/min)'
               , 'Norepinephrine MAX 32 mg Dextrose 5% 500 ml (mcg/min)'
               , 'Norepinephrine (mcg/hr)'
               , 'Norepinephrine (mcg/kg/hr)'
               , 'Norepinephrine (mcg/kg/min)'
               , 'Norepinephrine (mcg/min)'
               , 'Norepinephrine (mg/hr)'
               , 'Norepinephrine (mg/kg/min)'
               , 'Norepinephrine (mg/min)'
               , 'Norepinephrine (ml/hr)'
               , 'Norepinephrine STD 32 mg Dextrose 5% 282 ml (mcg/min)'
               , 'Norepinephrine STD 32 mg Dextrose 5% 500 ml (mcg/min)'
               , 'Norepinephrine STD 4 mg Dextrose 5% 250 ml (mcg/min)'
               , 'Norepinephrine STD 4 mg Dextrose 5% 500 ml (mcg/min)'
               , 'Norepinephrine STD 8 mg Dextrose 5% 250 ml (mcg/min)'
               , 'Norepinephrine STD 8 mg Dextrose 5% 500 ml (mcg/min)'
               , 'Norepinephrine (units/min)'
               , 'Norepinephrine (Unknown)'
               , 'norepinephrine Volume (ml)'
               , 'norepinephrine Volume (ml) (ml/hr)'
               -- levophed
              , 'Levophed (mcg/kg/min)'
              , 'levophed  (mcg/min)'
              , 'levophed (mcg/min)'
              , 'Levophed (mcg/min)'
              , 'Levophed (mg/hr)'
              , 'levophed (ml/hr)'
              , 'Levophed (ml/hr)'
              , 'NSS with LEVO (ml/hr)'
              , 'NSS w/ levo/vaso (ml/hr)'
              )
          then 1 else 0 end) as norepinephrine
    , max(case
          when drugname in
          (
             'Phenylephrine'
           , 'Phenylephrine ()'
           , 'Phenylephrine  MAX 100 mg Sodium Chloride 0.9% 250 ml (mcg/min)'
           , 'Phenylephrine (mcg/hr)'
           , 'Phenylephrine (mcg/kg/min)'
           , 'Phenylephrine (mcg/kg/min) (mcg/kg/min)'
           , 'Phenylephrine (mcg/min)'
           , 'Phenylephrine (mcg/min) (mcg/min)'
           , 'Phenylephrine (mg/hr)'
           , 'Phenylephrine (mg/kg/min)'
           , 'Phenylephrine (ml/hr)'
           , 'Phenylephrine  STD 20 mg Sodium Chloride 0.9% 250 ml (mcg/min)'
           , 'Phenylephrine  STD 20 mg Sodium Chloride 0.9% 500 ml (mcg/min)'
           , 'Volume (ml) Phenylephrine'
           , 'Volume (ml) Phenylephrine ()'
           -- neosynephrine is a synonym
           , 'neo-synephrine (mcg/min)'
           , 'neosynephrine (mcg/min)'
           , 'Neosynephrine (mcg/min)'
           , 'Neo Synephrine (mcg/min)'
           , 'Neo-Synephrine (mcg/min)'
           , 'NeoSynephrine (mcg/min)'
           , 'NEO-SYNEPHRINE (mcg/min)'
           , 'Neosynephrine (ml/hr)'
           , 'neosynsprine'
           , 'neosynsprine (mcg/kg/hr)'
          )
        then 1 else 0 end) as phenylephrine
    , max(case
            when drugname in
            (
                 'EPI (mcg/min)'
               , 'Epinepherine (mcg/min)'
               , 'Epinephrine'
               , 'Epinephrine ()'
               , 'EPINEPHrine(Adrenalin)MAX 30 mg Sodium Chloride 0.9% 250 ml (mcg/min)'
               , 'EPINEPHrine(Adrenalin)STD 4 mg Sodium Chloride 0.9% 250 ml (mcg/min)'
               , 'EPINEPHrine(Adrenalin)STD 4 mg Sodium Chloride 0.9% 500 ml (mcg/min)'
               , 'EPINEPHrine(Adrenalin)STD 7 mg Sodium Chloride 0.9% 250 ml (mcg/min)'
               , 'Epinephrine (mcg/hr)'
               , 'Epinephrine (mcg/kg/min)'
               , 'Epinephrine (mcg/min)'
               , 'Epinephrine (mg/hr)'
               , 'Epinephrine (mg/kg/min)'
               , 'Epinephrine (ml/hr)'
            ) then 1 else 0 end)
          as epinephrine
    , max(case
            when drugname in
            (
                'Vasopressin'
              , 'Vasopressin ()'
              , 'Vasopressin 20 Units Sodium Chloride 0.9% 100 ml (units/hr)'
              , 'Vasopressin 20 Units Sodium Chloride 0.9% 250 ml (units/hr)'
              , 'Vasopressin 40 Units Sodium Chloride 0.9% 100 ml (units/hr)'
              , 'Vasopressin 40 Units Sodium Chloride 0.9% 100 ml (units/kg/hr)'
              , 'Vasopressin 40 Units Sodium Chloride 0.9% 100 ml (units/min)'
              , 'Vasopressin 40 Units Sodium Chloride 0.9% 100 ml (Unknown)'
              , 'Vasopressin 40 Units Sodium Chloride 0.9% 200 ml (units/min)'
              , 'Vasopressin (mcg/kg/min)'
              , 'Vasopressin (mcg/min)'
              , 'Vasopressin (mg/hr)'
              , 'Vasopressin (mg/min)'
              , 'vasopressin (ml/hr)'
              , 'Vasopressin (ml/hr)'
              , 'Vasopressin (units/hr)'
              , 'Vasopressin (units/kg/min)'
              , 'vasopressin (units/min)'
              , 'Vasopressin (units/min)'
              , 'VAsopressin (units/min)'
              , 'Vasopressin (Unknown)'
            ) then 1 else 0 end)
          as vasopressin
    , max(case when drugname in
      (
           'Milrinone'
         , 'Milrinone ()'
         , 'Milrinone (mcg/kg/hr)'
         , 'Milrinone (mcg/kg/min)'
         , 'Milrinone (ml/hr)'
         , 'Milrinone (Primacor) 40 mg Dextrose 5% 200 ml (mcg/kg/min)'
         , 'Milronone (mcg/kg/min)'
         , 'primacore (mcg/kg/min)'
      ) then 1 else 0 end)
      as milrinone
    , max(case when drugname in
      (
          'Hepain (ml/hr)'
        , 'Heparin'
        , 'Heparin ()'
        , 'Heparin 25,000 Unit/D5w 250 ml (ml/hr)'
        , 'Heparin 25000 Units Dextrose 5% 500 ml  Premix (units/hr)'
        , 'Heparin 25000 Units Dextrose 5% 500 ml  Premix (units/kg/hr)'
        , 'Heparin 25000 Units Dextrose 5% 950 ml  Premix (units/kg/hr)'
        , 'HEPARIN #2 (units/hr)'
        , 'Heparin 8000u/1L NS (ml/hr)'
        , 'Heparin-EKOS (units/hr)'
        , 'Heparin/Femoral Sheath   (units/hr)'
        , 'Heparin (mcg/kg/hr)'
        , 'Heparin (mcg/kg/min)'
        , 'Heparin (ml/hr)'
        , 'heparin (units/hr)'
        , 'Heparin (units/hr)'
        , 'HEPARIN (units/hr)'
        , 'Heparin (units/kg/hr)'
        , 'Heparin (Unknown)'
        , 'Heparin via sheath (units/hr)'
        , 'Left  Heparin (units/hr)'
        , 'NSS carrier heparin (ml/hr)'
        , 'S-Heparin (units/hr)'
        , 'Volume (ml) Heparin-heparin 25,000 units in 0.45 % sodium chloride 500 mL infusion'
        , 'Volume (ml) Heparin-heparin 25,000 units in 0.45 % sodium chloride 500 mL infusion (ml/hr)'
        , 'Volume (ml) Heparin-heparin 25,000 units in dextrose 500 mL infusion'
        , 'Volume (ml) Heparin-heparin 25,000 units in dextrose 500 mL infusion (ml/hr)'
        , 'Volume (ml) Heparin-heparin infusion 2 units/mL in 0.9% sodium chloride (ARTERIAL LINE)'
        , 'Volume (ml) Heparin-heparin infusion 2 units/mL in 0.9% sodium chloride (ARTERIAL LINE) (ml/hr)'
      ) then 1 else 0 end)
      as heparin
  from infusiondrug
  group by patientunitstayid, infusionoffset
)
select
  patientunitstayid
  , infusionoffset as chartoffset
  , dopamine::SMALLINT as dopamine
  , dobutamine::SMALLINT as dobutamine
  , norepinephrine::SMALLINT as norepinephrine
  , phenylephrine::SMALLINT as phenylephrine
  , epinephrine::SMALLINT as epinephrine
  , vasopressin::SMALLINT as vasopressin
  , milrinone::SMALLINT as milrinone
  , heparin::SMALLINT as heparin
from vw0
-- at least one of our drugs should be non-zero
where dopamine = 1
OR dobutamine = 1
OR norepinephrine = 1
OR phenylephrine = 1
OR epinephrine = 1
OR vasopressin = 1
OR milrinone = 1
OR heparin = 1
order by patientunitstayid, infusionoffset;
"""

In [8]:
with con.begin() as connection:
    connection.execute(text(create_table_infusion))  # Use text() to wrap the raw SQL

select_infusion = "SELECT * FROM pivoted_infusion;"
df_infusion = pd.read_sql_query(text(select_infusion),con)
print(df_infusion)


         patientunitstayid  chartoffset  dopamine  dobutamine  norepinephrine  \
0                   242040          457       NaN         NaN               0   
1                   242082           55       NaN         NaN               1   
2                   242082          125       NaN         NaN               1   
3                   242082          230       NaN         NaN               1   
4                   242082          275       NaN         NaN               1   
...                    ...          ...       ...         ...             ...   
1083069            3353251         4767       NaN         NaN               0   
1083070            3353251         4789       NaN         NaN               0   
1083071            3353251         4822       NaN         NaN               0   
1083072            3353263          328       NaN         NaN               0   
1083073            3353263          800       NaN         NaN               0   

         phenylephrine  epi

In [12]:
# df_infusion.to_csv(porcesseddir + 'pivoted_infusion.csv',  index=True)
# print("Data exported successfully to 'pivoted_infusion'.")

Data exported successfully to 'pivoted_infusion'.


## lab
- 从 lab 表中提取与实验室化验相关的数据，进行清理和去重，然后汇总每个患者在不同时间点的实验室结果

In [5]:
create_table_lab = query_schema + """
DROP TABLE IF EXISTS pivoted_lab CASCADE;
CREATE TABLE pivoted_lab as
-- remove duplicate labs if they exist at the same time
with vw0 as
(
  select
      patientunitstayid
    , labname
    , labresultoffset
    , labresultrevisedoffset
  from lab
  where labname in
  (
      'albumin'
    , 'total bilirubin'
    , 'BUN'
    , 'calcium'
    , 'chloride'
    , 'creatinine'
    , 'bedside glucose', 'glucose'
    , 'bicarbonate' -- HCO3
    , 'Total CO2'
    , 'Hct'
    , 'Hgb'
    , 'PT - INR'
    , 'PTT'
    , 'lactate'
    , 'platelets x 1000'
    , 'potassium'
    , 'sodium'
    , 'WBC x 1000'
    , '-bands'
    -- Liver enzymes
    , 'ALT (SGPT)'
    , 'AST (SGOT)'
    , 'alkaline phos.'
  )
  group by patientunitstayid, labname, labresultoffset, labresultrevisedoffset
  having count(distinct labresult)<=1
)
-- get the last lab to be revised
, vw1 as
(
  select
      lab.patientunitstayid
    , lab.labname
    , lab.labresultoffset
    , lab.labresultrevisedoffset
    , lab.labresult
    , ROW_NUMBER() OVER
        (
          PARTITION BY lab.patientunitstayid, lab.labname, lab.labresultoffset
          ORDER BY lab.labresultrevisedoffset DESC
        ) as rn
  from lab
  inner join vw0
    ON  lab.patientunitstayid = vw0.patientunitstayid
    AND lab.labname = vw0.labname
    AND lab.labresultoffset = vw0.labresultoffset
    AND lab.labresultrevisedoffset = vw0.labresultrevisedoffset
  -- only valid lab values
  WHERE
       (lab.labname = 'albumin' and lab.labresult >= 0.5 and lab.labresult <= 6.5)
    OR (lab.labname = 'total bilirubin' and lab.labresult >= 0.2 and lab.labresult <= 70.175)
    OR (lab.labname = 'BUN' and lab.labresult >= 1 and lab.labresult <= 280)
    OR (lab.labname = 'calcium' and lab.labresult > 0 and lab.labresult <= 9999)
    OR (lab.labname = 'chloride' and lab.labresult > 0 and lab.labresult <= 9999)
    OR (lab.labname = 'creatinine' and lab.labresult >= 0.1 and lab.labresult <= 28.28)
    OR (lab.labname in ('bedside glucose', 'glucose') and lab.labresult >= 25 and lab.labresult <= 1500)
    OR (lab.labname = 'bicarbonate' and lab.labresult >= 0 and lab.labresult <= 9999)
    OR (lab.labname = 'Total CO2' and lab.labresult >= 0 and lab.labresult <= 9999)
    -- will convert hct unit to fraction later
    OR (lab.labname = 'Hct' and lab.labresult >= 5 and lab.labresult <= 75)
    OR (lab.labname = 'Hgb' and lab.labresult >  0 and lab.labresult <= 9999)
    OR (lab.labname = 'PT - INR' and lab.labresult >= 0.5 and lab.labresult <= 15)
    OR (lab.labname = 'lactate' and lab.labresult >= 0.1 and lab.labresult <= 30)
    OR (lab.labname = 'platelets x 1000' and lab.labresult >  0 and lab.labresult <= 9999)
    OR (lab.labname = 'potassium' and lab.labresult >= 0.05 and lab.labresult <= 12)
    OR (lab.labname = 'PTT' and lab.labresult >  0 and lab.labresult <= 500)
    OR (lab.labname = 'sodium' and lab.labresult >= 90 and lab.labresult <= 215)
    OR (lab.labname = 'WBC x 1000' and lab.labresult > 0 and lab.labresult <= 100)
    OR (lab.labname = '-bands' and lab.labresult >= 0 and lab.labresult <= 100)
    OR (lab.labname = 'ALT (SGPT)' and lab.labresult > 0)
    OR (lab.labname = 'AST (SGOT)' and lab.labresult > 0)
    OR (lab.labname = 'alkaline phos.' and lab.labresult > 0)
)
select
    patientunitstayid
  , labresultoffset as chartoffset
  , MAX(case when labname = 'albumin' then labresult else null end) as albumin
  , MAX(case when labname = 'total bilirubin' then labresult else null end) as bilirubin
  , MAX(case when labname = 'BUN' then labresult else null end) as BUN
  , MAX(case when labname = 'calcium' then labresult else null end) as calcium
  , MAX(case when labname = 'chloride' then labresult else null end) as chloride
  , MAX(case when labname = 'creatinine' then labresult else null end) as creatinine
  , MAX(case when labname in ('bedside glucose', 'glucose') then labresult else null end) as glucose
  , MAX(case when labname = 'bicarbonate' then labresult else null end) as bicarbonate
  , MAX(case when labname = 'Total CO2' then labresult else null end) as TotalCO2
  , MAX(case when labname = 'Hct' then labresult else null end) as hematocrit
  , MAX(case when labname = 'Hgb' then labresult else null end) as hemoglobin
  , MAX(case when labname = 'PT - INR' then labresult else null end) as INR
  , MAX(case when labname = 'lactate' then labresult else null end) as lactate
  , MAX(case when labname = 'platelets x 1000' then labresult else null end) as platelets
  , MAX(case when labname = 'potassium' then labresult else null end) as potassium
  , MAX(case when labname = 'PTT' then labresult else null end) as ptt
  , MAX(case when labname = 'sodium' then labresult else null end) as sodium
  , MAX(case when labname = 'WBC x 1000' then labresult else null end) as wbc
  , MAX(case when labname = '-bands' then labresult else null end) as bands
  , MAX(case when labname = 'ALT (SGPT)' then labresult else null end) as alt
  , MAX(case when labname = 'AST (SGOT)' then labresult else null end) as ast
  , MAX(case when labname = 'alkaline phos.' then labresult else null end) as alp
from vw1
where rn = 1
group by patientunitstayid, labresultoffset
order by patientunitstayid, labresultoffset;

"""

In [6]:
with con.begin() as connection:
    connection.execute(text(create_table_lab))  # Use text() to wrap the raw SQL

select_lab = "SELECT * FROM pivoted_lab;"
df_lab = pd.read_sql_query(text(select_lab),con)
print(df_lab)

         patientunitstayid  chartoffset  albumin  bilirubin   bun  calcium  \
0                   141168          231      NaN        NaN   NaN      NaN   
1                   141168          516      3.1        2.6  26.0      8.8   
2                   141168         1133      3.3        4.1  27.0      9.2   
3                   141168         2026      3.0        5.2  29.0      8.5   
4                   141178         -280      4.0        0.4  11.0      8.0   
...                    ...          ...      ...        ...   ...      ...   
5314158            3353263         6446      NaN        NaN   NaN      NaN   
5314159            3353263         6919      NaN        NaN   NaN      NaN   
5314160            3353263         7303      NaN        NaN   NaN      NaN   
5314161            3353263         7490      NaN        NaN   NaN      NaN   
5314162            3353263         8926      NaN        NaN   NaN      NaN   

         chloride  creatinine  glucose  bicarbonate  ...  lacta

In [7]:
# df_lab.to_csv(porcesseddir + 'pivoted_lab.csv',  index=True)
# print("Data exported successfully to 'pivoted_lab'.")

Data exported successfully to 'pivoted_lab'.


## medication
- 主要目标：从 medication 表中提取药物的使用记录，并将药物的名称标准化。然后根据患者的药物使用时间（订购、开始、停止）汇总每个患者的用药情况。

- 药物识别：通过药物的编码（drughiclseqno）或名称（drugname）来标准化药物名称。对于某些编码缺失的情况，使用 LIKE 操作符通过名称来匹配药物。

- 结果表示：对于每个患者和时间点，输出患者是否使用了某种特定的药物，并以 0 或 1 来表示药物的使用情况。

In [5]:
create_table_med= query_schema+ """
DROP TABLE IF EXISTS pivoted_med CASCADE;
CREATE TABLE pivoted_med as
-- remove duplicate labs if they exist at the same time
with vw0 as
(
  select
    patientunitstayid
    -- due to issue in ETL, times of 0 should likely be null
    , case when drugorderoffset = 0 then null else drugorderoffset end as drugorderoffset
    , case when drugstartoffset = 0 then null else drugstartoffset end as drugstartoffset
    , case when drugstopoffset = 0 then null else drugstopoffset end as drugstopoffset

    -- assign our own identifier based off HICL codes
    -- the following codes have multiple drugs: 35779, 1874, 189
    , case
        when drughiclseqno in (37410, 36346, 2051) then 'norepinephrine'
        when drughiclseqno in (37407, 39089, 36437, 34361, 2050) then 'epinephrine'
        when drughiclseqno in (8777, 40) then 'dobutamine'
        when drughiclseqno in (2060, 2059) then 'dopamine'
        when drughiclseqno in (37028, 35517, 35587, 2087) then 'phenylephrine'
        when drughiclseqno in (38884, 38883, 2839) then 'vasopressin'
        when drughiclseqno in (9744) then 'milrinone'
        when drughiclseqno in (39654, 9545, 2807, 33442, 8643, 33314, 2808, 2810) then 'heparin'
        when drughiclseqno in (2812, 24859) then 'warfarin'
        -- now do missing HICL
        when drughiclseqno is null
          and lower(drugname) like '%heparin%' then 'heparin'
        when drughiclseqno is null
          and (lower(drugname) like '%warfarin%' OR lower(drugname) like '%coumadin%') then 'warfarin'

        when drughiclseqno is null
          and lower(drugname) like '%dobutamine%' then 'dobutamine'
        when drughiclseqno is null
          and lower(drugname) like '%dobutrex%' then 'dobutamine'
        when drughiclseqno is null
          and lower(drugname) like '%norepinephrine%' then 'norepinephrine'
        when drughiclseqno is null
          and lower(drugname) like '%levophed%' then 'norepinephrine'
        when drughiclseqno is null
          and lower(drugname) like 'epinephrine%' then 'epinephrine'
        when drughiclseqno is null
          and lower(drugname) like '%phenylephrine%' then 'phenylephrine'
        when drughiclseqno is null
          and lower(drugname) like '%neosynephrine%' then 'neosynephrine'
        when drughiclseqno is null
          and lower(drugname) like '%vasopressin%' then 'vasopressin'
        when drughiclseqno is null
          and lower(drugname) like '%milrinone%' then 'milrinone'
      else null end
        as drugname_structured

    -- raw identifiers
    , drugname, drughiclseqno, gtc

    -- delivery info
    , dosage, routeadmin, prn
    -- , loadingdose
  from medication m
  -- only non-zero dosages
  where dosage is not null
  -- not cancelled
  and drugordercancelled = 'No'
)
select
    patientunitstayid
  , drugorderoffset
  , drugstartoffset as chartoffset
  , drugstopoffset
  , max(case when drugname_structured = 'norepinephrine' then 1 else 0 end)::SMALLINT as norepinephrine
  , max(case when drugname_structured = 'epinephrine' then 1 else 0 end)::SMALLINT as epinephrine
  , max(case when drugname_structured = 'dopamine' then 1 else 0 end)::SMALLINT as dopamine
  , max(case when drugname_structured = 'dobutamine' then 1 else 0 end)::SMALLINT as dobutamine
  , max(case when drugname_structured = 'phenylephrine' then 1 else 0 end)::SMALLINT as phenylephrine
  , max(case when drugname_structured = 'vasopressin' then 1 else 0 end)::SMALLINT as vasopressin
  , max(case when drugname_structured = 'milrinone' then 1 else 0 end)::SMALLINT as milrinone
  , max(case when drugname_structured = 'heparin' then 1 else 0 end)::SMALLINT as heparin
  , max(case when drugname_structured = 'warfarin' then 1 else 0 end)::SMALLINT as warfarin
from vw0
WHERE
  -- have to have a start time
  drugstartoffset is not null
GROUP BY
  patientunitstayid, drugorderoffset, drugstartoffset, drugstopoffset
ORDER BY
  patientunitstayid, drugstartoffset, drugstopoffset, drugorderoffset;

"""

In [6]:
with con.begin() as connection:
    connection.execute(text(create_table_med))  # Use text() to wrap the raw SQL

select_med = "SELECT * FROM pivoted_med;"
df_med = pd.read_sql_query(text(select_med),con)
print(df_med)

         patientunitstayid  drugorderoffset  chartoffset  drugstopoffset  \
0                   141168            111.0           51          2050.0   
1                   141168            112.0          126           152.0   
2                   141168            112.0          126          1466.0   
3                   141168            117.0          246          1721.0   
4                   141168            470.0          471           513.0   
...                    ...              ...          ...             ...   
5167360            3353263            570.0          557          6538.0   
5167361            3353263           3768.0         3770          8156.0   
5167362            3353263           4817.0         5390          5390.0   
5167363            3353263           6556.0         6541          8156.0   
5167364            3353263           8159.0         8165          9592.0   

         norepinephrine  epinephrine  dopamine  dobutamine  phenylephrine  \
0         

In [7]:
# df_med.to_csv(porcesseddir + 'pivoted_med.csv',  index=True)
# print("Data exported successfully to 'pivoted_med'.")

Data exported successfully to 'pivoted_med'.


## nursecharting
- 从 nursecharting 表中提取与患者重要生命体征相关的数据，经过清洗和过滤后，按时间点汇总每个患者的生命体征信息
- 提取和验证每个生命体征的数据。
- 使用合适的范围对数据进行过滤（如心率在 25-225 之间）。
- 按患者和时间点对数据进行汇总和排序，确保输出的结果是结构化且无重复的。

In [8]:
create_table_vital = query_schema + """
-- This script duplicates the nurse charting table, making the following changes:
--  "major" vital signs -> pivoted_vital
--  "minor" vital signs -> pivoted_vital_other
DROP TABLE IF EXISTS pivoted_vital CASCADE;
CREATE TABLE pivoted_vital as
-- create columns with only numeric data
with nc as
(
select
    patientunitstayid
  , nursingchartoffset
  , nursingchartentryoffset
  , case
      when nursingchartcelltypevallabel = 'Heart Rate'
       and nursingchartcelltypevalname = 'Heart Rate'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as heartrate
  , case
      when nursingchartcelltypevallabel = 'Respiratory Rate'
       and nursingchartcelltypevalname = 'Respiratory Rate'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as RespiratoryRate
  , case
      when nursingchartcelltypevallabel = 'O2 Saturation'
       and nursingchartcelltypevalname = 'O2 Saturation'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as o2saturation
  , case
      when nursingchartcelltypevallabel = 'Non-Invasive BP'
       and nursingchartcelltypevalname = 'Non-Invasive BP Systolic'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as nibp_systolic
  , case
      when nursingchartcelltypevallabel = 'Non-Invasive BP'
       and nursingchartcelltypevalname = 'Non-Invasive BP Diastolic'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as nibp_diastolic
  , case
      when nursingchartcelltypevallabel = 'Non-Invasive BP'
       and nursingchartcelltypevalname = 'Non-Invasive BP Mean'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as nibp_mean
  , case
      when nursingchartcelltypevallabel = 'Temperature'
       and nursingchartcelltypevalname = 'Temperature (C)'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as temperature
  , case
      when nursingchartcelltypevallabel = 'Temperature'
       and nursingchartcelltypevalname = 'Temperature Location'
          then nursingchartvalue
      else null end
    as TemperatureLocation
  , case
      when nursingchartcelltypevallabel = 'Invasive BP'
       and nursingchartcelltypevalname = 'Invasive BP Systolic'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as ibp_systolic
  , case
      when nursingchartcelltypevallabel = 'Invasive BP'
       and nursingchartcelltypevalname = 'Invasive BP Diastolic'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as ibp_diastolic
  , case
      when nursingchartcelltypevallabel = 'Invasive BP'
       and nursingchartcelltypevalname = 'Invasive BP Mean'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      -- other map fields
      when nursingchartcelltypevallabel = 'MAP (mmHg)'
       and nursingchartcelltypevalname = 'Value'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      when nursingchartcelltypevallabel = 'Arterial Line MAP (mmHg)'
       and nursingchartcelltypevalname = 'Value'
       and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
       and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as ibp_mean
  from nursecharting
  -- speed up by only looking at a subset of charted data
  where nursingchartcelltypecat in
  (
    'Vital Signs','Scores','Other Vital Signs and Infusions'
  )
)
select
  patientunitstayid
, nursingchartoffset as chartoffset
, nursingchartentryoffset as entryoffset
, avg(case when heartrate >= 25 and heartrate <= 225 then heartrate else null end) as heartrate
, avg(case when RespiratoryRate >= 0 and RespiratoryRate <= 60 then RespiratoryRate else null end) as RespiratoryRate
, avg(case when o2saturation >= 0 and o2saturation <= 100 then o2saturation else null end) as spo2
, avg(case when nibp_systolic >= 25 and nibp_systolic <= 250 then nibp_systolic else null end) as nibp_systolic
, avg(case when nibp_diastolic >= 1 and nibp_diastolic <= 200 then nibp_diastolic else null end) as nibp_diastolic
, avg(case when nibp_mean >= 1 and nibp_mean <= 250 then nibp_mean else null end) as nibp_mean
, avg(case when temperature >= 25 and temperature <= 46 then temperature else null end) as temperature
, max(temperaturelocation) as temperaturelocation
, avg(case when ibp_systolic >= 1 and ibp_systolic <= 300 then ibp_systolic else null end) as ibp_systolic
, avg(case when ibp_diastolic >= 1 and ibp_diastolic <= 200 then ibp_diastolic else null end) as ibp_diastolic
, avg(case when ibp_mean >= 1 and ibp_mean <= 250 then ibp_mean else null end) as ibp_mean
from nc
WHERE heartrate IS NOT NULL
OR RespiratoryRate IS NOT NULL
OR o2saturation IS NOT NULL
OR nibp_systolic IS NOT NULL
OR nibp_diastolic IS NOT NULL
OR nibp_mean IS NOT NULL
OR temperature IS NOT NULL
OR temperaturelocation IS NOT NULL
OR ibp_systolic IS NOT NULL
OR ibp_diastolic IS NOT NULL
OR ibp_mean IS NOT NULL
group by patientunitstayid, nursingchartoffset, nursingchartentryoffset
order by patientunitstayid, nursingchartoffset, nursingchartentryoffset;
"""

In [9]:
with con.begin() as connection:
    connection.execute(text(create_table_vital))  # Use text() to wrap the raw SQL

select_vital = "SELECT * FROM pivoted_vital;"
df_vital = pd.read_sql_query(text(select_vital),con)
print(df_vital)

          patientunitstayid  chartoffset  entryoffset  heartrate  \
0                    141168            6            6      140.0   
1                    141168           21           21       70.0   
2                    141168           36           36       70.0   
3                    141168           51           51      140.0   
4                    141168           66           66      140.0   
...                     ...          ...          ...        ...   
21038211            3353263          530          530       81.0   
21038212            3353263          590          590       78.0   
21038213            3353263          650          650       73.0   
21038214            3353263          710          710       82.0   
21038215            3353263          770          770       78.0   

          respiratoryrate  spo2  nibp_systolic  nibp_diastolic  nibp_mean  \
0                     NaN   NaN           82.0            59.0        NaN   
1                     NaN  94

In [10]:
# df_vital.to_csv(porcesseddir + 'pivoted_vital.csv',  index=True)
# print("Data exported successfully to 'pivoted_vital'.")

Data exported successfully to 'pivoted_vital'.


## nursecharting 
- 从nursecharting表中提取“次要”生命体征数据（与频繁测量的“主要”生命体征相对 如PA收缩压、PA舒张压、心输出量等），并将这些数据根据患者和时间点进行汇总
- 生命体征处理：通过 CASE 语句验证和提取每个生命体征数据，确保只处理有效的数值。对于每个生命体征，使用 AVG() 函数来处理数据并过滤掉不合理的值。
- 最终输出：对每个患者的每个时间点，输出多种“次要”生命体征的平均值或最大值，并按时间顺序排列。

In [11]:
create_table_vital_other= query_schema+ """
-- This script groups together like vital signs on the same row
--  "major" vital signs (frequently measured) -> pivoted_vital
--  "minor" vital signs (infrequently measured) -> pivoted_vital_other
DROP TABLE IF EXISTS pivoted_vital_other CASCADE;
CREATE TABLE pivoted_vital_other as
-- create columns with only numeric data
with nc as
(
select
    patientunitstayid
  , nursingchartoffset
  , nursingchartentryoffset
  -- pivot data - choose column names for consistency with vitalperiodic
  , case
        WHEN nursingchartcelltypevallabel = 'PA'
        AND  nursingchartcelltypevalname = 'PA Systolic'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as pasystolic
  , case
        WHEN nursingchartcelltypevallabel = 'PA'
        AND  nursingchartcelltypevalname = 'PA Diastolic'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as padiastolic
  , case
        WHEN nursingchartcelltypevallabel = 'PA'
        AND  nursingchartcelltypevalname = 'PA Mean'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as pamean
  , case
        WHEN nursingchartcelltypevallabel = 'SV'
        AND  nursingchartcelltypevalname = 'SV'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as sv
  , case
        WHEN nursingchartcelltypevallabel = 'CO'
        AND  nursingchartcelltypevalname = 'CO'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as co
  , case
        WHEN nursingchartcelltypevallabel = 'SVR'
        AND  nursingchartcelltypevalname = 'SVR'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as svr
  , case
        WHEN nursingchartcelltypevallabel = 'ICP'
        AND  nursingchartcelltypevalname = 'ICP'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as icp
  , case
        WHEN nursingchartcelltypevallabel = 'CI'
        AND  nursingchartcelltypevalname = 'CI'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as ci
  , case
        WHEN nursingchartcelltypevallabel = 'SVRI'
        AND  nursingchartcelltypevalname = 'SVRI'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as svri
  , case
        WHEN nursingchartcelltypevallabel = 'CPP'
        AND  nursingchartcelltypevalname = 'CPP'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as cpp
  , case
        WHEN nursingchartcelltypevallabel = 'SVO2'
        AND  nursingchartcelltypevalname = 'SVO2'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as svo2
  , case
        WHEN nursingchartcelltypevallabel = 'PAOP'
        AND  nursingchartcelltypevalname = 'PAOP'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as paop
  , case
        WHEN nursingchartcelltypevallabel = 'PVR'
        AND  nursingchartcelltypevalname = 'PVR'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as pvr
  , case
        WHEN nursingchartcelltypevallabel = 'PVRI'
        AND  nursingchartcelltypevalname = 'PVRI'
        -- verify it's numeric
        AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
          then cast(nursingchartvalue as numeric)
      else null end
    as pvri
  , case
      WHEN nursingchartcelltypevallabel = 'IAP'
      AND  nursingchartcelltypevalname = 'IAP'
      -- verify it's numeric
      AND nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$' and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    else null end
  as iap
  from nursecharting
  -- speed up by only looking at a subset of charted data
  where nursingchartcelltypecat = 'Vital Signs'
)
select
  patientunitstayid
, nursingchartoffset as chartoffset
, nursingchartentryoffset as entryoffset
, AVG(CASE WHEN pasystolic >= 0 AND pasystolic <= 1000 THEN pasystolic ELSE NULL END) AS pasystolic
, AVG(CASE WHEN padiastolic >= 0 AND padiastolic <= 1000 THEN padiastolic ELSE NULL END) AS padiastolic
, AVG(CASE WHEN pamean >= 0 AND pamean <= 1000 THEN pamean ELSE NULL END) AS pamean
, AVG(CASE WHEN sv >= 0 AND sv <= 1000 THEN sv ELSE NULL END) AS sv
, AVG(CASE WHEN co >= 0 AND co <= 1000 THEN co ELSE NULL END) AS co
, AVG(CASE WHEN svr >= 0 AND svr <= 1000 THEN svr ELSE NULL END) AS svr
, AVG(CASE WHEN icp >= 0 AND icp <= 1000 THEN icp ELSE NULL END) AS icp
, AVG(CASE WHEN ci >= 0 AND ci <= 1000 THEN ci ELSE NULL END) AS ci
, AVG(CASE WHEN svri >= 0 AND svri <= 1000 THEN svri ELSE NULL END) AS svri
, AVG(CASE WHEN cpp >= 0 AND cpp <= 1000 THEN cpp ELSE NULL END) AS cpp
, AVG(CASE WHEN svo2 >= 0 AND svo2 <= 1000 THEN svo2 ELSE NULL END) AS svo2
, AVG(CASE WHEN paop >= 0 AND paop <= 1000 THEN paop ELSE NULL END) AS paop
, AVG(CASE WHEN pvr >= 0 AND pvr <= 1000 THEN pvr ELSE NULL END) AS pvr
, AVG(CASE WHEN pvri >= 0 AND pvri <= 1000 THEN pvri ELSE NULL END) AS pvri
, AVG(CASE WHEN iap >= 0 AND iap <= 1000 THEN iap ELSE NULL END) AS iap
from nc
WHERE pasystolic IS NOT NULL
OR padiastolic IS NOT NULL
OR pamean IS NOT NULL
OR sv IS NOT NULL
OR co IS NOT NULL
OR svr IS NOT NULL
OR icp IS NOT NULL
OR ci IS NOT NULL
OR svri IS NOT NULL
OR cpp IS NOT NULL
OR svo2 IS NOT NULL
OR paop IS NOT NULL
OR pvr IS NOT NULL
OR pvri IS NOT NULL
OR iap IS NOT NULL
group by patientunitstayid, nursingchartoffset, nursingchartentryoffset
order by patientunitstayid, nursingchartoffset, nursingchartentryoffset;
"""

In [12]:
with con.begin() as connection:
    connection.execute(text(create_table_vital_other))  # Use text() to wrap the raw SQL

select_vital_other = "SELECT * FROM pivoted_vital_other;"
df_vital_other = pd.read_sql_query(text(select_vital_other),con)
print(df_vital_other)

        patientunitstayid  chartoffset  entryoffset  pasystolic  padiastolic  \
0                  245811         1823         1898         NaN          NaN   
1                  245811         1913         1934         NaN          NaN   
2                  245811         2063         2101         NaN          NaN   
3                  245811         2223         2429         NaN          NaN   
4                  245811         2513         2663         NaN          NaN   
...                   ...          ...          ...         ...          ...   
213438            3352620         5110         5110         NaN          NaN   
213439            3352620         5170         5170         NaN          NaN   
213440            3352620         5290         5290         NaN          NaN   
213441            3352620         5470         5470         NaN          NaN   
213442            3352620         5590         5590         NaN          NaN   

        pamean     sv    co  svr  icp  

In [13]:
# df_vital_other.to_csv(porcesseddir + 'pivoted_vital_other.csv',  index=True)
# print("Data exported successfully to 'pivoted_vital_other'.")

Data exported successfully to 'pivoted_vital_other'.


## nursecharting_score
- 从 nursecharting 表中提取与评分（如GCS评分、跌倒风险、谵妄评分、镇静评分、疼痛评分等）相关的数据

In [5]:
create_table_score= query_schema + """
DROP TABLE IF EXISTS pivoted_score CASCADE;
CREATE TABLE pivoted_score as
-- create columns with only numeric data
with nc as
(
select
    patientunitstayid
  , nursingchartoffset
  , nursingchartentryoffset
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Glasgow coma score'
     and nursingchartcelltypevalname = 'GCS Total'
     and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
     and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    when nursingchartcelltypecat = 'Other Vital Signs and Infusions'
     and nursingchartcelltypevallabel = 'Score (Glasgow Coma Scale)'
     and nursingchartcelltypevalname = 'Value'
     and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
     and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    else null end
  as gcs
  -- components of GCS
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Glasgow coma score'
     and nursingchartcelltypevalname = 'Motor'
     and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
     and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    when nursingchartcelltypecat = 'Other Vital Signs and Infusions'
     and nursingchartcelltypevallabel = 'Best Motor Response'
        then case
          when nursingchartvalue in ('1', '1-->(M1) none', 'Flaccid') then 1
          when nursingchartvalue in ('2', '2-->(M2) extension to pain', 'Abnormal extension') then 2
          when nursingchartvalue in ('3', '3-->(M3) flexion to pain', 'Abnormal flexion') then 3
          when nursingchartvalue in ('4', '4-->(M4) withdraws from pain', 'Withdraws') then 4
          when nursingchartvalue in ('5', '5-->(M5) localizes pain', 'Localizes to noxious stimuli') then 5
          when nursingchartvalue in ('6','6-->(M6) obeys commands', 'Obeys simple commands') then 6
        else null end
    else null end
  as gcs_motor
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Glasgow coma score'
     and nursingchartcelltypevalname = 'Verbal'
     and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
     and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    when nursingchartcelltypecat = 'Other Vital Signs and Infusions'
     and nursingchartcelltypevallabel = 'Best Verbal Response'
        then case
          -- when nursingchartvalue in ('Trached or intubated') then 0
          when nursingchartvalue in ('1', '1-->(V1) none', 'None', 'Clearly unresponsive') then 1
          when nursingchartvalue in ('2', '2-->(V2) incomprehensible speech', 'Incomprehensible sounds') then 2
          when nursingchartvalue in ('3', '3-->(V3) inappropriate words', 'Inappropriate words') then 3
          when nursingchartvalue in ('4', '4-->(V4) confused', 'Confused') then 4
          when nursingchartvalue in ('5', '5-->(V5) oriented', 'Oriented',
                                    'Orientation/ability to communicate questionable',
                                    'Clearly oriented/can indicate needs') then 5
        else null end
    else null end
  as gcs_verbal
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Glasgow coma score'
     and nursingchartcelltypevalname = 'Eyes'
     and nursingchartvalue ~ '^[-]?[0-9]+[.]?[0-9]*$'
     and nursingchartvalue not in ('-','.')
        then cast(nursingchartvalue as numeric)
    when nursingchartcelltypecat = 'Other Vital Signs and Infusions'
     and nursingchartcelltypevallabel = 'Best Eye Response'
        then case
          when nursingchartvalue in ('1', '1-->(E1) none') then 1
          when nursingchartvalue in ('2', '2-->(E2) to pain') then 2
          when nursingchartvalue in ('3', '3-->(E3) to speech') then 3
          when nursingchartvalue in ('4', '4-->(E4) spontaneous') then 4
        else null end
    else null end
  as gcs_eyes
  -- unable/other misc info
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Glasgow coma score'
     and nursingchartcelltypevalname = 'GCS Total'
     and nursingchartvalue = 'Unable to score due to medication'
        then 1
    else null end
  as gcs_unable
  , case
    when nursingchartcelltypecat = 'Other Vital Signs and Infusions'
     and nursingchartcelltypevallabel = 'Best Verbal Response'
     and nursingchartvalue = 'Trached or intubated'
        then 1
    else null end
  as gcs_intub
  -- fall risk
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Fall Risk'
     and nursingchartcelltypevalname = 'Fall Risk'
        then case
          when nursingchartvalue = 'Low' then 1
          when nursingchartvalue = 'Medium' then 2
          when nursingchartvalue = 'High' then 3
        else null end
    else null end::numeric
  as fall_risk
  -- delirium
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Delirium Scale/Score'
     and nursingchartcelltypevalname = 'Delirium Scale'
        then nursingchartvalue
    else null end
  as delirium_scale
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Delirium Scale/Score'
     and nursingchartcelltypevalname = 'Delirium Score'
        then case
          when nursingchartvalue in ('No', 'NO') then 0
          when nursingchartvalue in ('Yes', 'YES') then 1
          when nursingchartvalue = 'N/A' then NULL
        else cast(nursingchartvalue as numeric) end
    else null end
  as delirium_score
  -- sedation
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Sedation Scale/Score/Goal'
     and nursingchartcelltypevalname = 'Sedation Scale'
        then nursingchartvalue
    else null end
  as sedation_scale
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Sedation Scale/Score/Goal'
     and nursingchartcelltypevalname = 'Sedation Score'
        then cast(nursingchartvalue as numeric)
    else null end
  as sedation_score
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Sedation Scale/Score/Goal'
     and nursingchartcelltypevalname = 'Sedation Goal'
        then cast(nursingchartvalue as numeric)
    else null end
  as sedation_goal
  -- pain
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Pain Score/Goal'
     and nursingchartcelltypevalname = 'Pain Score'
        then cast(nursingchartvalue as numeric)
    else null end
  as pain_score
  , case
    when nursingchartcelltypecat = 'Scores'
     and nursingchartcelltypevallabel = 'Pain Score/Goal'
     and nursingchartcelltypevalname = 'Pain Goal'
        then cast(nursingchartvalue as numeric)
    else null end
  as pain_goal
  from nursecharting
  -- speed up by only looking at a subset of charted data
  where nursingchartcelltypecat IN
  (
        'Scores'
      , 'Other Vital Signs and Infusions'
  )
)
select
  patientunitstayid
, nursingchartoffset as chartoffset
, nursingchartentryoffset as entryoffset
, AVG(gcs) as gcs
, AVG(gcs_motor) as gcs_motor
, AVG(gcs_verbal) as gcs_verbal
, AVG(gcs_eyes) as gcs_eyes
, MAX(gcs_unable) as gcs_unable
, MAX(gcs_intub) as gcs_intub
, AVG(fall_risk) as fall_risk
, MAX(delirium_scale) as delirium_scale
, AVG(delirium_score) as delirium_score
, MAX(sedation_scale) as sedation_scale
, AVG(sedation_score) as sedation_score
, AVG(sedation_goal) as sedation_goal
, AVG(pain_score) as pain_score
, AVG(pain_goal) as pain_goal
from nc
WHERE gcs IS NOT NULL
OR gcs_motor IS NOT NULL
OR gcs_verbal IS NOT NULL
OR gcs_eyes IS NOT NULL
OR gcs_unable IS NOT NULL
OR gcs_intub IS NOT NULL
OR fall_risk IS NOT NULL
OR delirium_scale IS NOT NULL
OR delirium_score IS NOT NULL
OR sedation_scale IS NOT NULL
OR sedation_score IS NOT NULL
OR sedation_goal IS NOT NULL
OR pain_score IS NOT NULL
OR pain_goal IS NOT NULL
group by patientunitstayid, nursingchartoffset, nursingchartentryoffset
order by patientunitstayid, nursingchartoffset, nursingchartentryoffset;
"""

In [6]:
with con.begin() as connection:
    connection.execute(text(create_table_score))  # Use text() to wrap the raw SQL

select_score = "SELECT * FROM pivoted_score;"
df_score = pd.read_sql_query(text(select_score),con)
print(df_score)

         patientunitstayid  chartoffset  entryoffset   gcs  gcs_motor  \
0                   141168           21           21   NaN        NaN   
1                   141168          396          396   NaN        NaN   
2                   141168          561          561   NaN        NaN   
3                   141168          809          809   NaN        NaN   
4                   141168         1066         1066   NaN        NaN   
...                    ...          ...          ...   ...        ...   
5709673            3353263         4306         4306  15.0        6.0   
5709674            3353263         4837         4837  15.0        6.0   
5709675            3353263         5651         5651  15.0        6.0   
5709676            3353263         7100         7100  15.0        6.0   
5709677            3353263         8525         8525  15.0        6.0   

         gcs_verbal  gcs_eyes  gcs_unable  gcs_intub  fall_risk  \
0               NaN       NaN         NaN        NaN    

In [7]:
# df_score.to_csv(porcesseddir + 'pivoted_score.csv',  index=True)
# print("Data exported successfully to 'pivoted_score'.")

Data exported successfully to 'pivoted_score'.
