<a href="https://colab.research.google.com/github/johnsonjzhou/comp90089-project/blob/main/sql/initial_cohort.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **COMP90089 Final Project**

## **Selecting Initial Patient Cohort from MIMICIV**

### **Set up the environment**

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab VM; later cells write and read CSV files
# under /content/drive.
drive.mount('/content/drive')
# NOTE(review): `path` is not referenced by any later cell in this notebook, and it
# spells the folder 'My Drive' while the cells below hard-code '/content/drive/MyDrive/'.
path = '/content/drive/My Drive'

Mounted at /content/drive


In [3]:
!pip install -U pandasql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26784 sha256=7d3dcb5ff8e135bbbb6950767366d45ee65d2499afcb9fc65420a7b9a92e4b2f
  Stored in directory: /root/.cache/pip/wheels/5c/4b/ec/41f4e116c8053c3654e2c2a47c62b4fca34cc67ef7b55deb7f
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [4]:
# Set up the environment

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools as ft
from pandasql import sqldf
%matplotlib inline


# Google Cloud project ID handed to BigQuery for every query in this notebook
project_id = "mimic-iv-projects"


def run_query(query, project_id=project_id):
    """Execute a BigQuery query and return the result as a pandas DataFrame.

    Args:
        query: SQL text; it is submitted with the 'standard' SQL dialect.
        project_id: Google Cloud project ID passed to ``read_gbq``. Defaults to
            the notebook-level ``project_id`` as it was when this function
            was defined.

    Returns:
        Whatever ``pd.io.gbq.read_gbq`` returns for the query (a DataFrame).
    """
    gbq_options = {"project_id": project_id, "dialect": "standard"}
    return pd.io.gbq.read_gbq(query, **gbq_options)


### **SQL queries**

#### **1. Queries for finding the patients with criteria:**
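 * n_stays >= 1 (number of ICU stays per hospital admission)
 * sum of ICU LOS >= 3.3 (`los_icu` summed over the admission's ICU stays)
 * age at admission between 18 and 90
 * date of death (`dod`) is not recorded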

In [5]:
# Sub-query (SQL text only, nothing is executed here): aggregate ICU stays per
# (subject_id, hadm_id) and keep the admissions that have
#   - n_stays >= 1   (count of stay_id), and
#   - sum_los >= 3.3 (sum of los_icu over the admission's ICU stays).
c1 = """
SELECT
    icustays.subject_id As subject_id,
    icustays.hadm_id As hadm_id,
    count(icustays.stay_id) As n_stays,
    sum(icustays.los_icu) As sum_los,
FROM
  `physionet-data.mimiciv_derived.icustay_detail` AS icustays
GROUP BY
    subject_id,	hadm_id
HAVING 
   n_stays >= 1 AND 
   sum_los >= 3.3 
ORDER BY
    subject_id,	hadm_id
"""

In [6]:
# Build the cohort: take the admissions selected by the c1 sub-query and join them
# back to icustay_detail on hadm_id, keeping only rows where admission_age is
# between 18 and 90 and dod (date of death) is NULL.
# Because the join key is hadm_id and stay_id is selected, the result has one row per
# ICU stay of a qualifying admission; n_stays and sum_los are the admission-level
# aggregates from c1 repeated on each of those rows.
cohort_c1 = run_query(f"""
    
    SELECT DISTINCT
           c1.subject_id As subject_id,
           c1.hadm_id As hadm_id,
           icustays.stay_id As stay_id,
           c1.n_stays As n_stays,
           c1.sum_los As sum_los,
           icustays.gender As gender,
           icustays.admission_age As admission_age
    FROM  ({c1}) As c1
    INNER JOIN `physionet-data.mimiciv_derived.icustay_detail` AS icustays
    ON icustays.hadm_id = c1.hadm_id AND (icustays.admission_age BETWEEN 18 AND 90) AND  (icustays.dod IS NULL)
    ORDER BY
         c1.subject_id
""")

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=HF8GIIxjUdmJj6je4HS3ov4fjeTAct&prompt=consent&access_type=offline
Enter the authorization code: [REDACTED]


In [7]:
# Persist the cohort selection to Google Drive (row index omitted); the join stage
# reloads this file as `cohort_criteria`.
cohort_c1.to_csv('/content/drive/MyDrive/cohort_criteria_data.csv', index=False)

In [8]:
# Unique subject, ICU-stay and hospital-admission identifiers present in the cohort
cohort_subject_ids, cohort_stay_ids, cohort_hadm_ids = (
    list(cohort_c1[id_column].unique())
    for id_column in ('subject_id', 'stay_id', 'hadm_id')
)

#### **2. Queries for selecting demographic features for the cohort:** 

In [9]:
# Demographic 1: presence of infection
# Table: mimiciv_derived.suspicion_of_infection
# Column: suspected_infection (1 or 0)
#
# A stay can have several rows with different suspected_infection values (one per
# antibiotic / antibiotic_time), so the rows are collapsed per (subject_id, stay_id)
# with max(): the stay gets 1 if any of its rows is 1.
# The whole table is queried; it is restricted to the cohort later by the LEFT JOIN
# on stay_id.
demographic_1 = run_query("""
    SELECT 
           suspicion_of_infection.subject_id As subject_id,
           suspicion_of_infection.stay_id As stay_id,
           max(suspicion_of_infection.suspected_infection) As suspected_infection
    FROM `physionet-data.mimiciv_derived.suspicion_of_infection` As suspicion_of_infection
    GROUP By 
       subject_id, stay_id
""")

In [10]:
# Persist the infection flags to Google Drive (row index omitted); reloaded by the
# join stage as `cohort_demographic_1`.
demographic_1.to_csv('/content/drive/MyDrive/cohort_demographic_1.csv',index=False)

In [11]:
# Demographic 2: type of admission
# Table: mimiciv_hosp.admissions
# Column: admission_type
#
# Note: some patients have more than one admission type (one row per hadm_id is
# returned here; the later join is on hadm_id).
demographic_2 = run_query("""
    SELECT  admission.subject_id As subject_id,
            admission.hadm_id As hadm_id,
            admission.admission_type As admission_type
    FROM `physionet-data.mimiciv_hosp.admissions` As admission    
""")

In [12]:
# Persist the admission types to Google Drive (row index omitted); reloaded by the
# join stage as `cohort_demographic_2`.
demographic_2.to_csv('/content/drive/MyDrive/cohort_demographic_2.csv',index=False)

In [13]:
# Demographic 3: weight
# Table: mimiciv_derived.first_day_weight
# Column: weight
# The whole table is queried; it is restricted to the cohort later by the LEFT JOIN
# on stay_id.
demographic_3 = run_query("""
    SELECT  first_day_weight.subject_id As subject_id,
            first_day_weight.stay_id As stay_id,
            first_day_weight.weight As weight
    FROM `physionet-data.mimiciv_derived.first_day_weight` as first_day_weight
   """)

In [14]:
# Persist the weights to Google Drive (row index omitted); reloaded by the join
# stage as `cohort_demographic_3`.
demographic_3.to_csv('/content/drive/MyDrive/cohort_demographic_3.csv',index=False)

In [15]:
# Demographic 4: height
# Table: mimiciv_derived.first_day_height
# Column: height
# The whole table is queried; it is restricted to the cohort later by the LEFT JOIN
# on stay_id.
demographic_4 = run_query("""
    SELECT  first_day_height.subject_id As subject_id,
            first_day_height.stay_id As stay_id,
            first_day_height.height As height
      FROM `physionet-data.mimiciv_derived.first_day_height` As first_day_height
    """)

In [16]:
# Persist the heights to Google Drive (row index omitted); reloaded by the join
# stage as `cohort_demographic_4`.
demographic_4.to_csv('/content/drive/MyDrive/cohort_demographic_4.csv',index=False)

#### **3. Queries for selecting vital-sign features for the cohort:** 

In [17]:
# Vital signs
# Table: mimiciv_derived.first_day_vitalsign
#
# Columns selected by the query below:
#   heart rate                          (heart_rate_min, heart_rate_max)
#   body temperature                    (temperature_min, temperature_max)
#   invasive mean arterial pressure     (mbp_min, mbp_max)
#   invasive systolic blood pressure    (sbp_min, sbp_max)
#   invasive diastolic blood pressure   (dbp_min, dbp_max)
#   oxygen saturation                   (spo2_min, spo2_max)
#
# Non-invasive mean/systolic/diastolic blood pressure were wanted as well but are
# not included in first_day_vitalsign.
#
# NOTE(review): respiratory rate (resp_rate_min, resp_rate_max) and a mean SpO2 were
# planned features but are NOT selected here -- confirm whether they should be added.
vitalsigns = run_query("""
    SELECT first_day_vitalsigns.subject_id As subject_id,
          first_day_vitalsigns.stay_id As stay_id,
          first_day_vitalsigns.heart_rate_min As heart_rate_min,
          first_day_vitalsigns.heart_rate_max As heart_rate_max,
          first_day_vitalsigns.temperature_min As temperature_min,
          first_day_vitalsigns.temperature_max As temperature_max,
          first_day_vitalsigns.mbp_min As mbp_min,
          first_day_vitalsigns.mbp_max As mbp_max,
          first_day_vitalsigns.sbp_min As sbp_min,
          first_day_vitalsigns.sbp_max As sbp_max,
          first_day_vitalsigns.dbp_min As dbp_min,
          first_day_vitalsigns.dbp_max As dbp_max,
          first_day_vitalsigns.spo2_min As spo2_min,
          first_day_vitalsigns.spo2_max As spo2_max,
    FROM `physionet-data.mimiciv_derived.first_day_vitalsign`  As first_day_vitalsigns
""")

In [18]:
# Persist the vital signs to Google Drive (row index omitted); reloaded by the join
# stage as `cohort_vitalsigns`.
vitalsigns.to_csv('/content/drive/MyDrive/cohort_vitalsigns.csv',index=False)

#### **4. Queries for selecting lab tests features for the cohort:** 

In [19]:

# Laboratory tests 1
# Table: mimiciv_derived.first_day_lab
#
# Columns selected (min and max of each):
#   albumin                 (albumin_min, albumin_max)
#   blood urea nitrogen     (bun_min, bun_max)
#   calcium                 (calcium_min, calcium_max)
#   creatinine              (creatinine_min, creatinine_max)
#   glucose                 (glucose_min, glucose_max)
#   bicarbonate             (bicarbonate_min, bicarbonate_max)
#   potassium               (potassium_min, potassium_max)
#   sodium                  (sodium_min, sodium_max)
#   platelets               (platelets_min, platelets_max)
#   total bilirubin         (bilirubin_total_min, bilirubin_total_max)
#   white blood cell count  (wbc_min, wbc_max)
#
# Wanted but not found by the author: lactate dehydrogenase, magnesium (Mg),
# leukocytes, urea.
cohort_lab_tests_1 = run_query("""
SELECT 
       lab_tests.subject_id As subject_id,
       lab_tests.stay_id As stay_id,
       lab_tests.albumin_min As albumin_min,
       lab_tests.albumin_max As albumin_max,
       lab_tests.bun_min As bun_min,
       lab_tests.bun_max As bun_max,
       lab_tests.calcium_min As calcium_min,
       lab_tests.calcium_max As calcium_max,
       lab_tests.creatinine_min As creatinine_min,
       lab_tests.creatinine_max As creatinine_max,
       lab_tests.glucose_min As glucose_min,
       lab_tests.glucose_max As glucose_max,
       lab_tests.bicarbonate_min As bicarbonate_min,
       lab_tests.bicarbonate_max As bicarbonate_max,
       lab_tests.potassium_min As potassium_min,
       lab_tests.potassium_max As potassium_max,
       lab_tests.sodium_min As sodium_min,
       lab_tests.sodium_max As sodium_max,
       lab_tests.platelets_min As platelets_min,
       lab_tests.platelets_max As platelets_max,
       lab_tests.bilirubin_total_min As bilirubin_total_min,
       lab_tests.bilirubin_total_max As bilirubin_total_max,
       lab_tests.wbc_min As wbc_min,
       lab_tests.wbc_max As wbc_max,
 FROM `physionet-data.mimiciv_derived.first_day_lab`  As lab_tests
""")


In [20]:
# Persist the first lab-test set to Google Drive (row index omitted); the join stage
# reloads it under the same name.
cohort_lab_tests_1.to_csv('/content/drive/MyDrive/cohort_lab_tests_1.csv',index=False)

In [21]:
# Laboratory tests 2 (blood gas)
# Table: mimiciv_derived.first_day_bg
#
# Columns selected (min and max of each):
#   lactate                             (lactate_min, lactate_max)
#   pH                                  (ph_min, ph_max)
#   partial pressure of carbon dioxide  (pco2_min, pco2_max)
#   partial pressure of oxygen          (po2_min, po2_max)
#   PaO2/FiO2 ratio                     (pao2fio2ratio_min, pao2fio2ratio_max)
cohort_lab_tests_2 = run_query("""
SELECT
       lab_bg_tests.subject_id As subject_id,
       lab_bg_tests.stay_id As stay_id,
       lab_bg_tests.lactate_min As lactate_min,
       lab_bg_tests.lactate_max As lactate_max,
       lab_bg_tests.ph_min As ph_min,
       lab_bg_tests.ph_max As ph_max,
       lab_bg_tests.pco2_min As pco2_min,
       lab_bg_tests.pco2_max As pco2_max,
       lab_bg_tests.po2_min As po2_min,
       lab_bg_tests.po2_max As po2_max,
       lab_bg_tests.pao2fio2ratio_min As pao2fio2ratio_min,
       lab_bg_tests.pao2fio2ratio_max As pao2fio2ratio_max
       FROM `physionet-data.mimiciv_derived.first_day_bg` As lab_bg_tests
       """)


In [22]:
# Persist the blood-gas lab set to Google Drive (row index omitted); the join stage
# reloads it under the same name.
cohort_lab_tests_2.to_csv('/content/drive/MyDrive/cohort_lab_tests_2.csv',index=False)

#### **5. Queries for selecting comorbidity-score features for the cohort:** 

In [29]:
# Comorbidities 1: Charlson comorbidity index
# Table: mimiciv_derived.charlson
# Column: charlson_comorbidity_index (exposed as `charlson`)
# Keyed by hadm_id, so the later join for this table is on hadm_id rather than stay_id.
cohort_comorbidities_1 = run_query("""
SELECT charlson_comorbidity.subject_id As subject_id,
       charlson_comorbidity.hadm_id As hadm_id,
       charlson_comorbidity.charlson_comorbidity_index As charlson
FROM `physionet-data.mimiciv_derived.charlson` As charlson_comorbidity 
""")

In [30]:
# Persist the Charlson scores to Google Drive (row index omitted); the join stage
# reloads them under the same name.
cohort_comorbidities_1.to_csv('/content/drive/MyDrive/cohort_comorbidities_1.csv',index=False)

In [31]:
# Comorbidities 2: Glasgow coma scale
# Table: mimiciv_derived.first_day_gcs
# Column: gcs_min
cohort_comorbidities_2 = run_query("""
    SELECT gcs_first_day.subject_id As subject_id,
          gcs_first_day.stay_id As stay_id,
          gcs_first_day.gcs_min As gcs_min   
    FROM `physionet-data.mimiciv_derived.first_day_gcs` As gcs_first_day
    """)

In [32]:
# Persist the GCS values to Google Drive (row index omitted); the join stage
# reloads them under the same name.
cohort_comorbidities_2.to_csv('/content/drive/MyDrive/cohort_comorbidities_2.csv',index=False)

In [33]:
# Comorbidities 3: Sequential Organ Failure Assessment (SOFA) score
# Table: mimiciv_derived.first_day_sofa
# Column: SOFA (exposed as lower-case `sofa`)
cohort_comorbidities_3 = run_query("""
    SELECT first_day_sofa.subject_id As subject_id,
          first_day_sofa.stay_id As stay_id,
          first_day_sofa.SOFA As sofa
    FROM `physionet-data.mimiciv_derived.first_day_sofa` As first_day_sofa 
""")

In [34]:
# Persist the SOFA scores to Google Drive (row index omitted); the join stage
# reloads them under the same name.
cohort_comorbidities_3.to_csv('/content/drive/MyDrive/cohort_comorbidities_3.csv',index=False)

#### **6. Query for selecting ventilation status feature for the cohort:** 

In [35]:
# Device use: ventilation status
# Table: mimiciv_derived.ventilation
# Column: ventilation_status
#
# NOTE(review): rows are taken as-is, with no aggregation per stay_id. If the table
# holds several rows per stay, the later LEFT JOIN on stay_id will produce one cohort
# row per distinct ventilation_status of that stay -- confirm this is intended.
cohort_device_use = run_query("""
    SELECT  ventilation.stay_id As stay_id,
           ventilation.ventilation_status As ventilation_status
     FROM `physionet-data.mimiciv_derived.ventilation` As ventilation
    """)

In [36]:
# Persist the ventilation statuses to Google Drive (row index omitted); the join
# stage reloads them under the same name.
cohort_device_use.to_csv('/content/drive/MyDrive/cohort_device_use.csv',index=False)

#### **7. Queries for selecting Input/Output feature for the cohort:** 

In [37]:
# Input/output 1: urine output
# Table: mimiciv_derived.first_day_urine_output
# Column: urineoutput
cohort_urine_output = run_query("""
    SELECT urine_output.subject_id As subject_id,
          urine_output.stay_id As stay_id,
          urine_output.urineoutput As urineoutput          
    FROM `physionet-data.mimiciv_derived.first_day_urine_output` As urine_output
""")

In [38]:
# Persist the urine output to Google Drive (row index omitted); the join stage
# reloads this file as `cohort_urine`.
cohort_urine_output.to_csv('/content/drive/MyDrive/cohort_urine_output.csv',index=False)

In [50]:
# Input/output 2: vasoactive agents
# Table: mimiciv_derived.vasoactive_agent
#
# Sub-query (SQL text only, nothing is executed here): for every ICU stay, the
# vasoactive_agent rows of that stay whose starttime is at most 24 HOUR units after
# icu_intime, with one column per agent (dopamine, epinephrine, norepinephrine,
# phenylephrine, vasopressin, dobutamine, milrinone).
#
# NOTE(review): the condition only caps the difference at 24; rows with a negative
# difference (starttime before icu_intime) also pass -- confirm this is intended.
vasopressin_first_stay_day = """
    SELECT icustays.stay_id As stay_id,
           vasopressin.dopamine,
           vasopressin.epinephrine,
           vasopressin.norepinephrine,
           vasopressin.phenylephrine,
           vasopressin.vasopressin,
           vasopressin.dobutamine,
           vasopressin.milrinone
    FROM `physionet-data.mimiciv_derived.icustay_detail` AS icustays
    Inner JOIN `physionet-data.mimiciv_derived.vasoactive_agent` As vasopressin
      ON vasopressin.stay_id = icustays.stay_id AND
       DATETIME_DIFF(CAST(vasopressin.starttime AS DATETIME) , CAST(icustays.icu_intime AS DATETIME) , HOUR) <= 24
"""

In [51]:
# Collapse the first-day vasoactive-agent rows to one row per stay_id by summing each
# agent column over the rows returned by the vasopressin_first_stay_day sub-query.
# Stays with no matching rows do not appear in this result at all, so they end up
# with missing values for these columns after the later LEFT JOIN.
cohort_vasopressin = run_query(f""" 
    SELECT cohort_vasopressin.stay_id,
    sum(cohort_vasopressin.dopamine) As dopamine,
    sum(cohort_vasopressin.epinephrine) As epinephrine,
    sum(cohort_vasopressin.norepinephrine) As norepinephrine,
    sum(cohort_vasopressin.phenylephrine) As phenylephrine,
    sum(cohort_vasopressin.vasopressin) As vasopressin,
    sum(cohort_vasopressin.dobutamine) As dobutamine,
    sum(cohort_vasopressin.milrinone) As milrinone
    FROM ({vasopressin_first_stay_day}) As cohort_vasopressin

    GROUP BY stay_id
    """)

In [52]:
# Persist the per-stay vasoactive-agent sums to Google Drive (row index omitted); the
# join stage reloads them under the same name.
cohort_vasopressin.to_csv('/content/drive/MyDrive/cohort_vasopressin.csv',index=False)

### **Join all result tables**

In [53]:
# Join stage, step 0: reload every intermediate CSV written above from Google Drive.
# From here on the joins work on these DataFrames, so this stage can be run without
# repeating the BigQuery queries as long as the CSV files already exist.
# Several names (e.g. cohort_lab_tests_1, cohort_vasopressin) are re-bound here to the
# CSV round-tripped copy of the frame the query cells created.

cohort_criteria = pd.read_csv('/content/drive/MyDrive/cohort_criteria_data.csv')
cohort_demographic_1 = pd.read_csv('/content/drive/MyDrive/cohort_demographic_1.csv')
cohort_demographic_2 = pd.read_csv('/content/drive/MyDrive/cohort_demographic_2.csv')
cohort_demographic_3 = pd.read_csv('/content/drive/MyDrive/cohort_demographic_3.csv')
cohort_demographic_4 = pd.read_csv('/content/drive/MyDrive/cohort_demographic_4.csv')
cohort_vitalsigns = pd.read_csv('/content/drive/MyDrive/cohort_vitalsigns.csv')
cohort_lab_tests_1 = pd.read_csv('/content/drive/MyDrive/cohort_lab_tests_1.csv')
cohort_lab_tests_2 = pd.read_csv('/content/drive/MyDrive/cohort_lab_tests_2.csv')
cohort_comorbidities_1 = pd.read_csv('/content/drive/MyDrive/cohort_comorbidities_1.csv')
cohort_comorbidities_2 = pd.read_csv('/content/drive/MyDrive/cohort_comorbidities_2.csv')
cohort_comorbidities_3 = pd.read_csv('/content/drive/MyDrive/cohort_comorbidities_3.csv')
cohort_device_use = pd.read_csv('/content/drive/MyDrive/cohort_device_use.csv')
cohort_urine = pd.read_csv('/content/drive/MyDrive/cohort_urine_output.csv')
cohort_vasopressin = pd.read_csv('/content/drive/MyDrive/cohort_vasopressin.csv')


In [54]:
# 1. Join demographic_1 (suspected infection) onto the cohort, matching on stay_id.
# Both tables are selected with `*`, so the ID columns come back twice; the
# duplicates are removed after the query has run.
query1 = """
SELECT DISTINCT cohort_criteria.*,
       cohort_demographic_1.* 
FROM cohort_criteria  As cohort_criteria
LEFT JOIN cohort_demographic_1 AS cohort_demographic_1
   ON cohort_demographic_1.stay_id = cohort_criteria.stay_id 
"""

In [55]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (cohort_criteria, cohort_demographic_1) by variable name.
demographic_1_joined  = sqldf(query1, globals())

In [56]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
demographic_1_joined = demographic_1_joined.loc[
    :, ~demographic_1_joined.columns.duplicated(keep='first')
].copy()

In [57]:
# 2. Join demographic_2 (admission type) onto the running table, matching on hadm_id.
query2 = """
SELECT DISTINCT demographic_1_joined.*,
       cohort_demographic_2.*
FROM demographic_1_joined  As demographic_1_joined
LEFT JOIN cohort_demographic_2 AS cohort_demographic_2
ON demographic_1_joined.hadm_id = cohort_demographic_2.hadm_id
"""

In [58]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (demographic_1_joined, cohort_demographic_2) by variable name.
demographic_2_joined  = sqldf(query2, globals())

In [59]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
demographic_2_joined = demographic_2_joined.loc[
    :, ~demographic_2_joined.columns.duplicated(keep='first')
].copy()

In [60]:
# 3. Join demographic_3 (first-day weight) onto the running table, matching on stay_id.
query3 = """
SELECT DISTINCT demographic_2_joined.*,
      cohort_demographic_3.* 
FROM demographic_2_joined  As demographic_2_joined
LEFT JOIN cohort_demographic_3 AS cohort_demographic_3
ON demographic_2_joined.stay_id = cohort_demographic_3.stay_id
"""

In [61]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (demographic_2_joined, cohort_demographic_3) by variable name.
demographic_3_joined = sqldf(query3, globals())

In [62]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
demographic_3_joined = demographic_3_joined.loc[
    :, ~demographic_3_joined.columns.duplicated(keep='first')
].copy()

In [63]:
# 4. Join demographic_4 (first-day height) onto the running table, matching on stay_id.
query4 = """
SELECT DISTINCT demographic_3_joined.*,
      cohort_demographic_4.*
FROM demographic_3_joined  As demographic_3_joined
LEFT JOIN cohort_demographic_4 AS cohort_demographic_4
ON demographic_3_joined.stay_id = cohort_demographic_4.stay_id
"""

In [64]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (demographic_3_joined, cohort_demographic_4) by variable name.
demographic_4_joined  = sqldf(query4, globals())

In [65]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
demographic_4_joined = demographic_4_joined.loc[
    :, ~demographic_4_joined.columns.duplicated(keep='first')
].copy()

In [66]:
# 5. Join the first-day vital signs onto the running table, matching on stay_id.
query5 = """
SELECT DISTINCT demographic_4_joined.*,
     cohort_vitalsigns.*
FROM demographic_4_joined  As demographic_4_joined
LEFT JOIN cohort_vitalsigns AS cohort_vitalsigns
ON demographic_4_joined.stay_id = cohort_vitalsigns.stay_id
"""

In [67]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (demographic_4_joined, cohort_vitalsigns) by variable name.
vitalsigns_joined  = sqldf(query5, globals())

In [68]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
vitalsigns_joined = vitalsigns_joined.loc[
    :, ~vitalsigns_joined.columns.duplicated(keep='first')
].copy()

In [69]:
# 6. Join lab tests 1 (first_day_lab values) onto the running table, matching on stay_id.
query6 = """
SELECT DISTINCT vitalsigns_joined.*,
     cohort_lab_tests_1.*
FROM vitalsigns_joined As vitalsigns_joined
LEFT JOIN cohort_lab_tests_1 AS cohort_lab_tests_1
ON vitalsigns_joined.stay_id = cohort_lab_tests_1.stay_id
"""

In [70]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (vitalsigns_joined, cohort_lab_tests_1) by variable name.
lab_test_1_joined  = sqldf(query6, globals())

In [71]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
lab_test_1_joined = lab_test_1_joined.loc[
    :, ~lab_test_1_joined.columns.duplicated(keep='first')
].copy()

In [72]:
# 7. Join lab tests 2 (first_day_bg blood-gas values) onto the running table,
# matching on stay_id.
query7 = """
SELECT DISTINCT lab_test_1_joined.*,
     cohort_lab_tests_2.*
FROM lab_test_1_joined As lab_test_1_joined
LEFT JOIN cohort_lab_tests_2 AS cohort_lab_tests_2
ON lab_test_1_joined.stay_id = cohort_lab_tests_2.stay_id
"""

In [73]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (lab_test_1_joined, cohort_lab_tests_2) by variable name.
lab_test_2_joined  = sqldf(query7, globals())

In [74]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
lab_test_2_joined = lab_test_2_joined.loc[
    :, ~lab_test_2_joined.columns.duplicated(keep='first')
].copy()

In [75]:
# 8. Join comorbidity score 1 (Charlson index) onto the running table, matching on
# hadm_id.
query8 = """
SELECT DISTINCT lab_test_2_joined.*,
     cohort_comorbidities_1.*
FROM lab_test_2_joined As lab_test_2_joined
LEFT JOIN cohort_comorbidities_1 AS cohort_comorbidities_1
ON lab_test_2_joined.hadm_id = cohort_comorbidities_1.hadm_id
"""


In [76]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (lab_test_2_joined, cohort_comorbidities_1) by variable name.
comorbidities_1_joined  = sqldf(query8, globals())

In [77]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
comorbidities_1_joined = comorbidities_1_joined.loc[
    :, ~comorbidities_1_joined.columns.duplicated(keep='first')
].copy()

In [78]:
# 9. Join comorbidity score 2 (minimum GCS) onto the running table, matching on
# stay_id.
query9 = """
SELECT DISTINCT comorbidities_1_joined.*,
     cohort_comorbidities_2.*
FROM comorbidities_1_joined As comorbidities_1_joined
LEFT JOIN cohort_comorbidities_2 AS cohort_comorbidities_2
ON comorbidities_1_joined.stay_id = cohort_comorbidities_2.stay_id
"""


In [79]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (comorbidities_1_joined, cohort_comorbidities_2) by variable name.
comorbidities_2_joined  = sqldf(query9, globals())

In [80]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
comorbidities_2_joined = comorbidities_2_joined.loc[
    :, ~comorbidities_2_joined.columns.duplicated(keep='first')
].copy()

In [81]:
# 10. Join comorbidity score 3 (SOFA) onto the running table, matching on stay_id.
query10 = """
SELECT DISTINCT comorbidities_2_joined.*,
     cohort_comorbidities_3.*
FROM comorbidities_2_joined As comorbidities_2_joined
LEFT JOIN cohort_comorbidities_3 AS cohort_comorbidities_3
ON comorbidities_2_joined.stay_id = cohort_comorbidities_3.stay_id
"""

In [82]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (comorbidities_2_joined, cohort_comorbidities_3) by variable name.
comorbidities_3_joined  = sqldf(query10, globals())

In [83]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
comorbidities_3_joined = comorbidities_3_joined.loc[
    :, ~comorbidities_3_joined.columns.duplicated(keep='first')
].copy()

In [84]:
# 11. Join ventilation status onto the running table, matching on stay_id.
# NOTE(review): cohort_device_use was loaded without aggregating per stay_id, so a
# stay with several distinct ventilation_status values yields several output rows
# here (SELECT DISTINCT only removes fully identical rows) -- confirm this is intended.
query11 = """
SELECT DISTINCT comorbidities_3_joined.*,
     cohort_device_use.*
FROM comorbidities_3_joined As comorbidities_3_joined
LEFT JOIN cohort_device_use AS cohort_device_use
ON comorbidities_3_joined.stay_id = cohort_device_use.stay_id
"""


In [85]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (comorbidities_3_joined, cohort_device_use) by variable name.
device_use_joined = sqldf(query11, globals())

In [86]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
device_use_joined = device_use_joined.loc[
    :, ~device_use_joined.columns.duplicated(keep='first')
].copy()

In [87]:
# 12. Join input/output 1 (first-day urine output) onto the running table, matching
# on stay_id.
query12 = """
SELECT DISTINCT device_use_joined.*,
     cohort_urine.*
FROM device_use_joined As device_use_joined
LEFT JOIN  cohort_urine AS  cohort_urine
ON device_use_joined.stay_id =  cohort_urine.stay_id
"""

In [88]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (device_use_joined, cohort_urine) by variable name.
urine_output_joined = sqldf(query12, globals())

In [89]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
urine_output_joined = urine_output_joined.loc[
    :, ~urine_output_joined.columns.duplicated(keep='first')
].copy()

In [90]:
# 13. Join input/output 2 (first-day vasoactive-agent sums) onto the running table,
# matching on stay_id.
query13 = """
SELECT DISTINCT urine_output_joined.*,
     cohort_vasopressin.*
FROM urine_output_joined As urine_output_joined
LEFT JOIN cohort_vasopressin AS cohort_vasopressin
ON urine_output_joined.stay_id = cohort_vasopressin.stay_id
"""

In [91]:
# Run the join with pandasql; globals() is passed so the query can refer to the
# notebook's DataFrames (urine_output_joined, cohort_vasopressin) by variable name.
vasopressin_input_joined = sqldf(query13, globals())

In [92]:
# Drop the repeated ID columns produced by selecting `*` from both sides of the join:
# only the first occurrence of each column name is kept.
vasopressin_input_joined = vasopressin_input_joined.loc[
    :, ~vasopressin_input_joined.columns.duplicated(keep='first')
].copy()

### **Final result table**

In [102]:
# The output of the last join is the final cohort table; copy it so the final table
# is a separate object from the intermediate join result.
initial_cohort = vasopressin_input_joined.copy()

In [105]:
# Write the final cohort table to Google Drive. index=False matches every other
# export in this notebook and keeps the row index out of the file; without it the
# CSV gains an unnamed leading column that shows up as "Unnamed: 0" when read back.
initial_cohort.to_csv('/content/drive/MyDrive/initial_cohort_final.csv', index=False)

In [104]:
# Summary statistics of the final cohort table, displayed as the cell output.
# NOTE(review): the recorded output shows implausible extremes (weight min 1.0 and
# max 1251.0, urineoutput min -1590.0); no outlier cleaning is done in this notebook.
initial_cohort.describe()

Unnamed: 0,subject_id,hadm_id,stay_id,n_stays,sum_los,admission_age,suspected_infection,weight,height,heart_rate_min,...,gcs_min,sofa,urineoutput,dopamine,epinephrine,norepinephrine,phenylephrine,vasopressin,dobutamine,milrinone
count,22071.0,22071.0,22071.0,22071.0,22071.0,22071.0,18017.0,21158.0,13557.0,22029.0,...,22015.0,22071.0,21566.0,465.0,1842.0,5112.0,4976.0,1693.0,361.0,818.0
mean,15004420.0,24987600.0,34961450.0,1.429704,10.077974,60.791083,0.940168,87.046797,170.029554,71.990603,...,11.457188,6.197499,1905.505657,77.22882,0.719172,2.583992,14.583995,42.112632,46.825632,5.537971
std,2903088.0,2882473.0,2900560.0,0.741137,8.996689,16.009583,0.237183,27.071463,10.435485,16.006078,...,3.773051,3.857264,1307.023634,131.950262,1.371509,4.356997,26.435675,35.363039,62.380004,5.838874
min,10002350.0,20001360.0,30000150.0,1.0,3.3,18.0,0.0,1.0,122.0,1.0,...,3.0,0.0,-1590.0,1.500451,0.008003,0.010001,0.050001,0.96,0.500015,0.125004
25%,12459210.0,22533330.0,32451940.0,1.0,4.54,51.0,1.0,70.0,163.0,60.0,...,9.0,3.0,1047.0,15.012234,0.160019,0.40004,2.273994,15.600198,7.496755,1.876385
50%,15055820.0,24976130.0,34907820.0,1.0,6.79,63.0,1.0,83.0,170.0,70.0,...,13.0,6.0,1645.0,34.520197,0.376142,1.190748,6.503536,32.430477,20.029645,4.39776
75%,17528850.0,27463420.0,37450380.0,2.0,12.04,73.0,1.0,99.4,178.0,82.0,...,14.0,9.0,2468.75,81.97297,0.819917,3.013383,16.710587,58.891999,59.45864,7.379839
max,19999440.0,29999620.0,39999810.0,7.0,101.75,90.0,1.0,1251.0,203.0,157.0,...,15.0,21.0,30595.0,1117.700063,25.719639,111.513319,505.147697,376.997914,389.320802,100.715046
