<a href="https://colab.research.google.com/github/lasiadhi/Prediction-of-hypoxemia-trend/blob/master/Generate_demographics_and_other_variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#################################################
###### @author: Lasith Adhikari (python 3)  #####
############ Created on Nov 27, 2019 ############
# Code description: 
# Generate demographics and other variables

In [2]:
# Import libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
import tensorflow as tf

# Below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

# Imports for accessing Datathon data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [0]:
from google.colab import files
import seaborn as sns
import pickle
import matplotlib.pyplot as plt 
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

In [0]:
auth.authenticate_user()

In [0]:
# Note that this should be the project for the datathon work,
# not the physionet-data project which is for data hosting.
project_id = 'hst-953-2019'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

In [0]:
# Read data from BigQuery into pandas dataframes.
def run_query(query):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

In [7]:

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# read ICU IV start times:
first_icuid_dict = pickle.load(open("/content/drive/My Drive/Colab Notebooks/data/IV_LOS24_adult_firstICU_dict.pkl", "rb" ) )

In [9]:
len(first_icuid_dict.keys())

38823

## Read patient deom and admission related data

In [0]:
df_patient = run_query("""
select patientunitstayid, gender, age, ethnicity, hospitalid, wardid, admissionheight,admissionweight, hospitaladmitsource, unittype, unitstaytype 
from `physionet-data.eicu_crd.patient`
"""
)

In [11]:
df_patient.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,wardid,admissionheight,admissionweight,hospitaladmitsource,unittype,unitstaytype
0,1137569,Female,75,Other/Unknown,208,513,172.7,95.7,Acute Care/Floor,MICU,readmit
1,3036927,Male,51,Caucasian,420,1026,177.8,,Emergency Department,Med-Surg ICU,admit
2,3058863,Female,31,Caucasian,420,1026,162.6,,Emergency Department,Med-Surg ICU,admit
3,3072720,Male,58,Caucasian,420,1026,173.0,,Other Hospital,Med-Surg ICU,admit
4,3075429,Female,68,Caucasian,420,1026,165.1,,Other Hospital,Med-Surg ICU,admit


## Keep required ICU stay ids

In [0]:
df_required_icu_stays =  df_patient[df_patient.patientunitstayid.isin(first_icuid_dict.keys())].reset_index(drop=True)

In [19]:
df_required_icu_stays.shape

(38823, 11)

## Compute BMI
Formula: weight (kg) / [height (m)]2

In [0]:
df_required_icu_stays['BMI'] = df_required_icu_stays.apply(lambda row: round(row['admissionweight']/(row['admissionheight']/100.0)**2,2) if (~np.isnan(row['admissionheight'])) & (row['admissionheight']!=0) else np.NaN ,axis=1)

In [40]:
sum(df_required_icu_stays['BMI'].isnull())

749

In [41]:
df_required_icu_stays.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,wardid,admissionheight,admissionweight,hospitaladmitsource,unittype,unitstaytype,BMI
0,3132737,Male,55,African American,428,1048,162.56,51.5,,Med-Surg ICU,admit,19.49
1,3139462,Male,83,Caucasian,428,1048,172.72,58.0,,Med-Surg ICU,admit,19.44
2,2259342,Female,53,,301,798,157.0,102.0,,Med-Surg ICU,admit,41.38
3,3168101,Female,61,African American,443,1068,172.0,64.0,,Neuro ICU,admit,21.63
4,2130984,Female,62,Caucasian,310,836,162.6,37.5,,Med-Surg ICU,admit,14.18


## Clean variables

In [0]:
### gender:

In [45]:
df_required_icu_stays.gender.value_counts()

Male       22054
Female     16760
Other          4
Unknown        4
               1
Name: gender, dtype: int64

In [0]:
df_required_icu_stays.loc[df_required_icu_stays.gender=='','gender'] = 'Unknown'

In [48]:
df_required_icu_stays.gender.value_counts()

Male       22054
Female     16760
Unknown        5
Other          4
Name: gender, dtype: int64

In [0]:
### age

In [0]:
df_required_icu_stays.age = pd.to_numeric(df_required_icu_stays.age, errors='coerce')

In [53]:
df_required_icu_stays.age.describe()

count    38136.000000
mean        62.118523
std         15.643424
min         15.000000
25%         53.000000
50%         64.000000
75%         74.000000
max         89.000000
Name: age, dtype: float64

In [54]:
sum(df_required_icu_stays.age.isnull())

687

In [0]:
### ethnicity

In [56]:
df_required_icu_stays.ethnicity.value_counts()

Caucasian           29698
African American     4438
Other/Unknown        1832
Hispanic             1430
Asian                 683
                      444
Native American       298
Name: ethnicity, dtype: int64

In [0]:
df_required_icu_stays.loc[df_required_icu_stays.ethnicity=='','ethnicity'] = 'Other/Unknown'

In [58]:
df_required_icu_stays.ethnicity.value_counts()

Caucasian           29698
African American     4438
Other/Unknown        2276
Hispanic             1430
Asian                 683
Native American       298
Name: ethnicity, dtype: int64

In [0]:
### hospitaladmitsource

In [59]:
df_required_icu_stays.hospitaladmitsource.value_counts()

Emergency Department    12575
                        10086
Operating Room           7135
Floor                    3024
Direct Admit             2574
Other Hospital           1008
Recovery Room             817
Step-Down Unit (SDU)      621
Acute Care/Floor          567
PACU                      200
Other ICU                 119
Chest Pain Center          49
ICU to SDU                 27
ICU                        18
Observation                 3
Name: hospitaladmitsource, dtype: int64

In [0]:
df_required_icu_stays.loc[df_required_icu_stays.hospitaladmitsource=='','hospitaladmitsource'] = 'Unknown'

In [61]:
df_required_icu_stays.hospitaladmitsource.value_counts()

Emergency Department    12575
Unknown                 10086
Operating Room           7135
Floor                    3024
Direct Admit             2574
Other Hospital           1008
Recovery Room             817
Step-Down Unit (SDU)      621
Acute Care/Floor          567
PACU                      200
Other ICU                 119
Chest Pain Center          49
ICU to SDU                 27
ICU                        18
Observation                 3
Name: hospitaladmitsource, dtype: int64

In [0]:
### unittype

In [97]:
df_required_icu_stays.unittype.value_counts()

Med-Surg ICU    19594
CCU-CTICU        3772
MICU             3625
SICU             2850
Neuro ICU        2635
CTICU            2261
CSICU            2079
Cardiac ICU      2007
Name: unittype, dtype: int64

In [0]:
### unitstaytype

In [95]:
df_required_icu_stays.unitstaytype.value_counts()

admit             36948
transfer           1735
readmit              99
stepdown/other       41
Name: unitstaytype, dtype: int64

In [0]:
### BMI:

In [66]:
df_required_icu_stays['BMI'].describe()

count     38074.000000
mean        163.599577
std        6446.945383
min           0.000000
25%          23.850000
50%          28.100000
75%          33.570000
max      549218.750000
Name: BMI, dtype: float64

In [0]:
# Group BMI based on the definition: https://en.wikipedia.org/wiki/Body_mass_index#/media/File:BMI_chart.svg

In [0]:
def BMI_label(bmi):
  if bmi < 18.5:
    return 'Underweight'
  elif (bmi>=18.5) & (bmi<25):
    return 'Normal weight'
  elif (bmi>=25) & (bmi<30):
    return 'Overweight'
  elif bmi>=30:
    return 'Obese'

In [0]:
df_required_icu_stays['BMI_category'] = df_required_icu_stays['BMI'].apply(BMI_label)

In [83]:
df_required_icu_stays.BMI_category.value_counts()

Obese            15160
Overweight       10927
Normal weight    10387
Underweight       1600
Name: BMI_category, dtype: int64

In [0]:
df_required_icu_stays.loc[df_required_icu_stays.BMI_category.isnull(),'BMI_category'] = 'Unknown'

In [93]:
df_required_icu_stays.BMI_category.value_counts()

Obese            15160
Overweight       10927
Normal weight    10387
Underweight       1600
Unknown            749
Name: BMI_category, dtype: int64

In [98]:
df_required_icu_stays.head()

Unnamed: 0,patientunitstayid,gender,age,ethnicity,hospitalid,wardid,admissionheight,admissionweight,hospitaladmitsource,unittype,unitstaytype,BMI,BMI_category
0,3132737,Male,55.0,African American,428,1048,162.56,51.5,Unknown,Med-Surg ICU,admit,19.49,Normal weight
1,3139462,Male,83.0,Caucasian,428,1048,172.72,58.0,Unknown,Med-Surg ICU,admit,19.44,Normal weight
2,2259342,Female,53.0,Other/Unknown,301,798,157.0,102.0,Unknown,Med-Surg ICU,admit,41.38,Obese
3,3168101,Female,61.0,African American,443,1068,172.0,64.0,Unknown,Neuro ICU,admit,21.63,Normal weight
4,2130984,Female,62.0,Caucasian,310,836,162.6,37.5,Unknown,Med-Surg ICU,admit,14.18,Underweight


In [0]:
df_required_icu_stays[['patientunitstayid','gender','age','ethnicity','hospitalid','wardid','hospitaladmitsource','unittype','unitstaytype','BMI','BMI_category']].to_csv('demo_variables.csv')
#files.download('demo_variables.csv')