In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Data Cleaning

In [2]:
# Load the raw patients table and preview its first rows
patients_df = pd.read_csv('./raw/patients.csv')
patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000058,F,33,2168,2020 - 2022,
3,10000068,F,19,2160,2008 - 2010,
4,10000084,M,72,2160,2017 - 2019,2161-02-13


In [3]:
# Load the raw admissions table and preview its first rows
admissions_df = pd.read_csv('./raw/admissions.csv')
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [4]:
# Load the raw ICU stays table and preview its first rows
icustays_df = pd.read_csv('./raw/icustays.csv')
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000690,25860671,37081114,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252
2,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
3,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
4,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113


### Patients

In [5]:
# Re-display the raw patients table before trimming its columns
patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000058,F,33,2168,2020 - 2022,
3,10000068,F,19,2160,2008 - 2010,
4,10000084,M,72,2160,2017 - 2019,2161-02-13


In [6]:
# Keep the identifier, gender, and anchor age; expose `anchor_age` under the shorter name `age`
patients_df = (
    patients_df[['subject_id', 'gender', 'anchor_age']]
    .rename(columns={'anchor_age': 'age'})
)

patients_df.head()

Unnamed: 0,subject_id,gender,age
0,10000032,F,52
1,10000048,F,23
2,10000058,F,33
3,10000068,F,19
4,10000084,M,72


In [7]:
# Per-column tally of missing entries in the patients table
patients_df.isna().sum()

subject_id    0
gender        0
age           0
dtype: int64

In [8]:
# One-hot encode gender. drop_first=True drops the first dummy level, and the
# remaining `gender_M` indicator is renamed to `Male`.
patients_df = (
    pd.get_dummies(patients_df, columns=['gender'], drop_first=True)
    .rename(columns={'gender_M': 'Male'})
)
patients_df.head()

Unnamed: 0,subject_id,age,Male
0,10000032,52,False
1,10000048,23,False
2,10000058,33,False
3,10000068,19,False
4,10000084,72,True


Reduce memory usage by downcasting the columns whose data types are larger than necessary.

In [9]:
print("Before downcasting:", patients_df.memory_usage(deep=True).sum(), "bytes")

# Shrink each column to a narrower unsigned integer type in a single astype call.
# Assumes every subject_id fits in 32 unsigned bits and every age in 8 - TODO confirm.
patients_df = patients_df.astype({
    'subject_id': np.uint32,
    'age': np.uint8,
    'Male': np.uint8,
})

print("After downcasting:", patients_df.memory_usage(deep=True).sum(), "bytes")

patients_df.head()

Before downcasting: 6198791 bytes
After downcasting: 2187894 bytes


Unnamed: 0,subject_id,age,Male
0,10000032,52,0
1,10000048,23,0
2,10000058,33,0
3,10000068,19,0
4,10000084,72,1


### Admissions

In [10]:
# Re-display the raw admissions table before trimming its columns
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [11]:
# Keep only the columns used by the cleaning steps below.
# The explicit .copy() makes the subset an independent frame, so the later
# in-place column assignments act on this frame alone and cannot raise
# chained-assignment warnings on pandas versions without copy-on-write.
admissions_df = admissions_df[
    ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'admission_type', 'insurance', 'race']
].copy()
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,insurance,race
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,URGENT,Medicaid,WHITE
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,EW EMER.,Medicaid,WHITE
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,EW EMER.,Medicaid,WHITE
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,EW EMER.,Medicaid,WHITE
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,EU OBSERVATION,,WHITE


In [12]:
# Per-column tally of missing entries in the admissions table
admissions_df.isna().sum()

subject_id           0
hadm_id              0
admittime            0
dischtime            0
admission_type       0
insurance         9355
race                 0
dtype: int64

So there are many `null` values in the `insurance` column.

In [13]:
# Frequency of each insurance category (value_counts leaves out missing values by default)
admissions_df['insurance'].value_counts()

insurance
Medicare     244576
Private      173399
Medicaid     104229
Other         14006
No charge       463
Name: count, dtype: int64

We can replace `null` with `No charge`, on the assumption that the patient could not present his/her insurance at that time.

In [14]:
# Replace missing insurance values with 'No charge'.
# NOTE(review): this merges the missing rows into the existing 'No charge'
# category, so the two can no longer be told apart afterwards - confirm intended.
admissions_df = admissions_df.fillna({'insurance': 'No charge'})

In [15]:
# Frequency of each admission type
admissions_df['admission_type'].value_counts()

admission_type
EW EMER.                       177459
EU OBSERVATION                 119456
OBSERVATION ADMIT               84437
URGENT                          54929
SURGICAL SAME DAY ADMISSION     42898
DIRECT OBSERVATION              24551
DIRECT EMER.                    21973
ELECTIVE                        13130
AMBULATORY OBSERVATION           7195
Name: count, dtype: int64

In [16]:
# Frequency of each race label
admissions_df['race'].value_counts()

race
WHITE                                        336538
BLACK/AFRICAN AMERICAN                        75482
OTHER                                         19788
WHITE - OTHER EUROPEAN                        13972
UNKNOWN                                       13870
HISPANIC/LATINO - PUERTO RICAN                10903
HISPANIC OR LATINO                             8287
ASIAN                                          7809
ASIAN - CHINESE                                7644
WHITE - RUSSIAN                                6597
BLACK/CAPE VERDEAN                             6205
HISPANIC/LATINO - DOMINICAN                    6070
BLACK/CARIBBEAN ISLAND                         3875
BLACK/AFRICAN                                  3495
UNABLE TO OBTAIN                               3478
PATIENT DECLINED TO ANSWER                     2162
PORTUGUESE                                     2082
ASIAN - SOUTH EAST ASIAN                       1973
WHITE - EASTERN EUROPEAN                       1886
HISPANI

In [17]:
# Number of distinct race labels (each one becomes a dummy column when one-hot encoded)
admissions_df['race'].nunique()

33

In [18]:
# One-hot encode the three categorical admission attributes as uint8 indicators;
# drop_first=True omits the first level of each attribute.
admissions_df = pd.get_dummies(
    data=admissions_df,
    columns=['admission_type', 'insurance', 'race'],
    drop_first=True,
    dtype='uint8',
)
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
# Inspect column dtypes and non-null counts after encoding
admissions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546028 entries, 0 to 546027
Data columns (total 48 columns):
 #   Column                                          Non-Null Count   Dtype 
---  ------                                          --------------   ----- 
 0   subject_id                                      546028 non-null  int64 
 1   hadm_id                                         546028 non-null  int64 
 2   admittime                                       546028 non-null  object
 3   dischtime                                       546028 non-null  object
 4   admission_type_DIRECT EMER.                     546028 non-null  uint8 
 5   admission_type_DIRECT OBSERVATION               546028 non-null  uint8 
 6   admission_type_ELECTIVE                         546028 non-null  uint8 
 7   admission_type_EU OBSERVATION                   546028 non-null  uint8 
 8   admission_type_EW EMER.                         546028 non-null  uint8 
 9   admission_type_OBSERVATION ADMIT     

Compute the total time the patient spent in the hospital during an admission.

**los_admission = (dischtime - admittime)/ seconds_in_a_day**

In [20]:
# Length of stay per admission in days: parse both timestamps, take their
# difference in seconds, and divide by the number of seconds in a day.
# The raw timestamp columns are dropped once the duration has been derived.
admissions_df = (
    admissions_df
    .assign(
        los_admission=lambda df: (
            pd.to_datetime(df['dischtime']) - pd.to_datetime(df['admittime'])
        ).dt.total_seconds() / (24 * 60 * 60)
    )
    .drop(columns=['admittime', 'dischtime'])
)

In [21]:
# Preview the admissions table with the derived los_admission column
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,...,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,los_admission
0,10000032,22595853,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0.786111
1,10000032,22841357,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1.015278
2,10000032,25742920,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1.754167
3,10000032,29079034,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2.222222
4,10000068,25022803,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.298611


Downcasting

In [22]:
# NOTE(review): this cell repeats the patients downcast from the Patients section.
# The columns already have these dtypes, so the reported sizes before and after
# are identical; it looks like a copy-paste leftover - confirm it can be removed.
print("Before downcasting:", patients_df.memory_usage(deep=True).sum(), "bytes")

# Cast each patients column to its narrow unsigned integer type in one call
patients_df = patients_df.astype({
    'subject_id': np.uint32,
    'age': np.uint8,
    'Male': np.uint8,
})

print("After downcasting:", patients_df.memory_usage(deep=True).sum(), "bytes")

patients_df.head()

Before downcasting: 2187894 bytes
After downcasting: 2187894 bytes


Unnamed: 0,subject_id,age,Male
0,10000032,52,0
1,10000048,23,0
2,10000058,33,0
3,10000068,19,0
4,10000084,72,1


In [23]:
print("Before downcasting:", admissions_df.memory_usage(deep=True).sum(), "bytes")

# Narrow the identifier columns to 32-bit unsigned integers and the stay length
# to a 32-bit float in a single astype call.
# Assumes every id fits in 32 unsigned bits - TODO confirm.
admissions_df = admissions_df.astype({
    'subject_id': 'uint32',
    'hadm_id': 'uint32',
    'los_admission': 'float32',
})

print("After downcasting:", admissions_df.memory_usage(deep=True).sum(), "bytes")

admissions_df.head()

Before downcasting: 37130036 bytes
After downcasting: 30577700 bytes


Unnamed: 0,subject_id,hadm_id,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,...,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,los_admission
0,10000032,22595853,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0.786111
1,10000032,22841357,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1.015278
2,10000032,25742920,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1.754167
3,10000032,29079034,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2.222222
4,10000068,25022803,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.298611


### ICUstays

In [24]:
# Re-display the raw ICU stays table before trimming its columns
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266
1,10000690,25860671,37081114,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252
2,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535
3,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032
4,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113


In [25]:
# Keep only the columns used by the cleaning steps below.
# The explicit .copy() makes the subset an independent frame, so the later
# in-place column assignments act on this frame alone and cannot raise
# chained-assignment warnings on pandas versions without copy-on-write.
icustays_df = icustays_df[
    ['subject_id', 'hadm_id', 'stay_id', 'intime', 'outtime', 'first_careunit']
].copy()
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,first_careunit
0,10000032,29079034,39553978,2180-07-23 14:00:00,2180-07-23 23:50:47,Medical Intensive Care Unit (MICU)
1,10000690,25860671,37081114,2150-11-02 19:37:00,2150-11-06 17:03:17,Medical Intensive Care Unit (MICU)
2,10000980,26913865,39765666,2189-06-27 08:42:00,2189-06-27 20:38:27,Medical Intensive Care Unit (MICU)
3,10001217,24597018,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00,Surgical Intensive Care Unit (SICU)
4,10001217,27703517,34592300,2157-12-19 15:42:24,2157-12-20 14:27:41,Surgical Intensive Care Unit (SICU)


In [26]:
# Derive los_icu (ICU length of stay in days) from intime and outtime.
# assign() evaluates its keyword arguments in order, so los_icu is computed
# from the already-parsed timestamp columns.
icustays_df = icustays_df.assign(
    intime=lambda df: pd.to_datetime(df['intime']),
    outtime=lambda df: pd.to_datetime(df['outtime']),
    los_icu=lambda df: (df['outtime'] - df['intime']).dt.total_seconds() / (24 * 60 * 60),
)

In [27]:
# Preview the ICU stays table with the derived los_icu column
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,first_careunit,los_icu
0,10000032,29079034,39553978,2180-07-23 14:00:00,2180-07-23 23:50:47,Medical Intensive Care Unit (MICU),0.410266
1,10000690,25860671,37081114,2150-11-02 19:37:00,2150-11-06 17:03:17,Medical Intensive Care Unit (MICU),3.893252
2,10000980,26913865,39765666,2189-06-27 08:42:00,2189-06-27 20:38:27,Medical Intensive Care Unit (MICU),0.497535
3,10001217,24597018,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00,Surgical Intensive Care Unit (SICU),1.118032
4,10001217,27703517,34592300,2157-12-19 15:42:24,2157-12-20 14:27:41,Surgical Intensive Care Unit (SICU),0.948113


In [28]:
# Frequency of each first care unit, used to spot rarely occurring units
icustays_df['first_careunit'].value_counts()

first_careunit
Medical Intensive Care Unit (MICU)                  20703
Medical/Surgical Intensive Care Unit (MICU/SICU)    15449
Cardiac Vascular Intensive Care Unit (CVICU)        14771
Surgical Intensive Care Unit (SICU)                 13009
Coronary Care Unit (CCU)                            10775
Trauma SICU (TSICU)                                 10474
Neuro Intermediate                                   5776
Neuro Surgical Intensive Care Unit (Neuro SICU)      1751
Neuro Stepdown                                       1421
Surgery/Vascular/Intermediate                         145
PACU                                                  122
Intensive Care Unit (ICU)                              33
Medicine                                               16
Surgery/Trauma                                         10
Medicine/Cardiology Intermediate                        1
Med/Surg                                                1
Neurology                                               1

In [29]:
# icustays_df.shape = (94458, 7)
# Drop rows whose first_careunit value occurs less than MIN_CAREUNIT_COUNT times.
# The counts are computed once and all rare units are removed with a single
# boolean mask, instead of recounting and re-filtering the frame for every unit.
MIN_CAREUNIT_COUNT = 150

careunit_counts = icustays_df['first_careunit'].value_counts()
rare_units = careunit_counts[careunit_counts < MIN_CAREUNIT_COUNT].index

# Rows with a missing first_careunit are never in rare_units, so they are kept.
icustays_df = icustays_df[~icustays_df['first_careunit'].isin(rare_units)]

In [30]:
# One-hot encode the first care unit as uint8 indicators;
# drop_first=True omits the first level.
icustays_df = pd.get_dummies(
    data=icustays_df,
    columns=['first_careunit'],
    drop_first=True,
    dtype='uint8',
)
icustays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,los_icu,first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU)
0,10000032,29079034,39553978,2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266,0,1,0,0,0,0,0,0
1,10000690,25860671,37081114,2150-11-02 19:37:00,2150-11-06 17:03:17,3.893252,0,1,0,0,0,0,0,0
2,10000980,26913865,39765666,2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535,0,1,0,0,0,0,0,0
3,10001217,24597018,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,0,0,0,0,0,0,1,0
4,10001217,27703517,34592300,2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113,0,0,0,0,0,0,1,0


In [31]:
print("Before downcasting:", icustays_df.memory_usage(deep=True).sum(), "bytes")

# Narrow the three identifier columns to 32-bit unsigned integers and the stay
# length to a 32-bit float in a single astype call.
# Assumes every id fits in 32 unsigned bits - TODO confirm.
icustays_df = icustays_df.astype({
    'subject_id': 'uint32',
    'hadm_id': 'uint32',
    'stay_id': 'uint32',
    'los_icu': 'float32',
})

print("After downcasting:", icustays_df.memory_usage(deep=True).sum(), "bytes")

Before downcasting: 6024256 bytes
After downcasting: 4518192 bytes


Now, save them all!

In [32]:
# Write the three cleaned tables to ./cleaned as CSV files without the index.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first; this keeps the cell runnable on a fresh checkout.
cleaned_dir = Path('./cleaned')
cleaned_dir.mkdir(parents=True, exist_ok=True)

# NOTE: CSV is plain text and stores no dtype information, so the downcast
# dtypes have to be specified again when these files are read back.
patients_df.to_csv(cleaned_dir / 'patients_cleaned.csv', index=False)
admissions_df.to_csv(cleaned_dir / 'admissions_cleaned.csv', index=False)
icustays_df.to_csv(cleaned_dir / 'icustays_cleaned.csv', index=False)