## Libraries 

In [1]:
import pandas as pd
import numpy as np
import os
# Change the working directory to the parent of the notebook's directory.
# os.chdir() persists for the life of the kernel, so resolving '..' against the
# *current* directory would climb one more level every time this cell is
# re-run. Remember the directory the kernel started in and always resolve the
# parent from that, which makes the cell safe to run repeatedly.
if '_notebook_start_dir' not in globals():
    _notebook_start_dir = os.path.abspath(os.getcwd())
current_dir = _notebook_start_dir

parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
os.chdir(parent_dir)

## Loading data from the MIMIC-IV dataset

Preprocessing can be performed in the same way for data intended for 12-, 24-, and 48-hour predictions. The data cleaning and transformation steps are identical in each case, meaning that only the file paths need to be adjusted for each time-based dataset. Once the paths are updated, the preprocessing process automatically adapts without requiring additional modifications, ensuring an efficient and consistent workflow.

In [2]:
# Locations of the input files. To preprocess another prediction window,
# change these two paths only; every read below is derived from them.
DATA_DIR = "Data_johan/mimic/48h_24h"
COHORT_FILE = 'datapre/mimic/conflag_mimic_48_72.csv'

data_test = pd.read_csv(f"{DATA_DIR}/demograficos.csv")    # demographics
data_test1 = pd.read_csv(f"{DATA_DIR}/vitalsigns.csv")     # vital signs
data_test2 = pd.read_csv(f"{DATA_DIR}/lab.csv")            # laboratory values
data_test3 = pd.read_csv(f"{DATA_DIR}/vasopresores.csv")   # vasopressors
data_test4 = pd.read_csv(f"{DATA_DIR}/antibioticos.csv")   # antibiotics
data_test5 = pd.read_csv(f"{DATA_DIR}/gcs.csv")            # Glasgow Coma Scale
data_test6 = pd.read_csv(f"{DATA_DIR}/urineoutput.csv")    # urine output
data_test7 = pd.read_csv(f"{DATA_DIR}/sofa.csv")           # SOFA score
# Table whose stay_id column is used below to filter every other table.
data_test8 = pd.read_csv(COHORT_FILE)

In [3]:
# Preview the first rows of the demographics table (demograficos.csv).
print(data_test.head())

   subject_id   hadm_id   stay_id gender                    race  \
0    10000690  25860671  37081114      F                   WHITE   
1    10001884  26184834  37510196      F  BLACK/AFRICAN AMERICAN   
2    10002155  23822395  33685454      F                   WHITE   
3    10002348  22725460  32610785      F                   WHITE   
4    10002428  28662225  38875437      F                   WHITE   

   admission_age  los_icu  hospital_expire_flag first_icu_stay  
0      86.837120     3.89                     0              t  
1      77.018296     9.17                     1              t  
2      81.592179     6.18                     0              t  
3      77.917014     9.79                     0              t  
4      81.280232     7.03                     0              f  


In [4]:
# Column dtypes and non-null counts of the demographics table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_id            10630 non-null  int64  
 1   hadm_id               10630 non-null  int64  
 2   stay_id               10630 non-null  int64  
 3   gender                10630 non-null  object 
 4   race                  10630 non-null  object 
 5   admission_age         10630 non-null  float64
 6   los_icu               10630 non-null  float64
 7   hospital_expire_flag  10630 non-null  int64  
 8   first_icu_stay        10630 non-null  object 
dtypes: float64(2), int64(4), object(3)
memory usage: 747.6+ KB
None


In [5]:
# Summary statistics (count, mean, std, quartiles) of the numeric demographic columns.
print(data_test.describe())

         subject_id       hadm_id       stay_id  admission_age       los_icu  \
count  1.063000e+04  1.063000e+04  1.063000e+04   10630.000000  10630.000000   
mean   1.501466e+07  2.501610e+07  3.491269e+07      66.676217      7.514547   
std    2.873762e+06  2.881380e+06  2.885487e+06      16.768537      7.500857   
min    1.000069e+07  2.000130e+07  3.000481e+07      18.133808      2.000000   
25%    1.257655e+07  2.253831e+07  3.240168e+07      56.365412      2.980000   
50%    1.501426e+07  2.503430e+07  3.485589e+07      68.269995      4.790000   
75%    1.752694e+07  2.749369e+07  3.740836e+07      79.636882      9.000000   
max    1.999984e+07  2.999910e+07  3.999923e+07     100.516763    101.730000   

       hospital_expire_flag  
count          10630.000000  
mean               0.138758  
std                0.345710  
min                0.000000  
25%                0.000000  
50%                0.000000  
75%                0.000000  
max                1.000000  


## Data treatment for the age column

In [6]:
# Display unique values in the 'admission_age' column.
# The values are fractional years; the next cell floors them to whole years.
print(data_test['admission_age'].unique())

[86.83712002 77.01829586 81.59217876 ... 53.50543262 42.61875832
 58.69426382]


In [7]:
# Round down and convert to integer.
# The dtype is spelled out as 'int64' (matching the gender encoding below)
# because plain `int` resolves to the platform's default integer, which can be
# 32-bit (e.g. on Windows with NumPy 1.x) and would make the schema
# machine-dependent.
data_test['admission_age'] = np.floor(data_test['admission_age']).astype('int64')

In [8]:
# Confirm that 'admission_age' is now an integer column.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_id            10630 non-null  int64  
 1   hadm_id               10630 non-null  int64  
 2   stay_id               10630 non-null  int64  
 3   gender                10630 non-null  object 
 4   race                  10630 non-null  object 
 5   admission_age         10630 non-null  int64  
 6   los_icu               10630 non-null  float64
 7   hospital_expire_flag  10630 non-null  int64  
 8   first_icu_stay        10630 non-null  object 
dtypes: float64(1), int64(5), object(3)
memory usage: 747.6+ KB
None


## Data treatment for the gender column

In [9]:
# Display unique values in the 'gender' column, i.e. the codes that the
# numeric mapping in the next cell has to cover.
print(data_test['gender'].unique())

['F' 'M']


In [10]:
# Define a mapping for the gender values
gender_mapping = {
    'F': 0,
    'M': 1,
}

# Encode the 'gender' column and cast to integer type.
# The lookup is done element-wise instead of with Series.replace(), which
# relies on silently downcasting the object column to integers; pandas has
# deprecated that behaviour and warns about it. Values that are not keys of
# the mapping pass through unchanged (as they did with replace), so the cell
# can be re-run safely and the int64 cast still fails loudly on an unexpected
# code.
data_test['gender'] = (
    data_test['gender']
    .map(lambda value: gender_mapping.get(value, value))
    .astype('int64')
)

  data_test['gender'] = data_test['gender'].replace(gender_mapping).astype('int64')


In [11]:
# Confirm that 'gender' is now an integer column.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_id            10630 non-null  int64  
 1   hadm_id               10630 non-null  int64  
 2   stay_id               10630 non-null  int64  
 3   gender                10630 non-null  int64  
 4   race                  10630 non-null  object 
 5   admission_age         10630 non-null  int64  
 6   los_icu               10630 non-null  float64
 7   hospital_expire_flag  10630 non-null  int64  
 8   first_icu_stay        10630 non-null  object 
dtypes: float64(1), int64(6), object(2)
memory usage: 747.6+ KB
None


## Data treatment for the race column

In [12]:
# Display unique values in the 'race' column, i.e. the categories that the
# numeric mapping in the next cell has to cover.
print(data_test['race'].unique())



['WHITE' 'BLACK/AFRICAN AMERICAN' 'PORTUGUESE' 'BLACK/CAPE VERDEAN'
 'WHITE - OTHER EUROPEAN' 'WHITE - BRAZILIAN' 'BLACK/AFRICAN'
 'BLACK/CARIBBEAN ISLAND' 'ASIAN' 'UNKNOWN' 'HISPANIC OR LATINO'
 'HISPANIC/LATINO - PUERTO RICAN' 'ASIAN - CHINESE' 'OTHER'
 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' 'WHITE - EASTERN EUROPEAN'
 'HISPANIC/LATINO - DOMINICAN' 'HISPANIC/LATINO - CENTRAL AMERICAN'
 'HISPANIC/LATINO - COLUMBIAN' 'ASIAN - ASIAN INDIAN'
 'PATIENT DECLINED TO ANSWER' 'WHITE - RUSSIAN' 'ASIAN - SOUTH EAST ASIAN'
 'HISPANIC/LATINO - GUATEMALAN' 'UNABLE TO OBTAIN'
 'HISPANIC/LATINO - HONDURAN' 'AMERICAN INDIAN/ALASKA NATIVE'
 'ASIAN - KOREAN' 'SOUTH AMERICAN' 'HISPANIC/LATINO - CUBAN'
 'MULTIPLE RACE/ETHNICITY' 'HISPANIC/LATINO - SALVADORAN'
 'HISPANIC/LATINO - MEXICAN']


In [13]:
# Define a mapping from the raw race/ethnicity labels to integer group codes:
#   0 = White, 1 = Asian, 2 = Black, 3 = Hispanic/Latino (incl. South American),
#   4 = American Indian/Alaska Native, 5 = other / unknown / not reported.
ethnicity_mapping = {
    'WHITE': 0,
    'WHITE - OTHER EUROPEAN': 0,
    'WHITE - BRAZILIAN': 0,
    'WHITE - EASTERN EUROPEAN': 0,
    'WHITE - RUSSIAN': 0,

    'BLACK/AFRICAN AMERICAN': 2,
    'BLACK/CAPE VERDEAN': 2,
    'BLACK/AFRICAN': 2,
    'BLACK/CARIBBEAN ISLAND': 2,

    'ASIAN': 1,
    'ASIAN - CHINESE': 1,
    'ASIAN - ASIAN INDIAN': 1,
    'ASIAN - SOUTH EAST ASIAN': 1,
    'ASIAN - KOREAN': 1,

    'HISPANIC OR LATINO': 3,
    'HISPANIC/LATINO - PUERTO RICAN': 3,
    'HISPANIC/LATINO - DOMINICAN': 3,
    'HISPANIC/LATINO - CENTRAL AMERICAN': 3,
    'HISPANIC/LATINO - COLUMBIAN': 3,
    'HISPANIC/LATINO - GUATEMALAN': 3,
    'HISPANIC/LATINO - HONDURAN': 3,
    'HISPANIC/LATINO - CUBAN': 3,
    'HISPANIC/LATINO - SALVADORAN': 3,
    'HISPANIC/LATINO - MEXICAN': 3,
    'SOUTH AMERICAN': 3,

    'AMERICAN INDIAN/ALASKA NATIVE': 4,

    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 5,
    'PORTUGUESE': 5,
    'UNKNOWN': 5,
    'OTHER': 5,
    'PATIENT DECLINED TO ANSWER': 5,
    'UNABLE TO OBTAIN': 5,
    'MULTIPLE RACE/ETHNICITY': 5
}

# Encode the 'race' column using the mapping.
# The lookup is done element-wise instead of with Series.replace(), which only
# produced an integer column through silent downcasting; pandas has deprecated
# that behaviour and warns about it. Values that are not keys of the mapping
# pass through unchanged (as they did with replace), so re-running the cell is
# harmless.
encoded_race = data_test['race'].map(lambda value: ethnicity_mapping.get(value, value))

# A label missing from the mapping would otherwise leave a mixed str/int column
# behind; stop here and name the offending labels instead.
unmapped_race = sorted({value for value in encoded_race if isinstance(value, str)})
if unmapped_race:
    raise ValueError(f"Unmapped race categories: {unmapped_race}")

# Explicit cast so the column dtype does not depend on pandas' inference.
data_test['race'] = encoded_race.astype('int64')

  data_test['race'] = data_test['race'].replace(ethnicity_mapping)


## Filtering ICU Stay Data: Retaining Only Initial ICU Admissions

In [14]:
# List the distinct 'first_icu_stay' flags before filtering on them.
print(data_test['first_icu_stay'].unique())

['t' 'f']


In [15]:
# Restrict the table to first ICU stays (flag 't'), then discard the two
# columns that are not carried forward: 'hadm_id' and the flag itself.
is_first_stay = data_test['first_icu_stay'] == 't'
data_test = data_test.loc[is_first_stay].drop(columns=['hadm_id', 'first_icu_stay'])

## Renaming columns and adjusting data types

In [16]:
# Give the demographic columns their final names: gender -> sex, admission_age -> age.
data_test = data_test.rename(columns=dict(gender='sex', admission_age='age'))


In [17]:
# Keep only the stays whose stay_id is listed in data_test8.
# NOTE(review): an inner merge repeats rows if data_test8 lists a stay_id more
# than once — confirm that stay_id is unique there.
data_test = data_test.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_id            7511 non-null   int64  
 1   stay_id               7511 non-null   int64  
 2   sex                   7511 non-null   int64  
 3   race                  7511 non-null   int64  
 4   age                   7511 non-null   int64  
 5   los_icu               7511 non-null   float64
 6   hospital_expire_flag  7511 non-null   int64  
dtypes: float64(1), int64(6)
memory usage: 410.9 KB
None


## Reviewing vital signs data from the MIMIC-IV dataset

In [21]:
# Preview the first rows of the vital-signs table (vitalsigns.csv).
print(data_test1.head())

   subject_id   stay_id  heart_rate_min  heart_rate_max  heart_rate_mean  \
0    10000690  37081114            81.0           137.0        99.961538   
1    10001884  37510196            63.0            73.0        67.458333   
2    10002155  33685454            69.0           108.0        90.269231   
3    10002348  32610785            49.0           103.0        69.750000   
4    10002428  38875437            82.0           107.0        92.500000   

   sbp_min  sbp_max    sbp_mean  dbp_min  dbp_max  ...   mbp_mean  \
0    110.0    179.0  137.200000     43.0    137.0  ...  83.880000   
1     93.0    138.0  119.416667     53.0    110.0  ...  84.500000   
2     87.0    127.0  103.791667     39.0     65.0  ...  62.125000   
3     96.0    137.0  119.888889     50.0    105.0  ...  87.444444   
4     93.0    162.0  117.980769     51.0    156.0  ...  89.560000   

   resp_rate_min  resp_rate_max  resp_rate_mean  temperature_min  \
0           19.0           33.0       24.280000            3

In [22]:
# Column dtypes and non-null counts of the vital-signs table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subject_id        10630 non-null  int64  
 1   stay_id           10630 non-null  int64  
 2   heart_rate_min    10601 non-null  float64
 3   heart_rate_max    10601 non-null  float64
 4   heart_rate_mean   10601 non-null  float64
 5   sbp_min           10590 non-null  float64
 6   sbp_max           10590 non-null  float64
 7   sbp_mean          10590 non-null  float64
 8   dbp_min           10589 non-null  float64
 9   dbp_max           10589 non-null  float64
 10  dbp_mean          10589 non-null  float64
 11  mbp_min           10589 non-null  float64
 12  mbp_max           10589 non-null  float64
 13  mbp_mean          10589 non-null  float64
 14  resp_rate_min     10590 non-null  float64
 15  resp_rate_max     10590 non-null  float64
 16  resp_rate_mean    10590 non-null  float6

In [23]:
# Summary statistics of the vital-signs columns.
print(data_test1.describe())

         subject_id       stay_id  heart_rate_min  heart_rate_max  \
count  1.063000e+04  1.063000e+04    10601.000000    10601.000000   
mean   1.501466e+07  3.491269e+07       73.480804      104.264692   
std    2.873762e+06  2.885487e+06       15.065475       20.145863   
min    1.000069e+07  3.000481e+07        8.000000       47.000000   
25%    1.257655e+07  3.240168e+07       63.000000       90.000000   
50%    1.501426e+07  3.485589e+07       73.000000      103.000000   
75%    1.752694e+07  3.740836e+07       83.000000      117.000000   
max    1.999984e+07  3.999923e+07      148.000000      227.000000   

       heart_rate_mean       sbp_min       sbp_max      sbp_mean  \
count     10601.000000  10590.000000  10590.000000  10590.000000   
mean         86.722542     96.497293    147.834136    121.299475   
std          15.762451     17.354643     23.019235     17.822366   
min          37.652174      9.000000     48.000000     39.166667   
25%          75.520000     85.000000  

In [24]:
# Keep only the vital-sign rows whose stay_id is listed in data_test8.
data_test1 = data_test1.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subject_id        7511 non-null   int64  
 1   stay_id           7511 non-null   int64  
 2   heart_rate_min    7483 non-null   float64
 3   heart_rate_max    7483 non-null   float64
 4   heart_rate_mean   7483 non-null   float64
 5   sbp_min           7473 non-null   float64
 6   sbp_max           7473 non-null   float64
 7   sbp_mean          7473 non-null   float64
 8   dbp_min           7472 non-null   float64
 9   dbp_max           7472 non-null   float64
 10  dbp_mean          7472 non-null   float64
 11  mbp_min           7472 non-null   float64
 12  mbp_max           7472 non-null   float64
 13  mbp_mean          7472 non-null   float64
 14  resp_rate_min     7477 non-null   float64
 15  resp_rate_max     7477 non-null   float64
 16  resp_rate_mean    7477 non-null   float64


## Removing columns with less than 80% non-null values in a DataFrame

In [25]:
# Calculate the threshold for non-null values (80% of the row count)
th = 0.80 * len(data_test1)
# Keep only the columns that have at least `th` non-null values
data_test1 = data_test1.loc[:, data_test1.notnull().sum() >= th]
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subject_id        7511 non-null   int64  
 1   stay_id           7511 non-null   int64  
 2   heart_rate_min    7483 non-null   float64
 3   heart_rate_max    7483 non-null   float64
 4   heart_rate_mean   7483 non-null   float64
 5   sbp_min           7473 non-null   float64
 6   sbp_max           7473 non-null   float64
 7   sbp_mean          7473 non-null   float64
 8   dbp_min           7472 non-null   float64
 9   dbp_max           7472 non-null   float64
 10  dbp_mean          7472 non-null   float64
 11  mbp_min           7472 non-null   float64
 12  mbp_max           7472 non-null   float64
 13  mbp_mean          7472 non-null   float64
 14  resp_rate_min     7477 non-null   float64
 15  resp_rate_max     7477 non-null   float64
 16  resp_rate_mean    7477 non-null   float64


## Reviewing laboratory data from the MIMIC-IV dataset

In [26]:
# Preview the first rows of the laboratory table (lab.csv).
print(data_test2.head())

   subject_id   stay_id  lactate_min  lactate_max  hematocrit_min  \
0    17244693  30004811          3.0          3.0            23.2   
1    15850686  30005000          1.0          2.0            26.7   
2    17445720  30005199          NaN          NaN            25.5   
3    14569364  30005707          NaN          NaN            38.8   
4    15524760  30006565          NaN          NaN            36.7   

   hematocrit_max  hemoglobin_min  hemoglobin_max  platelets_min  \
0            28.8             8.8             9.3          104.0   
1            29.5             8.9            10.1           76.0   
2            25.5             8.4             8.4          546.0   
3            38.8            13.1            13.1          368.0   
4            36.7            12.0            12.0          236.0   

   platelets_max  ...  creatinine_min  creatinine_max  glucose_min  \
0          176.0  ...             0.6             0.7         95.0   
1           91.0  ...             2.

In [27]:
# Column dtypes and non-null counts of the laboratory table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject_id       10630 non-null  int64  
 1   stay_id          10630 non-null  int64  
 2   lactate_min      2597 non-null   float64
 3   lactate_max      2597 non-null   float64
 4   hematocrit_min   10232 non-null  float64
 5   hematocrit_max   10232 non-null  float64
 6   hemoglobin_min   10223 non-null  float64
 7   hemoglobin_max   10223 non-null  float64
 8   platelets_min    10222 non-null  float64
 9   platelets_max    10222 non-null  float64
 10  wbc_min          10226 non-null  float64
 11  wbc_max          10226 non-null  float64
 12  albumin_min      1880 non-null   float64
 13  albumin_max      1880 non-null   float64
 14  aniongap_min     10289 non-null  float64
 15  aniongap_max     10289 non-null  float64
 16  bicarbonate_min  10293 non-null  float64
 17  bicarbonate_

In [28]:
# Summary statistics of the laboratory columns.
print(data_test2.describe())

         subject_id       stay_id  lactate_min  lactate_max  hematocrit_min  \
count  1.063000e+04  1.063000e+04  2597.000000  2597.000000    10232.000000   
mean   1.501466e+07  3.491269e+07     1.814405     2.370905       29.548466   
std    2.873762e+06  2.885487e+06     1.589061     2.485989        5.644543   
min    1.000069e+07  3.000481e+07     0.100000     0.200000        9.300000   
25%    1.257655e+07  3.240168e+07     1.000000     1.100000       25.400000   
50%    1.501426e+07  3.485589e+07     1.400000     1.500000       28.900000   
75%    1.752694e+07  3.740836e+07     2.000000     2.600000       33.000000   
max    1.999984e+07  3.999923e+07    18.300000    28.000000       61.400000   

       hematocrit_max  hemoglobin_min  hemoglobin_max  platelets_min  \
count    10232.000000    10223.000000    10223.000000   10222.000000   
mean        30.363937        9.649633        9.870107     219.523283   
std          5.363827        1.892006        1.823733     142.370258   


In [29]:
# Keep only the laboratory rows whose stay_id is listed in data_test8.
data_test2 = data_test2.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject_id       7511 non-null   int64  
 1   stay_id          7511 non-null   int64  
 2   lactate_min      1510 non-null   float64
 3   lactate_max      1510 non-null   float64
 4   hematocrit_min   7181 non-null   float64
 5   hematocrit_max   7181 non-null   float64
 6   hemoglobin_min   7176 non-null   float64
 7   hemoglobin_max   7176 non-null   float64
 8   platelets_min    7176 non-null   float64
 9   platelets_max    7176 non-null   float64
 10  wbc_min          7178 non-null   float64
 11  wbc_max          7178 non-null   float64
 12  albumin_min      1227 non-null   float64
 13  albumin_max      1227 non-null   float64
 14  aniongap_min     7230 non-null   float64
 15  aniongap_max     7230 non-null   float64
 16  bicarbonate_min  7233 non-null   float64
 17  bicarbonate_ma

## Removing columns with less than 80% non-null values in a DataFrame

In [30]:
# Calculate the threshold for non-null values (80% of the row count)
th = 0.80 * len(data_test2)
# Keep only the columns that have at least `th` non-null values
data_test2 = data_test2.loc[:, data_test2.notnull().sum() >= th]
# Use the singular 'platelet_*' spelling for the platelet columns
data_test2 = data_test2.rename(columns={
    'platelets_max': 'platelet_max',
    'platelets_min': 'platelet_min'
})
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test2.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject_id       7511 non-null   int64  
 1   stay_id          7511 non-null   int64  
 2   hematocrit_min   7181 non-null   float64
 3   hematocrit_max   7181 non-null   float64
 4   hemoglobin_min   7176 non-null   float64
 5   hemoglobin_max   7176 non-null   float64
 6   platelet_min     7176 non-null   float64
 7   platelet_max     7176 non-null   float64
 8   wbc_min          7178 non-null   float64
 9   wbc_max          7178 non-null   float64
 10  aniongap_min     7230 non-null   float64
 11  aniongap_max     7230 non-null   float64
 12  bicarbonate_min  7233 non-null   float64
 13  bicarbonate_max  7233 non-null   float64
 14  bun_min          7229 non-null   float64
 15  bun_max          7229 non-null   float64
 16  calcium_min      7076 non-null   float64
 17  calcium_max   

## Reviewing vasopressors data from the MIMIC-IV dataset

In [31]:
# Preview the first rows of the vasopressor table (vasopresores.csv).
print(data_test3.head())

    stay_id  dopamine  epinephrine  norepinephrine  phenylephrine  \
0  35621403         0            0               0              1   
1  34746208         0            0               0              0   
2  34898026         0            0              11              0   
3  30550789         0            0               0              0   
4  33240155         0            0               0              0   

   vasopressin  dobutamine  milrinone  
0            0           0          0  
1            0           0          0  
2            0           0          0  
3            0           0          0  
4            0           0          0  


In [32]:
# Column dtypes and non-null counts of the vasopressor table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   stay_id         10630 non-null  int64
 1   dopamine        10630 non-null  int64
 2   epinephrine     10630 non-null  int64
 3   norepinephrine  10630 non-null  int64
 4   phenylephrine   10630 non-null  int64
 5   vasopressin     10630 non-null  int64
 6   dobutamine      10630 non-null  int64
 7   milrinone       10630 non-null  int64
dtypes: int64(8)
memory usage: 664.5 KB
None


In [33]:
# Summary statistics of the vasopressor columns.
print(data_test3.describe())

            stay_id      dopamine   epinephrine  norepinephrine  \
count  1.063000e+04  10630.000000  10630.000000    10630.000000   
mean   3.491269e+07      0.069520      0.120696        1.336500   
std    2.885487e+06      1.038067      1.890804        4.478436   
min    3.000481e+07      0.000000      0.000000        0.000000   
25%    3.240168e+07      0.000000      0.000000        0.000000   
50%    3.485589e+07      0.000000      0.000000        0.000000   
75%    3.740836e+07      0.000000      0.000000        0.000000   
max    3.999923e+07     48.000000     66.000000       69.000000   

       phenylephrine   vasopressin    dobutamine     milrinone  
count   10630.000000  10630.000000  10630.000000  10630.000000  
mean        0.429539      0.480621      0.036689      0.038758  
std         2.562287      3.096611      0.681716      0.953527  
min         0.000000      0.000000      0.000000      0.000000  
25%         0.000000      0.000000      0.000000      0.000000  
50%   

In [34]:
# Keep only the vasopressor rows whose stay_id is listed in data_test8.
data_test3 = data_test3.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   stay_id         7511 non-null   int64
 1   dopamine        7511 non-null   int64
 2   epinephrine     7511 non-null   int64
 3   norepinephrine  7511 non-null   int64
 4   phenylephrine   7511 non-null   int64
 5   vasopressin     7511 non-null   int64
 6   dobutamine      7511 non-null   int64
 7   milrinone       7511 non-null   int64
dtypes: int64(8)
memory usage: 469.6 KB
None


## Reviewing antibiotics data from the MIMIC-IV dataset

In [35]:
# Preview the first rows of the antibiotics table (antibioticos.csv).
print(data_test4.head())

    stay_id  Grupo Cefalosporinas  Grupo Penicilinas  Macrolides  Meropenem  \
0  30004811                     0                  0           0          0   
1  30005000                     0                  0           0          0   
2  30005199                     0                  0           0          0   
3  30005707                     0                  2           0          0   
4  30006565                     1                  0           0          0   

   Metronidazole  Quinolonas  Vancomycin  other  
0              0           0           1      0  
1              1           0           1      1  
2              0           0           0      1  
3              1           0           0      0  
4              1           0           1      0  


In [36]:
# Column dtypes and non-null counts of the antibiotics table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   stay_id               10630 non-null  int64
 1   Grupo Cefalosporinas  10630 non-null  int64
 2   Grupo Penicilinas     10630 non-null  int64
 3   Macrolides            10630 non-null  int64
 4   Meropenem             10630 non-null  int64
 5   Metronidazole         10630 non-null  int64
 6   Quinolonas            10630 non-null  int64
 7   Vancomycin            10630 non-null  int64
 8   other                 10630 non-null  int64
dtypes: int64(9)
memory usage: 747.6 KB
None


In [37]:
# Summary statistics of the antibiotic columns.
print(data_test4.describe())

            stay_id  Grupo Cefalosporinas  Grupo Penicilinas    Macrolides  \
count  1.063000e+04          10630.000000       10630.000000  10630.000000   
mean   3.491269e+07              0.751082           0.434431      0.147037   
std    2.885487e+06              0.923418           0.884197      0.447779   
min    3.000481e+07              0.000000           0.000000      0.000000   
25%    3.240168e+07              0.000000           0.000000      0.000000   
50%    3.485589e+07              0.000000           0.000000      0.000000   
75%    3.740836e+07              1.000000           0.000000      0.000000   
max    3.999923e+07              5.000000           6.000000      5.000000   

          Meropenem  Metronidazole    Quinolonas    Vancomycin         other  
count  10630.000000   10630.000000  10630.000000  10630.000000  10630.000000  
mean       0.071966       0.172531      0.171778      0.742992      0.359925  
std        0.400624       0.436760      0.490414      0.9449

## Renaming columns and adjusting data types

In [38]:
# Final names of the seven antibiotic columns kept in the table.
antibiotic_columns = [
    'Cefalosporine', 'Penicillin', 'Macrolide', 'Quinolone',
    'Meropenem', 'Metronidazole', 'Vancomycin',
]

# Drop the catch-all 'other' column, rename the remaining group columns,
# and store every antibiotic column as float64.
data_test4 = (
    data_test4
    .drop(columns=['other'])
    .rename(columns={
        'Grupo Cefalosporinas': 'Cefalosporine',
        'Grupo Penicilinas': 'Penicillin',
        'Macrolides': 'Macrolide',
        'Quinolonas': 'Quinolone',
    })
    .astype({column: 'float64' for column in antibiotic_columns})
)



In [39]:
# Confirm the renamed antibiotic columns and their float64 dtype.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   stay_id        10630 non-null  int64  
 1   Cefalosporine  10630 non-null  float64
 2   Penicillin     10630 non-null  float64
 3   Macrolide      10630 non-null  float64
 4   Meropenem      10630 non-null  float64
 5   Metronidazole  10630 non-null  float64
 6   Quinolone      10630 non-null  float64
 7   Vancomycin     10630 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 664.5 KB
None


In [40]:
# Keep only the antibiotic rows whose stay_id is listed in data_test8.
data_test4 = data_test4.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   stay_id        7511 non-null   int64  
 1   Cefalosporine  7511 non-null   float64
 2   Penicillin     7511 non-null   float64
 3   Macrolide      7511 non-null   float64
 4   Meropenem      7511 non-null   float64
 5   Metronidazole  7511 non-null   float64
 6   Quinolone      7511 non-null   float64
 7   Vancomycin     7511 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 469.6 KB
None


## Reviewing Glasgow Coma Scale (GCS) data from the MIMIC-IV dataset

In [41]:
# Preview the first rows of the Glasgow Coma Scale table (gcs.csv).
print(data_test5.head())

   subject_id   stay_id  gcs_min  gcs_motor  gcs_verbal  gcs_eyes
0    17244693  30004811     15.0        1.0         0.0       1.0
1    15850686  30005000     14.0        6.0         4.0       4.0
2    17445720  30005199     15.0        6.0         5.0       4.0
3    14569364  30005707     15.0        6.0         5.0       4.0
4    15524760  30006565     15.0        6.0         0.0       4.0


In [42]:
# Column dtypes and non-null counts of the GCS table.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10630 entries, 0 to 10629
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   subject_id  10630 non-null  int64  
 1   stay_id     10630 non-null  int64  
 2   gcs_min     10597 non-null  float64
 3   gcs_motor   10552 non-null  float64
 4   gcs_verbal  10560 non-null  float64
 5   gcs_eyes    10581 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 498.4 KB
None


In [43]:
# Summary statistics of the GCS columns.
print(data_test5.describe())

         subject_id       stay_id       gcs_min     gcs_motor    gcs_verbal  \
count  1.063000e+04  1.063000e+04  10597.000000  10552.000000  10560.000000   
mean   1.501466e+07  3.491269e+07     13.421629      5.394996      3.208902   
std    2.873762e+06  2.885487e+06      2.500211      1.318620      1.977024   
min    1.000069e+07  3.000481e+07      3.000000      1.000000      0.000000   
25%    1.257655e+07  3.240168e+07     13.000000      5.000000      1.000000   
50%    1.501426e+07  3.485589e+07     14.000000      6.000000      4.000000   
75%    1.752694e+07  3.740836e+07     15.000000      6.000000      5.000000   
max    1.999984e+07  3.999923e+07     15.000000      6.000000      5.000000   

           gcs_eyes  
count  10581.000000  
mean       3.334656  
std        0.929153  
min        1.000000  
25%        3.000000  
50%        4.000000  
75%        4.000000  
max        4.000000  


In [44]:
# Keep only the GCS rows whose stay_id is listed in data_test8.
data_test5 = data_test5.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   subject_id  7511 non-null   int64  
 1   stay_id     7511 non-null   int64  
 2   gcs_min     7479 non-null   float64
 3   gcs_motor   7452 non-null   float64
 4   gcs_verbal  7454 non-null   float64
 5   gcs_eyes    7466 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 352.2 KB
None


## Reviewing urine output data from the MIMIC-IV dataset

In [45]:
# Keep only the urine-output rows whose stay_id is listed in data_test8.
data_test6 = data_test6.merge(data_test8[['stay_id']], on=['stay_id'], how='inner')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_test6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   subject_id   7511 non-null   int64  
 1   stay_id      7511 non-null   int64  
 2   urineoutput  6995 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 176.2 KB
None


## Reviewing SOFA data from the MIMIC-IV dataset

In [46]:
# Keep only the SOFA rows whose stay_id also appears in data_test8 (inner join
# on stay_id), then discard the hadm_id column.
# NOTE(review): hadm_id is presumably removed because stay_id is the only key
# used in the later merge — confirm.
data_test7 = (
    data_test7
    .merge(data_test8[['stay_id']], on='stay_id', how='inner')
    .drop(columns=['hadm_id'])
)
# DataFrame.info() writes its report to stdout itself and returns None, so the
# former print(...) wrapper only appended a stray "None" line to the output.
data_test7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   subject_id  7511 non-null   int64
 1   stay_id     7511 non-null   int64
 2   sofa        7511 non-null   int64
dtypes: int64(3)
memory usage: 176.2 KB
None


In [47]:
# Seed the combined table with the demographic/outcome columns of data_test.
# 'subject_id' is deliberately left out: 'stay_id' is the only join key used below.
merged_df = data_test.loc[:, ['stay_id', 'hospital_expire_flag', 'age', 'sex', 'race', 'los_icu']]

# Fold data_test1 … data_test7 into merged_df one at a time with an inner join
# on 'stay_id'. Each table first loses its 'subject_id' column when it has one
# (errors='ignore' makes the drop a no-op otherwise), so the identifier is not
# carried into the merged result.
for df in (data_test1, data_test2, data_test3, data_test4, data_test5, data_test6, data_test7):
    df = df.drop(columns=['subject_id'], errors='ignore')
    merged_df = merged_df.merge(df, on='stay_id', how='inner')




In [48]:
# Summarise the merged table. DataFrame.info() prints the report itself and
# returns None, so it is called directly instead of being wrapped in print(),
# which would add a stray "None" line to the output.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 73 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   stay_id               7511 non-null   int64  
 1   hospital_expire_flag  7511 non-null   int64  
 2   age                   7511 non-null   int64  
 3   sex                   7511 non-null   int64  
 4   race                  7511 non-null   int64  
 5   los_icu               7511 non-null   float64
 6   heart_rate_min        7483 non-null   float64
 7   heart_rate_max        7483 non-null   float64
 8   heart_rate_mean       7483 non-null   float64
 9   sbp_min               7473 non-null   float64
 10  sbp_max               7473 non-null   float64
 11  sbp_mean              7473 non-null   float64
 12  dbp_min               7472 non-null   float64
 13  dbp_max               7472 non-null   float64
 14  dbp_mean              7472 non-null   float64
 15  mbp_min              

In [49]:
# Check for columns that contain only 0 values
columns_with_zeros = merged_df.columns[(merged_df == 0).all()]
print(columns_with_zeros)

# Drop the columns that contain only zeros
merged_df = merged_df.drop(columns=columns_with_zeros)
print(merged_df.info())

Index([], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 73 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   stay_id               7511 non-null   int64  
 1   hospital_expire_flag  7511 non-null   int64  
 2   age                   7511 non-null   int64  
 3   sex                   7511 non-null   int64  
 4   race                  7511 non-null   int64  
 5   los_icu               7511 non-null   float64
 6   heart_rate_min        7483 non-null   float64
 7   heart_rate_max        7483 non-null   float64
 8   heart_rate_mean       7483 non-null   float64
 9   sbp_min               7473 non-null   float64
 10  sbp_max               7473 non-null   float64
 11  sbp_mean              7473 non-null   float64
 12  dbp_min               7472 non-null   float64
 13  dbp_max               7472 non-null   float64
 14  dbp_mean              7472 non-null   float64


In [50]:
# Save the merged table while it still contains 'stay_id' and
# 'hospital_expire_flag'; the row index is not written.
# NOTE(review): the inputs were read from the "48h_24h" directory — confirm
# that the "24_48" suffix of this output file names the intended time window.
merged_df.to_csv(path_or_buf='datapre/mimic/conflag_mimic_24_48.csv', index=False)

In [51]:
# Remove the stay identifier and the hospital_expire_flag column so that only
# the remaining feature columns are kept in merged_df.
# Note: this cell rebinds merged_df, so running it a second time raises a
# KeyError because the two columns are already gone.
merged_df = merged_df.drop(columns=['stay_id', 'hospital_expire_flag'])
# DataFrame.info() prints its report itself and returns None; calling it
# directly avoids the stray "None" line that print(...) appended.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 0 to 7510
Data columns (total 71 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               7511 non-null   int64  
 1   sex               7511 non-null   int64  
 2   race              7511 non-null   int64  
 3   los_icu           7511 non-null   float64
 4   heart_rate_min    7483 non-null   float64
 5   heart_rate_max    7483 non-null   float64
 6   heart_rate_mean   7483 non-null   float64
 7   sbp_min           7473 non-null   float64
 8   sbp_max           7473 non-null   float64
 9   sbp_mean          7473 non-null   float64
 10  dbp_min           7472 non-null   float64
 11  dbp_max           7472 non-null   float64
 12  dbp_mean          7472 non-null   float64
 13  mbp_min           7472 non-null   float64
 14  mbp_max           7472 non-null   float64
 15  mbp_mean          7472 non-null   float64
 16  resp_rate_min     7477 non-null   float64


In [52]:
# Save the table as it stands after 'stay_id' and 'hospital_expire_flag' were
# dropped; the row index is not written.
merged_df.to_csv(path_or_buf='datapre/mimic/sinflag_mimic_24_48.csv', index=False)