In [3]:
import pandas as pd

# Charger les données
url = "https://physionet.org/files/eicu-crd-demo/2.0.1/"
patients_data = pd.read_csv(url + "patient.csv.gz")

In [10]:
patients_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gender                   2516 non-null   object 
 1   age                      2516 non-null   object 
 2   ethnicity                2481 non-null   object 
 3   apacheadmissiondx        2221 non-null   object 
 4   admissionheight          2451 non-null   float64
 5   hospitaladmitsource      1926 non-null   object 
 6   hospitaldischargestatus  2492 non-null   object 
 7   admissionweight          2322 non-null   float64
dtypes: float64(2), object(6)
memory usage: 157.6+ KB


In [7]:
# Colonnes à supprimer (celles qui ne sont pas utiles)
colonnes_inutiles = ['patientunitstayid', 'patienthealthsystemstayid', 'hospitalid', 'wardid',
                     'hospitaladmittime24', 'hospitaladmitoffset', 'hospitaldischargeyear',
                     'hospitaldischargetime24', 'hospitaldischargeoffset', 'hospitaldischargelocation',
                     'unittype', 'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
                     'dischargeweight', 'unitdischargetime24', 'unitdischargeoffset', 'unitdischargelocation',
                     'unitdischargestatus', 'uniquepid']

# Supprimer les colonnes inutiles
patients_data.drop(columns=colonnes_inutiles, inplace=True)

In [11]:
patients_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gender                   2516 non-null   object 
 1   age                      2516 non-null   object 
 2   ethnicity                2481 non-null   object 
 3   apacheadmissiondx        2221 non-null   object 
 4   admissionheight          2451 non-null   float64
 5   hospitaladmitsource      1926 non-null   object 
 6   hospitaldischargestatus  2492 non-null   object 
 7   admissionweight          2322 non-null   float64
dtypes: float64(2), object(6)
memory usage: 157.6+ KB


In [32]:
patients_data.head()

Unnamed: 0,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitsource,hospitaldischargestatus,admissionweight
2,Male,76,Caucasian,"Endarterectomy, carotid",167.0,Operating Room,Alive,77.5
3,Female,34,Caucasian,"Overdose, other toxin, poison or drug",172.7,Emergency Department,Alive,60.3
4,Male,61,Caucasian,"GI perforation/rupture, surgery for",177.8,Emergency Department,Alive,91.7
6,Female,55,Caucasian,"Endarterectomy, carotid",157.5,Operating Room,Alive,72.5
8,Male,28,Caucasian,"Overdose, other toxin, poison or drug",182.9,Emergency Department,Alive,91.8


In [12]:
# drop rows with missing values
patients_data.dropna(inplace=True)


In [35]:
# Creating Dummy Variables

dummies = []
cols = ['gender','ethnicity','hospitaladmitsource']
for col in cols:
    dummies.append(pd.get_dummies(patients_data[col]))
    
dummies_df = pd.concat(dummies, axis=1)
dummies_df.head()

Unnamed: 0,Female,Male,African American,Asian,Caucasian,Hispanic,Native American,Other/Unknown,Acute Care/Floor,Chest Pain Center,Direct Admit,Emergency Department,Floor,ICU,Operating Room,Other Hospital,Other ICU,PACU,Recovery Room,Step-Down Unit (SDU)
2,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
6,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
8,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [36]:
# extraire le Y et les x 

df = pd.concat([patients_data, dummies_df], axis=1)
df.drop(columns=cols, inplace=True)


In [37]:
df.head()

Unnamed: 0,age,apacheadmissiondx,admissionheight,hospitaldischargestatus,admissionweight,Female,Male,African American,Asian,Caucasian,...,Direct Admit,Emergency Department,Floor,ICU,Operating Room,Other Hospital,Other ICU,PACU,Recovery Room,Step-Down Unit (SDU)
2,76,"Endarterectomy, carotid",167.0,Alive,77.5,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False
3,34,"Overdose, other toxin, poison or drug",172.7,Alive,60.3,True,False,False,False,True,...,False,True,False,False,False,False,False,False,False,False
4,61,"GI perforation/rupture, surgery for",177.8,Alive,91.7,False,True,False,False,True,...,False,True,False,False,False,False,False,False,False,False
6,55,"Endarterectomy, carotid",157.5,Alive,72.5,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
8,28,"Overdose, other toxin, poison or drug",182.9,Alive,91.8,False,True,False,False,True,...,False,True,False,False,False,False,False,False,False,False


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1562 entries, 2 to 2519
Columns: 226 entries, age to Weaning from mechanical ventilation (transfer from other unit or hospital only)
dtypes: bool(222), float64(2), object(2)
memory usage: 399.7+ KB


In [38]:
# fill the missing values
df['admissionweight'] = df['admissionweight'].interpolate() # fill the missing values with the mean

df['age'] = df['age'].interpolate() # fill the missing values with the mean

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1562 entries, 2 to 2519
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      1562 non-null   object 
 1   apacheadmissiondx        1562 non-null   object 
 2   admissionheight          1562 non-null   float64
 3   hospitaldischargestatus  1562 non-null   object 
 4   admissionweight          1562 non-null   float64
 5   Female                   1562 non-null   bool   
 6   Male                     1562 non-null   bool   
 7   African American         1562 non-null   bool   
 8   Asian                    1562 non-null   bool   
 9   Caucasian                1562 non-null   bool   
 10  Hispanic                 1562 non-null   bool   
 11  Native American          1562 non-null   bool   
 12  Other/Unknown            1562 non-null   bool   
 13  Acute Care/Floor         1562 non-null   bool   
 14  Chest Pain Center        1562

  df['age'] = df['age'].interpolate() # fill the missing values with the mean


In [39]:
x = df.values 
y = df['hospitaldischargestatus'].values

In [40]:
import numpy as np

x = np.delete(x, 0, axis=1) # delete the target variable from the features

In [41]:
# Devide the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
