In [None]:

# BigQuery table id handed to read_gbq; kept in a variable so the source is easy to spot and change.
raw_features_table = "ml-pipeline-459822.data.raw_features"
df_prep = bpd.read_gbq(raw_features_table)

Moving on to the part of the project more related to Machine Learning, we will begin by preparing the data: 

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df_prep.shape

(406235959, 28)

**Null Values**

In [None]:

# Summarise missing data per column: absolute null count, share of all rows, and column dtype.
total_rows = len(df_prep)
missing_values = df_prep.isnull().sum()
missing_percentage = (missing_values / total_rows) * 100

summary_columns = {
    'Valores Nulos': missing_values,
    'Porcentagem (%)': missing_percentage,
    'Tipo de Dado': df_prep.dtypes,
}
missing_data = pd.DataFrame(summary_columns)

# Keep only the columns that actually contain nulls, most affected first.
has_nulls = missing_data['Valores Nulos'] > 0
missing_data = missing_data[has_nulls]
print(missing_data.sort_values(by='Porcentagem (%)', ascending=False))

                Valores Nulos  Porcentagem (%)                    Tipo de Dado
DEATHTIME           322990383        79.508073  timestamp[us, tz=UTC][pyarrow]
VALUEUOM            263805454        64.938972                 string[pyarrow]
ERROR               244699816        60.235883                           Int64
CATEGORY            231786777        57.057179                 string[pyarrow]
VALUENUM            223935703        55.124540                         Float64
LANGUAGE            147914602        36.411007                 string[pyarrow]
MARITAL_STATUS       18661087         4.593657                 string[pyarrow]
RELIGION              2693777         0.663106                 string[pyarrow]
VALUE                 2526502         0.621930                 string[pyarrow]
ICUSTAY_ID             261508         0.064373                           Int64
INTIME                 261508         0.064373  timestamp[us, tz=UTC][pyarrow]
LOS                    261508         0.064373      

Approximate percentage of missing values for some columns:

DEATHTIME: ~80%

WARNING: 60%

ERROR: 60%

LANGUAGE: ~36%


Remove columns with a high percentage of missing values: 


In [None]:
# Drop the columns identified as having a high share of missing values.
# Fix: the list previously contained the misspelling 'LANGUGE', which made drop() raise
# "ValueError: Column name LANGUGE doesn't exist", so none of the columns were removed.
# NOTE(review): 'WARNING' does not appear in the null-value summary printed earlier in this
# notebook - confirm the column exists in df_prep and really is ~60% null.
high_null_columns = ['DEATHTIME', 'WARNING', 'ERROR', 'LANGUAGE']
df_prep = df_prep.drop(columns=high_null_columns)

ValueError: Column name LANGUGE doesn't exist

**AGE**

Age is one of the variables we consider important when predicting the length of stay of a patient in the Intensive Care Unit. 
However, according to the documentation, some computed age values are 300 or above; these are not real ages and need to be corrected (rather than used as-is) with the mapping below:

300 = 90

301 = 91

(...)

In [None]:
# Age at admission, taken as the difference between the admission year and the birth year.
age_years = df_prep['ADMITTIME'].dt.year - df_prep['DOB'].dt.year

# Values of 300 or more are shifted back into a real range: 300 -> 90, 301 -> 91, ...
# Anything below 300 is kept unchanged.
age_corrected = age_years.where(age_years < 300, age_years - 300 + 90)

# Store the corrected age and discard the two timestamp columns it was derived from.
df_prep = df_prep.assign(AGE=age_corrected).drop(columns=['ADMITTIME', 'DOB'])


**Repeated Values**:

In [None]:
# Flag rows that repeat an earlier row, then keep just those rows for inspection.
duplicate_mask = df_prep.duplicated()
duplicados = df_prep[duplicate_mask]

n_duplicados = len(duplicados)
print(f"Número de linhas duplicadas: {n_duplicados}")

Número de linhas duplicadas: 0


**Checking categorical columns**

In [None]:

# Collect the names of every column whose dtype compares equal to "string".
string_cols = [name for name, kind in df_prep.dtypes.items() if kind == "string"]

# Report the cardinality of each string column, one line per column.
for name in string_cols:
    print(f"{name}: {df_prep[name].nunique()} distinct values")

ADMISSION_TYPE: 4 valores distintos
GENDER: 2 valores distintos
ETHNICITY: 41 valores distintos
RELIGION: 20 valores distintos
LANGUAGE: 69 valores distintos
MARITAL_STATUS: 7 valores distintos
INSURANCE: 5 valores distintos
LABEL: 4559 valores distintos
CATEGORY: 56 valores distintos
VALUE: 268260 valores distintos
ICD9_CODE: 5 valores distintos


We will now remove some of the string columns (RELIGION, ETHNICITY, MARITAL_STATUS and INSURANCE), as they are unlikely to be relevant for the current classification task.

In [None]:
# 2. Remove unwanted columns
colunas_a_remover = [
    'RELIGION',
    'ETHNICITY',
    'MARITAL_STATUS',
    'INSURANCE',
]
df_prep = df_prep.drop(columns=colunas_a_remover)


**Gender**

Convert the gender column to:
    
M -> 1

F -> 0

In [None]:

# Numeric encoding for GENDER: F -> 0, M -> 1.
gender_codes = {"F": 0, "M": 1}
df_prep["GENDER"] = df_prep["GENDER"].map(gender_codes)


Since the 'LABEL' column contains many distinct values, and these are already grouped into categories in the 'CATEGORY' column, we only need to keep 'CATEGORY' as the relevant column.

In [None]:
# Preview the first 10 values of the CATEGORY column.
df_prep["CATEGORY"].head(10)

**Admission Type**

In [None]:
# Numeric encoding for the four admission types seen in the data.
admission_type_codes = {
    "EMERGENCY": 0,
    "ELECTIVE": 1,
    "URGENT": 2,
    "NEWBORN": 3,
}
df_prep["ADMISSION_TYPE"] = df_prep["ADMISSION_TYPE"].map(admission_type_codes)