# Data Processing

In [11]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from src.tests import validate_dataset, assert_group_identical
from src.data_processing import remove_outliers, any_one

## Import initial_cohort

In [12]:
# Load the initial cohort; the first CSV column is used as the row index.
cohort_csv = "../data/initial_cohort_final.csv"
df = pd.read_csv(cohort_csv, index_col=0)
df.shape[0]  # number of rows loaded

145187

In [13]:
# Drop the columns that are treated as irrelevant for the processed dataset.
exclude_cols = [
    "icu_intime",
    "starttime",
    "icu_vasopressin_timediff",
]
df = df.drop(labels=exclude_cols, axis="columns")

## One Hot Encode

- admission_type
- gender
- ventilation_status

In [14]:
# One-hot encode the categorical features. Each listed column is replaced by
# one indicator column per category, named "<column>_<category>".
# dtype is pinned to uint8 (the pandas < 2.0 default) because pandas >= 2.0
# defaults to bool; pinning keeps the 0/1 integer indicators on every version.
df = pd.get_dummies(
    data=df,
    columns=["admission_type", "gender", "ventilation_status"],
    dtype=np.uint8,
)

## Handle ventilation_status

Propagate the one-hot encoded ventilation_status features across all rows that
share the same subject_id and hadm_id, because each such group spans many rows
and every row carries only a single ventilation_status type.  

**These rows will need to be merged/flattened later.**

In [15]:
# Apply any_one to every (subject_id, hadm_id) group, forwarding the
# "ventilation_status" prefix as its `startswith` argument.
df = (
    df.groupby(["subject_id", "hadm_id"])
    .apply(any_one, startswith="ventilation_status")
)

## Binary-encode vasoactive drugs

In [16]:
# Vasoactive drug columns to collapse into presence/absence indicators.
va_drugs = [
    "dopamine",
    "epinephrine",
    "norepinephrine",
    "phenylephrine",
    "vasopressin",
    "dobutamine",
    "milrinone",
]

# A non-missing value becomes 1 and a missing value becomes 0.
for drug in va_drugs:
    df[drug] = df[drug].notna().astype(int)

df[va_drugs].head(n=10)

Unnamed: 0,dopamine,epinephrine,norepinephrine,phenylephrine,vasopressin,dobutamine,milrinone
0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
5,0,0,0,1,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,1,0,0,0
8,0,0,1,0,0,0,0
9,0,0,1,0,0,0,0


## Manage NA values in specific features

### suspected_infection

If suspected_infection is NA, assume 0

In [17]:
# Missing suspected_infection values are replaced with 0; all other values
# are kept as they are.
df["suspected_infection"] = df["suspected_infection"].fillna(0)

## Remove instances with missing data

In [18]:
# Keep only the rows in which every column has a value.
df = df.dropna(axis="index", how="any")
df.shape[0]  # rows remaining

34640

## Glimpse and validate

In [19]:
# Show the first ten rows of the processed frame.
df.head(n=10)

Unnamed: 0,subject_id,hadm_id,stay_id,n_stays,sum_los,admission_age,suspected_infection,weight,height,heart_rate_min,...,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,gender_F,gender_M,ventilation_status_HFNC,ventilation_status_InvasiveVent,ventilation_status_NonInvasiveVent,ventilation_status_None,ventilation_status_SupplementalOxygen,ventilation_status_Tracheostomy
215,10005606,29646384,38740124,1,6.58,38,0.0,84.1,178.0,83.0,...,0,1,0,1,0,1,0,0,1,0
216,10005606,29646384,38740124,1,6.58,38,0.0,84.1,178.0,83.0,...,0,1,0,1,0,1,0,0,1,0
348,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
349,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
350,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
351,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
352,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
353,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
354,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
355,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1


In [20]:
# Run the project's dataset validation and show its return value as the
# cell output.
is_valid = validate_dataset(df)
is_valid

ERROR:root:Group: [[10005606, 29646384], [10005606, 29646384]], length is 2


False