# Data Processing

In [11]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from src.tests import validate_dataset, assert_group_identical
from src.data_processing import remove_outliers, any_one

## Import initial_cohort

In [12]:
df = pd.read_csv("../data/initial_cohort_final.csv", index_col=0)
len(df)

145187

In [13]:
# Exclude irrelevant columns
exclude_cols = ["icu_intime", "starttime", "icu_vasopressin_timediff"]
df = df.drop(columns=exclude_cols)

## One Hot Encode

- admission_type
- gender
- ventilation_status

In [14]:
df = pd.get_dummies(data=df, columns=["admission_type", "gender", "ventilation_status"])

## Handle ventilation_status

Propagate one-hot coded ventilation_status features across rows grouped by 
subject_id and hadm_id, as there are many rows each with one ventilation_status 
type.  

**These rows will need to be merged/flattened later.**

In [15]:
df = df.groupby(["subject_id", "hadm_id"]) \
    .apply(any_one, startswith="ventilation_status")

## Transform binary encode vasoactive drugs

In [16]:
va_drugs = ["dopamine", "epinephrine", "norepinephrine", "phenylephrine",
            "vasopressin", "dobutamine", "milrinone"]

for drug in va_drugs:
    df[drug] = np.where(df[drug].notna(), 1, 0)

df[va_drugs].head(10)

Unnamed: 0,dopamine,epinephrine,norepinephrine,phenylephrine,vasopressin,dobutamine,milrinone
0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
5,0,0,0,1,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,1,0,0,0
8,0,0,1,0,0,0,0
9,0,0,1,0,0,0,0


Similar to ventilation_status, we will need to propagate the vasoactive drugs status across rows within subject admission groups.

In [17]:
for drug in va_drugs:
    df = df.groupby(["subject_id", "hadm_id"]) \
    .apply(any_one, startswith=drug)

## Manage NA values in specific features

### suspected_infection

If suspected_infection is NA, assume 0

In [18]:
df["suspected_infection"] = np.where(
    df["suspected_infection"].isna(), 0, df["suspected_infection"]
)

# Remove instances with missing data

In [19]:
df = df.dropna(axis=0, how="any")
len(df)

34640

## Merge/flatten

Flatten each group of subject_id and hadm_id to one row, first checking that each group has identical rows.

In [20]:
try:
    # Check all groups have identical rows
    df.groupby(["subject_id", "hadm_id"]).apply(assert_group_identical)
    
    # Flatten the groups to one row per group
    df = df.groupby(["subject_id", "hadm_id"]).first()
    
    print(len(df))
except AssertionError as msg:
    print(msg)

Group: [[10792661, 26360527], [10792661, 26360527], [10792661, 26360527]], col stay_id has >= 1 unique values, being [30644482 39710436]


## Glimpse and validate

In [None]:
df.head(10)

In [None]:
validate_dataset(df)