# Data Processing

In [15]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from src.tests import validate_dataset, assert_group_identical
from src.data_processing import remove_outliers, any_one

## Import initial_cohort

In [16]:
# Load the initial cohort; the first CSV column holds the row index.
cohort_csv = "../data/initial_cohort_final.csv"
df = pd.read_csv(cohort_csv, index_col=0)

# Row count of the raw cohort.
df.shape[0]

22071

## One Hot Encode

- admission_type
- gender
- ventilation_status

In [17]:
# One-hot encode the categorical columns listed above.
# The dummy dtype is pinned explicitly: pandas < 2.0 defaulted to uint8,
# pandas >= 2.0 defaults to bool. Pinning uint8 keeps the 0/1 integer
# indicators this notebook's recorded outputs show, whatever pandas version
# re-runs it.
df = pd.get_dummies(
    data=df,
    columns=["admission_type", "gender", "ventilation_status"],
    dtype=np.uint8,
)

## Handle ventilation_status

Propagate the one-hot encoded ventilation_status features across all rows that 
share a subject_id and hadm_id, because each group contains many rows and each 
row carries only one ventilation_status type.  

**These rows will need to be merged/flattened later.**

In [18]:
# Apply `any_one` (from src.data_processing) to every (subject_id, hadm_id)
# group for the columns whose names start with "ventilation_status".
# NOTE(review): `any_one` is not visible here; it is presumed to return each
# group with its original rows/index (transform-like) -- confirm.
#
# group_keys=False keeps the original row index. Since pandas 2.0 the default
# (group_keys=True) also prepends the group keys to the index of a
# transform-like apply, which would make subject_id/hadm_id both index levels
# and columns and break the later groupby calls on those names.
df = (
    df.groupby(["subject_id", "hadm_id"], group_keys=False)
    .apply(any_one, startswith="ventilation_status")
)

## Binary encode vasoactive drugs

In [19]:
# Vasoactive drug columns to convert into 0/1 indicators.
va_drugs = [
    "dopamine",
    "epinephrine",
    "norepinephrine",
    "phenylephrine",
    "vasopressin",
    "dobutamine",
    "milrinone",
]

# Any recorded (non-NA) value becomes 1; a missing value becomes 0.
for drug in va_drugs:
    df[drug] = df[drug].notna().astype(int)

df[va_drugs].head(10)

Unnamed: 0,dopamine,epinephrine,norepinephrine,phenylephrine,vasopressin,dobutamine,milrinone
0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0
2,0,0,1,1,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0
5,1,0,0,1,0,0,0
6,1,0,0,1,0,0,0
7,0,1,1,1,1,0,0
8,0,1,1,1,1,0,0
9,0,0,0,1,0,0,0


Similar to ventilation_status, we will need to propagate the vasoactive drug status across rows within each subject admission group.

In [20]:
# Apply `any_one` (from src.data_processing) per (subject_id, hadm_id) group,
# once for each vasoactive drug column prefix.
# NOTE(review): `any_one` is presumed to return each group with its original
# rows/index (transform-like) -- confirm.
#
# group_keys=False keeps the original row index on every pass. Since pandas
# 2.0 the default (group_keys=True) prepends the group keys to the index of a
# transform-like apply, so the second iteration would fail because
# subject_id/hadm_id would be both index levels and column labels.
admission_keys = ["subject_id", "hadm_id"]

for drug in va_drugs:
    df = (
        df.groupby(admission_keys, group_keys=False)
        .apply(any_one, startswith=drug)
    )

## Manage NA values in specific features

### suspected_infection

If suspected_infection is NA, assume 0

In [21]:
# A missing suspected_infection flag is treated as "no suspected infection" (0).
df["suspected_infection"] = df["suspected_infection"].fillna(0)

## Remove instances with missing data

In [22]:
# Discard every row that still has at least one missing value
# (dropna defaults: axis=0, how="any").
df = df.dropna()

# Rows remaining after the drop.
df.shape[0]

3573

## Merge/flatten

Flatten each group of subject_id and hadm_id to one row, first checking that each group has identical rows.

In [23]:
# NOTE: the identical-rows check is currently disabled, so groups are reduced
# to a single row without verifying that their rows match.
# TODO: re-enable once the row selection rules are decided:
# df.groupby(["subject_id", "hadm_id"]).apply(assert_group_identical)
admission_keys = ["subject_id", "hadm_id"]

try:
    # Collapse each (subject_id, hadm_id) group to one row; .first() takes the
    # first non-null value of every column within the group.
    per_admission = df.groupby(admission_keys).first()
    df = per_admission.reset_index()
except AssertionError as msg:
    print(msg)
else:
    print(len(df))

1773


## Remove outliers

In [24]:
# df = remove_outliers(df)
# len(df)

## Glimpse and validate

In [25]:
# Preview the first ten rows of the flattened frame (one row per
# subject_id/hadm_id group after the merge step above).
df.head(10)

Unnamed: 0,subject_id,hadm_id,stay_id,n_stays,sum_los,admission_age,suspected_infection,weight,height,heart_rate_min,...,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,gender_F,gender_M,ventilation_status_HFNC,ventilation_status_InvasiveVent,ventilation_status_NonInvasiveVent,ventilation_status_None,ventilation_status_SupplementalOxygen,ventilation_status_Tracheostomy
0,10005606,29646384,38740124,1,6.58,38,0.0,84.1,178.0,83.0,...,0,1,0,1,0,1,0,0,1,0
1,10017531,21095812,32136798,1,39.83,64,1.0,127.1,163.0,100.0,...,0,1,0,1,0,1,0,0,0,1
2,10021487,28998349,38197705,1,15.67,43,1.0,143.0,185.0,99.0,...,0,0,0,1,0,1,0,0,1,0
3,10032381,20176432,34622731,1,42.67,64,1.0,77.75,155.0,67.0,...,0,0,1,0,0,1,0,0,1,1
4,10034317,20827960,36228864,2,4.46,72,0.0,89.0,183.0,63.0,...,1,0,0,1,0,0,0,0,1,0
5,10036086,28728587,38809220,1,18.58,58,1.0,113.0,173.0,101.0,...,0,0,0,1,0,1,0,0,1,0
6,10038999,27189241,39711498,1,8.83,45,1.0,98.9,178.0,29.0,...,0,0,0,1,0,1,0,0,1,0
7,10048001,28426278,31975834,1,4.63,64,1.0,95.1,180.0,70.0,...,0,0,0,1,0,0,0,0,1,0
8,10048244,22354258,33135150,1,3.71,59,1.0,91.233333,175.0,72.0,...,0,0,0,1,0,0,0,0,1,0
9,10054716,25339060,33668354,1,6.04,61,1.0,107.0,178.0,83.0,...,0,1,0,1,0,1,0,0,1,0


In [26]:
# Run the project's dataset checks (validate_dataset from src.tests) on the
# processed frame; the call's return value is shown as the cell output.
validate_dataset(df)

INFO:root:Dataset is valid


True

## Save processed dataset

In [27]:
# Persist the full processed cohort, identifier columns included.
df.to_csv("../data/cohort_processed.csv")

# Strip the identifier columns and name the row index "ID" so the label and
# feature files share the same key column.
id_columns = ["subject_id", "hadm_id", "stay_id"]
df = df.drop(columns=id_columns).rename_axis("ID")

# sum_los is written out as the label set; all remaining columns form the
# feature set.
df_labels = df["sum_los"]
df_features = df.drop(columns=["sum_los"])

df_labels.to_csv("../data/cohort_labels.csv")
df_features.to_csv("../data/cohort_features.csv")

## Min Max Scale

In [28]:
# Rescale every feature column to the [0, 1] range.
# The scaled values go into a fresh all-float DataFrame that reuses the
# original index and column labels. Writing them back with
# `df_features_scaled[:] = ...` would assign floats into integer/dummy
# columns, which pandas >= 2.1 deprecates (incompatible-dtype setitem) and
# plans to turn into an error.
scaled_values = MinMaxScaler(feature_range=(0, 1)).fit_transform(df_features)
df_features_scaled = pd.DataFrame(
    scaled_values,
    index=df_features.index,
    columns=df_features.columns,
)
df_features_scaled.to_csv("../data/cohort_features_scaled.csv")
df_features_scaled

Unnamed: 0_level_0,n_stays,admission_age,suspected_infection,weight,height,heart_rate_min,heart_rate_max,temperature_min,temperature_max,mbp_min,...,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,gender_F,gender_M,ventilation_status_HFNC,ventilation_status_InvasiveVent,ventilation_status_NonInvasiveVent,ventilation_status_None,ventilation_status_SupplementalOxygen,ventilation_status_Tracheostomy
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.277778,0.0,0.362486,0.671053,0.426357,0.403846,0.818868,0.551402,0.610,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.000000,0.638889,1.0,0.550055,0.473684,0.558140,0.532051,0.794340,0.598131,0.640,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.000000,0.347222,1.0,0.619411,0.763158,0.550388,0.448718,0.825786,0.644860,0.510,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.000000,0.638889,1.0,0.334787,0.368421,0.302326,0.442308,0.780503,0.634579,0.430,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0.166667,0.750000,0.0,0.383860,0.736842,0.271318,0.410256,0.808805,0.551402,0.410,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,0.333333,0.555556,1.0,0.296619,0.500000,0.465116,0.474359,0.829560,0.644860,0.900,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1769,0.000000,0.555556,1.0,0.388659,0.565789,0.302326,0.141026,0.805031,0.523364,0.890,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1770,0.166667,0.652778,0.0,0.301200,0.671053,0.302326,0.262821,0.798113,0.675701,0.580,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1771,0.166667,0.722222,1.0,0.266085,0.605263,0.325581,0.397436,0.735849,0.663551,0.445,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
