# Preprocessing

Some preprocessing steps.

But not the ones mentioned by Patrick Rockenschaub's paper.

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

%load_ext autoreload
%autoreload 2

---
## Setting dtype of time and `float32`
The dtype of the 'time' column is automatically set to `Timedelta`.  
Hence, when loading DataFrame and converting it to numpy, the numpy array will not have a single dtype. Instead the numpy array will be of dtype 'object'.  
This can create many complications. Therefore, we change it to float.

In addition, using `float32` over `float64` is memory saving and the precision is good enough. So changing the dtypes to `float32` for all other dtypes. This includes `int32` for the id column and `bool`for the label column. Parquet is able in preserving the dtype, as it is a binary format (CSV format cannot preserve dtype).


In [11]:
# path to data
miiv_path_p = '~/Documents/data/ts/miiv/fully_observed/miiv_ts_wide.parquet'
# read in data
df = pd.read_parquet(miiv_path_p)
# change dtype of time column
# df['time'] = df['time'].apply(lambda x: x.total_seconds() / 60 / 60)
# change all float64 dtypes to float 32
# float64_columns = df.select_dtypes(include='float64').columns
# df[float64_columns] = df[float64_columns].astype('float32')
df = df.astype('float32')
# save data
df.to_parquet(miiv_path_p)
print(df.shape)
df.head()

(3552209, 51)


Unnamed: 0,id,time,label,alb,alp,alt,ast,be,bicar,bili,...,phos,plt,po2,ptt,resp,sbp,temp,tnt,urine,wbc
0,30000152.0,0.0,0.0,,,,,,,,...,,,,,14.0,124.5,36.0,,280.0,
1,30000152.0,1.0,0.0,,,,,-3.0,,,...,,,242.0,,16.0,141.0,37.277779,,45.0,
2,30000152.0,2.0,0.0,,,,,,,,...,,,,,,,,,50.0,
3,30000152.0,3.0,0.0,,,,,-4.0,19.0,,...,3.1,173.0,215.0,25.299999,14.0,116.0,37.5,,50.0,17.0
4,30000152.0,4.0,0.0,,,,,,,,...,,,,,20.0,111.0,,,45.0,


Run this code to do the final preprocessing steps for the data provided by Patrick.  
This includes getting the correct dtypes and applying z-score to selected columns.

In [4]:
path_original_data = Path('~/Documents/data/ts/miiv_old/fully_observed/dyn.parquet') # contains dynamic information
path_preprocessed_data = Path('~/Documents/data/ts/miiv/fully_observed/miiv_ts_wide.parquet') # path where to save
path_preprocessed_data = Path('~/Documents/data/ts/miiv/MCAR_1/miiv_MCAR_1_ts_wide.parquet') # path where to save
path_outcome = '~/Documents/data/ts/miiv_old/fully_observed/outc.parquet' # contains information about sepsis label

# load dynamic data
df = pd.read_parquet(path_original_data)
print('shape:', df.shape)
print('Num unique ids:', pd.unique(df['stay_id']).shape)
print('--- Transformed ---')

# rename stay_id column to id
df = df.rename(columns={'stay_id': 'id'})

# load outcome data (sepsis label)
df_outc = pd.read_parquet(path=path_outcome)
df_outc = df_outc.rename(columns={'stay_id':'id'})

# combine dynamic and outcome data
combined = pd.merge(df_outc, df, on=['id', 'time'], how='left')
df = combined

# change time column to int32
df['time'] = df['time'].apply(lambda x: x.total_seconds() / 60 / 60).astype('int32')

# change all float64 dtypes to float 32
float64_columns = df.select_dtypes(include='float64').columns
df[float64_columns] = df[float64_columns].astype('float32')

# # change label (bool) to float32
# df['label'] = df['label'].astype('float32')

# perform z-score 
labels_to_normalize = df.columns[3:]   # excluding columns: id, time, label
means = df[labels_to_normalize].mean(skipna=True)
std = df[labels_to_normalize].std(skipna=True, ddof=0)
df[labels_to_normalize] = (df[labels_to_normalize] - means) / std
# normalize time -> from 0 to 1
df['time'] = df['time'] / 100

# save data
df.to_parquet(path_preprocessed_data, index=False)

print('shape:', df.shape)
print('Num unique ids:', pd.unique(df['id']).shape)
print(df.dtypes)
df.head()

shape: (3552209, 50)
Num unique ids: (67056,)
--- Transformed ---
shape: (3552209, 51)
Num unique ids: (67056,)
id            int32
time        float64
label          bool
alb         float32
alp         float32
alt         float32
ast         float32
be          float32
bicar       float32
bili        float32
bili_dir    float32
bnd         float32
bun         float32
ca          float32
cai         float32
ck          float32
ckmb        float32
cl          float32
crea        float32
crp         float32
dbp         float32
fgn         float32
fio2        float32
glu         float32
hgb         float32
hr          float32
inr_pt      float32
k           float32
lact        float32
lymph       float32
map         float32
mch         float32
mchc        float32
mcv         float32
methb       float32
mg          float32
na          float32
neut        float32
o2sat       float32
pco2        float32
ph          float32
phos        float32
plt         float32
po2         float32
ptt     

Unnamed: 0,id,time,label,alb,alp,alt,ast,be,bicar,bili,...,phos,plt,po2,ptt,resp,sbp,temp,tnt,urine,wbc
0,30000153,0.0,False,,,,,,,,...,,,,,-0.948346,0.182569,-1.466281,,0.838709,
1,30000153,0.01,False,,,,,-0.49486,,,...,,,0.917195,,-0.581458,0.952453,0.648741,,-0.639859,
2,30000153,0.02,False,,,,,,,,...,,,,,,,,,-0.6084,
3,30000153,0.03,False,,,,,-0.698174,-1.027547,,...,-0.346602,-0.17064,0.644151,-0.65525,-0.948346,-0.214038,1.01657,,-0.6084,0.603746
4,30000153,0.04,False,,,,,,,,...,,,,,0.152318,-0.447336,,,-0.639859,
