<a href="https://colab.research.google.com/github/jwu19/FIDDLE/blob/master/FIDDLE_hands_on.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download files

In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0


In [None]:
# Download formatted sample data, PhysioNet Challenge 2012
# https://physionet.org/content/challenge-2012/
!gdown -q --id '1LabZCw6Ryjv1wwT4dWbQTWPvsehM6WiX'
!gdown -q --id '16UlO3E9q5pDmn_K6wgAelAwg2IhsQJtX'
!gdown -q --id '1JcO7FYY5QAix2Ui-dAlK8rEq9XGu_9sC'
!mkdir -p physionet_2012_data

# Download FIDDLE and unzip
!rm -rf FIDDLE FIDDLE-master.zip
# !wget -q https://gitlab.eecs.umich.edu/mld3/FIDDLE/-/archive/v0.1.0/FIDDLE-v0.1.0.zip
# !unzip -qq -j FIDDLE-v0.1.0.zip 'FIDDLE-v0.1.0/FIDDLE/*' -d FIDDLE/
!wget -q https://github.com/MLD3/FIDDLE/archive/refs/tags/v0.2.2.zip
!unzip -qq -j v0.2.2.zip 'FIDDLE-0.2.2/FIDDLE/*' -d FIDDLE/

# Update packages
!pip install -q -U pyyaml sparse scikit-learn
!pip install -q -U icd9cms icd10-cm

# DONE
!echo 'DONE!'

unzip:  cannot find or open FIDDLE-v0.2.2.zip, FIDDLE-v0.2.2.zip.zip or FIDDLE-v0.2.2.zip.ZIP.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m738.4/738.4 KB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hDONE!


In [None]:
import numpy as np
import pandas as pd
import sparse
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the long-format input table that is also passed to FIDDLE via --data_fname.
input_csv = './physionet_2012_input_data.csv'
df_input = pd.read_csv(input_csv)

# Input data

In [None]:
# Preview the first 50 rows of the input table.
df_input.iloc[:50]

Unnamed: 0,ID,t,variable_name,variable_value
0,132539,,Age,54.0
1,132539,,Gender,_0
2,132539,,ICUType,_4
3,132539,0.116667,GCS,15.0
4,132539,0.116667,HR,73.0
5,132539,0.116667,NIDiasABP,65.0
6,132539,0.116667,NIMAP,92.33
7,132539,0.116667,NISysABP,147.0
8,132539,0.116667,RespRate,19.0
9,132539,0.116667,Temp,35.1


# Run FIDDLE (takes ~1min)

In [None]:
!echo 'parallel: no' >> config.yaml     # turns off parallelization
!echo 'discretize: no' >> config.yaml   # turns off variable discretization

In [None]:
# Run the FIDDLE pipeline on the downloaded data, using the config.yaml in the
# working directory; results are written to ./physionet_2012_data/.
#   --T / --dt          : with T=48 and dt=24 the run log reports L = 2
#   --theta_1           : the run log reports "Remove rare variables (<= 0.01)"
#   --theta_2,
#   --theta_freq        : NOTE(review): semantics are defined inside FIDDLE and
#                         are not visible in this notebook -- see the FIDDLE docs
#   --stats_functions   : logged as k = 3 ['min', 'max', 'mean']
#   --N                 : logged as N = 200; presumably caps the number of
#                         examples processed -- TODO confirm
!python -m FIDDLE.run \
    --data_fname='./physionet_2012_input_data.csv' \
    --population_fname='./physionet_2012_pop.csv' \
    --config_fname='./config.yaml' \
    --output_dir='./physionet_2012_data/' \
    --T=48 \
    --dt=24 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1 \
    --stats_functions 'min' 'max' 'mean' \
    --N=200

Input:
    Data      : ./physionet_2012_input_data.csv
    Population: ./physionet_2012_pop.csv
    Config    : ./config.yaml

Output directory: ./physionet_2012_data/

Input arguments:
    T      = 48.0
    dt     = 24.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']

discretize = no

N = 200
L = 2


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 48]
Remove rare variables (<= 0.01)
Total variables     : 41
Rare variables      : 0
Remaining variables : 41
# rows (original)   : 4878840
# rows (filtered)   : 4876437

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./physionet_2012_data/value_types.csv

--------------------------------------------------------------------------------
*) Separate time-invariant and time-dependent

# Train a model

In [None]:
# Load the arrays written by FIDDLE. X is unpacked below as a 3-D array
# (N examples, L time bins, D features); S is 2-D.
DATA_PATH = 'physionet_2012_data'
X = sparse.load_npz('{data_path}/X.npz'.format(data_path=DATA_PATH)).todense()
N, L, D = X.shape

# Keep the first N rows of S so that it lines up with X (previously a
# hard-coded 200, which only matched because FIDDLE was run with --N=200).
S = sparse.load_npz('{data_path}/S.npz'.format(data_path=DATA_PATH)).todense()[:N]
_, d = S.shape

# NOTE(review): taking the first N labels assumes the label file lists
# patients in the same order as the rows of X/S -- confirm.
y = pd.read_csv('physionet_2012_labels.csv')['In-hospital_death']
y_all = y[:N]

# One flat feature vector per example: S followed by X flattened over time.
X_all = np.hstack([S, X.reshape((N, L * D))])
print(X_all.shape, y_all.shape)

# Split BEFORE scaling so that the test rows play no part in fitting the
# scaler (fitting it on all rows leaks test-set statistics into training).
Xtr, Xte, ytr, yte = train_test_split(X_all, y_all, stratify=y_all, random_state=1)

# Fit the min/max range on the training split only and reuse it for the test
# split; test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
Xtr = scaler.fit_transform(Xtr)
Xte = scaler.transform(Xte)

(200, 319) (200,)


In [None]:
# Fit a logistic-regression classifier (scikit-learn defaults) on the training
# split. `fit` returns the estimator itself, so the trailing `clf` displays
# the fitted model as the cell output.
clf = LogisticRegression().fit(Xtr, ytr)
clf

In [None]:
# Score the held-out split: AUROC computed from the classifier's decision
# values.
test_decision_values = clf.decision_function(Xte)
score = metrics.roc_auc_score(yte, test_decision_values)
print('Test AUROC score:', score)

Test AUROC score: 0.7121212121212122
