# Intro

* The goal of this notebook is to evaluate the quality of the generated data by comparing it against the real data.

In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells list several bare expressions (shapes, means, ...) and rely on this
# setting to show all of them.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pickle
import numpy as np

# plotly
import plotly.express as px  # (version 4.7.0 or higher)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
from fastNLP import DataSet, DataSetIter, RandomSampler, SequentialSampler

def gen_dataset(raw, static_processor=None, dynamic_processor=None):
    """Build a fastNLP ``DataSet`` from a raw ``(static, dynamic)`` pair.

    Args:
        raw: Two-element sequence ``(sta, dyn)``. ``sta`` is handed as a whole
            to the static processor; ``dyn`` is a re-iterable collection of
            per-sample sequences, each handed separately to the dynamic
            processor.
        static_processor: Object exposing ``transform(sta)``. When omitted,
            the notebook-level ``s_P`` is used (the original behaviour).
        dynamic_processor: Object exposing ``transform(sample)``. Its result
            must be indexable and hold at least five items, each exposing
            ``tolist()``. When omitted, the notebook-level ``d_P`` is used
            (the original behaviour).

    Returns:
        A ``DataSet`` with the fields ``seq_len``, ``dyn``, ``lag``, ``mask``,
        ``sta``, ``times``, ``priv`` and ``nex``.

    Raises:
        NameError: if a processor is omitted and the matching notebook-level
            name (``s_P`` / ``d_P``) has not been defined yet.
    """
    # Fall back to the notebook globals only when no processor is passed, so
    # existing calls `gen_dataset(raw)` keep working unchanged.
    if static_processor is None:
        static_processor = s_P
    if dynamic_processor is None:
        dynamic_processor = d_P

    sta, dyn = raw
    s = static_processor.transform(sta)
    seq_len = [len(x) for x in dyn]
    transformed = [dynamic_processor.transform(ds) for ds in dyn]

    def column(index):
        # Pick one component of every transformed sample as plain Python lists.
        return [item[index].tolist() for item in transformed]

    # Component positions follow the original code: 0=dyn, 1=lag, 2=mask,
    # 3=priv, 4=nex, and the last component holds the times.
    return DataSet({"seq_len": seq_len,
                    "dyn": column(0), "lag": column(1), "mask": column(2),
                    "sta": s, "times": column(-1), "priv": column(3), "nex": column(4)
                    })

In [4]:
# Pickle stored under the 2012 result directory.
# presumably this holds the generated data that the notebook evaluates — TODO confirm
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
add_data = '2012_result/test/2012.pkl'
with open(add_data, 'rb') as f:
    dataset = pickle.load(f)

# Pickle of the full 2012 data; it also carries the two processors read below.
add_raw = 'data/physio_data/full2012.pkl'
with open(add_raw, 'rb') as f:
    rawset = pickle.load(f)

# gen_dataset looks up d_P and s_P as notebook-level names, so they must be
# bound before it is called.
d_P=rawset["dynamic_processor"]
s_P=rawset["static_processor"]
# Convert the (static, dynamic) training pair from `dataset` into a DataSet and display it.
dataset_train2 = gen_dataset(dataset['train_set'])
dataset_train2

+---------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+
| seq_len | dyn           | lag           | mask          | sta           | times         | priv          | nex           |
+---------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+
| 47      | [[0.0, 0.0... | [[0.070486... | [[0.0, 0.0... | [0.7733333... | [[0.070486... | [[0.0, 0.0... | [[0.0, 0.0... |
| 55      | [[0.055175... | [[0.014583... | [[1.0, 1.0... | [0.4666666... | [[0.014583... | [[0.0, 0.0... | [[0.0, 0.0... |
| 40      | [[0.0, 0.0... | [[0.008333... | [[0.0, 0.0... | [0.6  1.  ... | [[0.008333... | [[0.0, 0.0... | [[0.0, 0.0... |
| 65      | [[0.0, 0.0... | [[0.024652... | [[0.0, 0.0... | [0.2933333... | [[0.024652... | [[0.0, 0.0... | [[0.0, 0.0... |
| 121     | [[0.0, 0.0... | [[0.003819... | [[0.0, 0.0... | [0.5733333... | [[0.003819... | [[0.0, 0.0... | [[0.0, 0.0... |
| 73    

In [6]:
# List the top-level keys of both pickles to compare their layout.
rawset.keys()
dataset.keys()



dict_keys(['train_set', 'raw_set', 'test_set', 'val_set', 'dynamic_processor', 'static_processor'])

dict_keys(['train_set', 'raw_set', 'test_set', 'val_set', 'dynamic_processor', 'static_processor'])

In [11]:
# Prevalence of the label in the test set of `rawset`:
# sum of the 'Label' column divided by its number of rows.
test_labels = rawset['test_set'][0]['Label']
test_labels.sum() / len(test_labels)

0.125

In [7]:
# What differs between the two pickles (`rawset` vs `dataset`)?
# Each pair below is displayed one after the other and compared by eye.

# Test sets: compare the per-column sums of the first element of each test set
# (a summary check, not an element-wise comparison).
rawset['test_set'][0].sum()
dataset['test_set'][0].sum()

# Validation sets: display both to compare them.
rawset['val_set']
dataset['val_set']

# Raw sets: display the first element of each to compare them.
rawset['raw_set'][0]
dataset['raw_set'][0]

# The training sets are where the two pickles differ, so compare the
# static ('sta') features of the two training sets.

# Stack the 'sta' vector of every sample in dataset_train2
# (built from dataset['train_set']) into one array.
temp_syn = np.stack([sample['sta'] for sample in dataset_train2])
temp_syn.shape

# Same for the training set stored in rawset.
temp_real = np.stack([sample['sta'] for sample in rawset['train_set']])
temp_real.shape

# Mean and standard deviation over samples (axis 0), shown as (temp_syn, temp_real) pairs.
temp_syn.mean(axis=0), temp_real.mean(axis=0)
temp_syn.std(axis=0), temp_real.std(axis=0)


Age        25210.00
Gender       222.00
Height     35285.50
ICUType     1110.00
Weight     30755.18
Label         50.00
seq_len    29050.00
dtype: float64

Age        25210.00
Gender       222.00
Height     35285.50
ICUType     1110.00
Weight     30755.18
Label         50.00
seq_len    29050.00
dtype: float64

+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| seq_len | dyn          | lag          | mask         | sta          | times        | priv         | nex          | label |
+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| 44      | [[0.0, 0.... | [[0.02986... | [[0.0, 0.... | [0.466666... | [[0.02986... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 58      | [[0.0, 0.... | [[0.03576... | [[0.0, 0.... | [0.4     ... | [[0.03576... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 61      | [[0.0, 0.... | [[0.02430... | [[0.0, 0.... | [0.266666... | [[0.02430... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 118     | [[0.0, 0.... | [[0.00277... | [[0.0, 0.... | [0.4     ... | [[0.00277... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 65      | [[0.0, 0.... | [[0.06180... | [[0.0, 0.... | [0.52    ... | [[0.06180... | [[0.0, 0.... | [[0.0, 0.... | 1.0   |


+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| seq_len | dyn          | lag          | mask         | sta          | times        | priv         | nex          | label |
+---------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+-------+
| 44      | [[0.0, 0.... | [[0.02986... | [[0.0, 0.... | [0.466666... | [[0.02986... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 58      | [[0.0, 0.... | [[0.03576... | [[0.0, 0.... | [0.4     ... | [[0.03576... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 61      | [[0.0, 0.... | [[0.02430... | [[0.0, 0.... | [0.266666... | [[0.02430... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 118     | [[0.0, 0.... | [[0.00277... | [[0.0, 0.... | [0.4     ... | [[0.00277... | [[0.0, 0.... | [[0.0, 0.... | 0.0   |
| 65      | [[0.0, 0.... | [[0.06180... | [[0.0, 0.... | [0.52    ... | [[0.06180... | [[0.0, 0.... | [[0.0, 0.... | 1.0   |


Unnamed: 0,Age,Gender,Height,ICUType,Weight,Label,seq_len
0,54.0,0.0,,4.0,,0.0,50
1,76.0,1.0,175.3,2.0,76.0,0.0,93
2,44.0,0.0,,3.0,56.7,0.0,70
3,68.0,1.0,180.3,3.0,84.6,0.0,68
4,88.0,0.0,,3.0,,0.0,50
...,...,...,...,...,...,...,...
3589,39.0,1.0,177.8,3.0,87.1,0.0,57
3590,58.0,0.0,,1.0,102.8,0.0,56
3591,90.0,1.0,,3.0,78.0,0.0,52
3592,56.0,0.0,170.2,2.0,131.2,0.0,91


Unnamed: 0,Age,Gender,Height,ICUType,Weight,Label,seq_len
0,54.0,0.0,,4.0,,0.0,50
1,76.0,1.0,175.3,2.0,76.0,0.0,93
2,44.0,0.0,,3.0,56.7,0.0,70
3,68.0,1.0,180.3,3.0,84.6,0.0,68
4,88.0,0.0,,3.0,,0.0,50
...,...,...,...,...,...,...,...
3589,39.0,1.0,177.8,3.0,87.1,0.0,57
3590,58.0,0.0,,1.0,102.8,0.0,56
3591,90.0,1.0,,3.0,78.0,0.0,52
3592,56.0,0.0,170.2,2.0,131.2,0.0,91


(3594, 13)

(3594, 13)

(array([0.6821406 , 0.58792432, 1.        , 0.20381375, 0.53505843,
        0.18670006, 0.23149694, 0.33889816, 0.24290484, 0.25011452,
        0.94685587, 0.13105175, 0.37282693]),
 array([0.6585383 , 0.56204786, 0.99916528, 0.20633342, 0.52782415,
        0.14190317, 0.22148024, 0.37228715, 0.26432944, 0.23670255,
        0.92097941, 0.14023372, 0.36035476]))

(array([0.15838522, 0.49220861, 0.        , 0.1975128 , 0.49876939,
        0.38967056, 0.42178917, 0.47333519, 0.42883806, 0.13995259,
        0.22432082, 0.33745695, 0.13949899]),
 array([0.23206309, 0.49613513, 0.02887954, 0.19823097, 0.49922522,
        0.3489508 , 0.415243  , 0.48341434, 0.44097549, 0.1180401 ,
        0.2697709 , 0.34722936, 0.11536292]))

# Missing pattern

In [12]:
# Debug aid: `[x.shape for x in temp]` lists the shape of every per-sample mask.
# One integer array per training sample in rawset, taken from its 'mask' field.
temp = [ np.array(sample['mask'], dtype=int)   for sample in rawset['train_set'] ]

# Concatenate all samples along axis 0 into a single matrix.
X = np.concatenate(temp, axis=0)

X.shape
# Column sum as a percentage of the number of rows, rounded to one decimal.
# presumably a mask value of 1 marks an observed entry, making this the observed rate — TODO confirm
(X.sum(0)/X.shape[0]*100).round(1)

# Heat map of the first 120 rows of the matrix.
px.imshow(X[:120])

(266211, 35)

array([ 1. ,  1.1,  1.1,  1.1,  4.7,  0.1,  4.7, 49.3, 10.9, 20.7,  4.4,
        4.6,  6.2, 77.4,  4.9,  2.7,  4.6, 48.7, 10.5,  4.6, 33.3, 32.8,
       33.3,  7.9,  7.9,  8.2,  4.8, 18.4,  2.8, 49.3, 29.3,  0.1,  0.7,
       45.8,  4.4])

In [14]:
# One 0/1 array per element of dataset['train_set'][1] (the part gen_dataset
# treats as the dynamic data): 1 where the value is not null, 0 where it is null.
temp = [ (sample.notnull()*1).values   for sample in dataset['train_set'][1] ]

# Concatenate all samples along axis 0 and drop the first column.
# presumably the first column is the time stamp rather than a feature — TODO confirm
X= np.concatenate(temp, axis=0)[:,1:]
# Percentage of non-null entries per column, rounded to one decimal.
(X.sum(0)/X.shape[0]*100).round(1)

# Heat map of the first 120 rows of the matrix.
px.imshow(X[:120])

(44, 36)

array([ 1.2,  1.3,  1.1,  0.3,  4.4,  0. ,  3.6, 44.7,  3. ,  6.6,  2.5,
        4.6,  3.1, 72.1,  2.2,  1.2,  2.7, 44.3,  3. ,  4.6, 26.3, 25.2,
       26.8,  4.6,  4.7,  3.1,  3.7,  9.1,  5.4, 43.6, 10. ,  0. ,  0.3,
       13.4,  3. ])

# Missing rate