In [1]:
import tensorflow as tf
from time import time
import numpy
import os
import json
import pickle
import pandas
import datetime
from functools import partial, reduce

import sys
# Add ../libs (relative to the notebook's working directory) to the import path.
sys.path.append('../libs')

# Presumably the project-local modules that live in ../libs.
import data_pipeline
import conv_model
import initialize
import prepare_data
import flacdb

# Log the device each TensorFlow op runs on; this produces the
# "Executing op ... in device ..." lines seen in the cell outputs further down.
tf.debugging.set_log_device_placement(True)

In [3]:
! nvidia-smi

Wed Oct 23 11:13:21 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.78       Driver Version: 410.78       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX TIT...  On   | 00000000:89:00.0 Off |                  N/A |
| 22%   38C    P8    16W / 250W |      0MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

### Partition Data

In [None]:
metadata = pandas.read_csv('/scr-ssd/mimic/metadata_matched.csv')
subject_ids = metadata['subject_id'].unique()
# Shuffle with a dedicated, fixed-seed generator so the subject split is the
# same on every run and the global numpy RNG state is left untouched.
numpy.random.RandomState(0).shuffle(subject_ids)
# Number of subjects in the test partition: 20% of the unique subjects.
i = round(0.2*len(subject_ids))
# One-off export of the test subjects (first i shuffled ids), kept for reference:
# with open('../test_subject_ids.txt', 'w') as f:
#     f.write('\n'.join(subject_ids[:i].astype('str')))

### Initialize

In [3]:
%%time

# H is used as a dict of settings throughout the notebook (presumably the
# hyperparameters); its contents come from the project-local initialize module.
H = initialize.load_hypes()
# NOTE: this rebinds `metadata`, which the "Partition Data" cell also assigns
# (there from the CSV file).
sig_data, metadata, partition = initialize.load_data(H)
# Prints the data-size summary shown in this cell's output.
initialize.describe_data_size(H, sig_data, metadata)

92 years,  188592 record segments
CPU times: user 36 s, sys: 7.65 s, total: 43.7 s
Wall time: 43.8 s


### Plot Batch

In [None]:
# NOTE(review): `data` is not defined by any earlier cell; it is first created
# in the "Constant Prediction" section, where this same loop appears again.
# On a fresh kernel this cell raises NameError.
# Each split's entry in `data` is replaced by the result of
# data_pipeline.build, so re-running the cell passes an already-built value
# back into build.
for k in ['train', 'validation']:
    data[k] = data_pipeline.build(H, data[k], k)

In [None]:
# NOTE(review): `H_` is only defined in later cells, so this cell depends on
# out-of-order execution. The plotting cell below repeats these three steps but
# calls sample_data(..., is_validation=True) instead of passing 'train';
# confirm which form matches the current initialize.sample_data signature.
dataframe = initialize.sample_data(H_, sig_data, 'train')
tensors = initialize.dataframe_to_tensors(H_, dataframe)
dataset = data_pipeline.build(H_, tensors, is_validation=True)

In [None]:
%matplotlib notebook
from matplotlib import pyplot
from functools import partial
from ipywidgets import interact, IntSlider


# Signal names, one per subplot. plot_batch_example and update enumerate this
# list and use the position j to index the last axis of x (x[i][:, j]).
# Built from the global H, not from the H_ defined further down in this cell.
S = H['output_sigs'] + H['input_sigs_train']
        
def plot_batch_example(H, x, y, i):
    """Draw example ``i`` of a batch as one stacked subplot per signal in ``S``.

    Relies on the notebook-level ``S`` (signal names) and ``pyplot``.

    Args:
        H: mapping; only ``H['window_size']`` is read (x-extent of the label
            lines).
        x: indexed as ``x[i][:, j]`` to get the trace of the j-th name in ``S``.
        y: indexed as ``y[i][0]`` and ``y[i][1]``; drawn as dashed horizontal
            lines stored under 'sys' and 'dia' on the 'ABP' subplot.
        i: index of the example within the batch.

    Returns:
        ``(lines, axes)``. ``lines['sigs'][name]`` is the plotted trace of each
        signal, ``lines['label']`` holds the 'sys' and 'dia' lines, and
        ``axes[name]`` is the subplot of each signal.
    """
    # Colour per signal group; names outside every group keep the default colour.
    groups = {
        'red': ['ABP', 'ART', 'CVP', 'PAP', 'ICP'],
        'darkblue': ['I', 'II', 'III', 'V', 'AVR', 'AVF', 'AVL', 'MCL', 'MCL1'],
        'darkgreen': ['RESP'],
        'darkred': ['PLETH'],
    }
    color_of = {name: color for color, names in groups.items() for name in names}

    pyplot.subplots_adjust(left=0.03, wspace=0, hspace=0)

    axes = {}
    traces = {}
    for row, name in enumerate(S, start=1):
        axis = pyplot.subplot(len(S), 1, row)
        trace = axis.plot(x[i][:, row - 1])[0]
        axis.set_ylabel(name)
        axis.yaxis.tick_right()
        if name in color_of:
            trace.set_color(color_of[name])
        axes[name] = axis
        traces[name] = trace

    def label_line(k):
        # Dashed black horizontal line at y[i][k] across the whole window.
        return axes['ABP'].plot([0, H['window_size']], [y[i][k]] * 2, '--k')[0]

    lines = {
        'sigs': traces,
        'label': {'sys': label_line(0), 'dia': label_line(1)},
    }
    return lines, axes

def update(i):
    """Redraw the existing figure so that it shows batch example ``i``.

    Used as the callback passed to ``interact`` at the end of this cell. Reads
    the notebook-level ``x``, ``y``, ``S``, ``lines``, ``axes``, ``fig`` and
    ``pyplot``; returns nothing.
    """
    example = x[i]
    for col, name in enumerate(S):
        trace = example[:, col]
        lines['sigs'][name].set_ydata(trace)
        lo, hi = trace.min(), trace.max()
        # Pad the y-range by 10% of the span; the span is floored at 0.01 so a
        # flat trace still gets a non-empty range.
        pad = 0.1 * max(0.01, hi - lo)
        axes[name].set_ylim(bottom=lo - pad, top=hi + pad)

    for key, k in (('sys', 0), ('dia', 1)):
        lines['label'][key].set_ydata([y[i][k]] * 2)

    fig.axes[0].set_title('Systolic: %.1f, Diastolic: %.1f' % tuple(y[i]))
    fig.canvas.draw()
    pyplot.show()

# Plot-only copy of H: 'ABP' is prepended to both input-signal lists and the
# epoch / step / buffer / chunk settings are overridden with small values.
H_ = {
    **H, 
    'input_sigs_train': ['ABP'] + H['input_sigs_train'], 
    'input_sigs_validation': ['ABP'] + H['input_sigs_validation'], 
    'epochs': 1, 
    'steps_per_epoch': 1,
    'batch_buffer_size': 1,
    'windows_per_chunk': 5,
}

# Sample data, convert it to tensors, build the dataset, and pull one batch out
# of it as numpy arrays. update() reads x and y as notebook globals.
dataframe = initialize.sample_data(H_, sig_data, is_validation=True)
tensors = initialize.dataframe_to_tensors(H_, dataframe)
dataset = data_pipeline.build(H_, tensors, is_validation=True)
x_tf, y_tf = next(iter(dataset))
x, y = x_tf.numpy(), y_tf.numpy()

# fig, lines and axes are likewise read by update() as notebook globals.
fig = pyplot.figure(figsize=[8, 6])
lines, axes = plot_batch_example(H_, x, y, 0)

# Slider over example indices 0 .. batch_size-1; every change calls update(i).
# NOTE(review): assumes the batch held in x contains H_['batch_size'] examples — confirm.
interact(update, i=IntSlider(min=0, max=H_['batch_size']-1, value=0));

### Filter Percentage

In [None]:
%%time

# Number of dataframe rows selected for the estimate.
N = 10**4

H_ = {
    **H, 
    'epochs': 1,
    # NOTE(review): true division makes this a float; confirm data_pipeline
    # accepts a non-integer step count.
    'steps_per_epoch': N / H['batch_size'] / 10 * 2,
    'batch_buffer_size': 1,
    'windows_per_chunk': 10,
    'filter_data': True
}

dataframe = initialize.sample_data(H_, sig_data)
print(dataframe.shape)
# Draw N // windows_per_chunk random chunk indices, then expand each chunk index
# c into the row positions c*windows_per_chunk .. c*windows_per_chunk + windows_per_chunk - 1.
# assumes rows of `dataframe` are laid out in consecutive runs of windows_per_chunk — TODO confirm
I = numpy.random.permutation(dataframe.shape[0] // H_['windows_per_chunk'])[:N // H_['windows_per_chunk']]
I = [i*H_['windows_per_chunk'] + j for i in I for j in range(H_['windows_per_chunk'])]
tensors = initialize.dataframe_to_tensors(H_, dataframe.iloc[I])
dataset = data_pipeline.build(H_, tensors)
# Sum of the leading dimension of the first component of every element the
# dataset yields.
n = sum(i[0].shape[0] for i in dataset)
# That count as a percentage of N, rounded to one decimal (the cell's displayed value).
round(n / N * 100, 1)

In [None]:
%%time

# NOTE(review): this cell is a verbatim copy of the cell directly above it;
# consider keeping a single copy.

# Number of dataframe rows selected for the estimate.
N = 10**4

H_ = {
    **H, 
    'epochs': 1,
    # NOTE(review): true division makes this a float; confirm data_pipeline
    # accepts a non-integer step count.
    'steps_per_epoch': N / H['batch_size'] / 10 * 2,
    'batch_buffer_size': 1,
    'windows_per_chunk': 10,
    'filter_data': True
}

dataframe = initialize.sample_data(H_, sig_data)
print(dataframe.shape)
# Draw N // windows_per_chunk random chunk indices, then expand each chunk index
# c into the row positions c*windows_per_chunk .. c*windows_per_chunk + windows_per_chunk - 1.
# assumes rows of `dataframe` are laid out in consecutive runs of windows_per_chunk — TODO confirm
I = numpy.random.permutation(dataframe.shape[0] // H_['windows_per_chunk'])[:N // H_['windows_per_chunk']]
I = [i*H_['windows_per_chunk'] + j for i in I for j in range(H_['windows_per_chunk'])]
tensors = initialize.dataframe_to_tensors(H_, dataframe.iloc[I])
dataset = data_pipeline.build(H_, tensors)
# Sum of the leading dimension of the first component of every element the
# dataset yields.
n = sum(i[0].shape[0] for i in dataset)
# That count as a percentage of N, rounded to one decimal (the cell's displayed value).
round(n / N * 100, 1)

### Fit Model

In [None]:
%%time

# NOTE(review): neither `pipeline` nor `dataframes` is defined anywhere in the
# visible notebook (the imported module is `data_pipeline`, and other cells pass
# it tensors rather than dataframes). As written this cell raises NameError on
# a fresh kernel; confirm the intended calls before running.
dataset = {
    'train': pipeline.build(H, dataframes['train']),
    'validation': pipeline.build(H, dataframes['validation']),
}

model = conv_model.build(H)

model.summary()

# One timestamped TensorBoard log directory per run, under /scr-ssd/tflogs.
logdir = os.path.join('/scr-ssd/tflogs', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1, embeddings_freq=5)

# validation_steps reuses H['steps_per_epoch'], so each epoch validates on as
# many steps as it trains on.
model.fit(
    dataset['train'],
    validation_data = dataset['validation'],
    epochs = H['epochs'],
    steps_per_epoch = H['steps_per_epoch'],
    validation_steps = H['steps_per_epoch'],
    callbacks = [tensorboard_callback]
)

### Build Model

#### Fully Connected

In [None]:
def simple_fc_model(H):
    """Build and compile a fully connected regression model with two outputs.

    The input window is flattened, passed through three Dense layers of
    ``H['dense_units']`` units, and mapped to two values by a final Dense(2)
    layer whose bias is overwritten with ``[120, 60]``.

    Args:
        H: mapping of settings. Keys read: ``'window_size'``, ``'input_sigs'``
            (only its length is used), ``'dense_units'``, ``'activation'`` and
            ``'optimizer'`` (a mapping with ``'name'`` and ``'args'``).

    Returns:
        The compiled ``tf.keras`` model (mean-squared-error loss,
        mean-absolute-error metric).
    """
    inputs = tf.keras.layers.Input(shape=(H['window_size'], len(H['input_sigs'])))
    z = tf.keras.layers.Flatten()(inputs)
    for _ in range(3):
        z = tf.keras.layers.Dense(H['dense_units'], activation=H['activation'])(z)
    final_layer = tf.keras.layers.Dense(2)
    outputs = final_layer(z)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    # Keep the initial kernel and replace only the bias with [120, 60]
    # (presumably typical systolic / diastolic values — confirm).
    final_layer.set_weights([
        final_layer.get_weights()[0],
        tf.keras.backend.constant([120, 60], dtype='float32')
    ])

    # Let Keras resolve the optimizer: optimizers.get matches built-in names
    # case-insensitively. The previous getattr(..., name.title()) lookup turned
    # 'sgd' into 'Sgd' and 'rmsprop' into 'Rmsprop', which are not attributes of
    # tf.keras.optimizers. The args dict is copied so Keras cannot mutate H.
    optimizer = tf.keras.optimizers.get({
        'class_name': H['optimizer']['name'],
        'config': dict(H['optimizer']['args']),
    })

    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )

    return model

#### Simple Convolution

In [None]:
def simple_conv_model(H):
    """Build and compile a small 1-D convolutional regression model.

    Three Conv1D layers (128 filters, kernel size 32, stride 4, 'same' padding,
    ReLU) are followed by Flatten, Dense(128, ReLU) and a final Dense(2) layer
    whose bias is overwritten with ``[120, 60]``.

    Args:
        H: mapping of settings. Keys read: ``'window_size'``, ``'input_sigs'``
            (only its length is used) and ``'optimizer'`` (a mapping with
            ``'name'`` and ``'args'``).

    Returns:
        The compiled ``tf.keras`` model (mean-squared-error loss,
        mean-absolute-error metric).
    """
    inputs = z = tf.keras.layers.Input(shape=(H['window_size'], len(H['input_sigs'])))

    for _ in range(3):
        layer = tf.keras.layers.Conv1D(
            filters=128,
            padding='same',
            strides=4,
            kernel_size=32,
            activation='relu'
        )
        z = layer(z)

    z = tf.keras.layers.Flatten()(z)
    z = tf.keras.layers.Dense(128, activation='relu')(z)
    final_layer = tf.keras.layers.Dense(2)
    outputs = final_layer(z)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    # Keep the initial kernel and replace only the bias with [120, 60]
    # (presumably typical systolic / diastolic values — confirm).
    final_layer.set_weights([
        final_layer.get_weights()[0],
        tf.keras.backend.constant([120, 60], dtype='float32')
    ])

    # Let Keras resolve the optimizer: optimizers.get matches built-in names
    # case-insensitively. The previous getattr(..., name.title()) lookup turned
    # 'sgd' into 'Sgd' and 'rmsprop' into 'Rmsprop', which are not attributes of
    # tf.keras.optimizers. The args dict is copied so Keras cannot mutate H.
    optimizer = tf.keras.optimizers.get({
        'class_name': H['optimizer']['name'],
        'config': dict(H['optimizer']['args']),
    })

    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )

    return model

#### Initial Bias

In [None]:
model = conv_model.build(H)
# NOTE(review): a bare `sample_data` is not defined in the visible notebook
# (other cells call initialize.sample_data with different arguments) — confirm
# where it comes from.
data = sample_data(H)
x, y = next(iter(data['train']))
# Mean over axis 0 of the model's predictions for one batch; no training
# happens in this cell.
# NOTE(review): `.numpy()` requires predict_on_batch to return a tensor;
# confirm for the TensorFlow version in use.
model.predict_on_batch(x).numpy().mean(0)

#### Overfit Batch

In [None]:
# Take a single training batch, repeat it 5000 times, and fit on that dataset.
# NOTE(review): `model` is whatever an earlier cell left in the kernel, and a
# bare `sample_data` is not defined in the visible notebook — confirm both.
data = sample_data(H)
x, y = next(iter(data['train']))
# Overwrites data['train'] with the repeated single batch.
data['train'] = tf.data.Dataset.from_tensors((x, y)).repeat(5000)
model.fit(data['train'])

#### Zero Input

In [None]:
%%time

# Three rounds on the same model: fit on 100 training batches whose inputs are
# multiplied by zero (labels unchanged), then evaluate on 100 validation
# batches with their inputs left as they are.
# NOTE(review): relies on `data` left in the kernel by an earlier cell.
model = simple_conv_model(H)
for i in range(3):
    model.fit(data['train'].take(100).map(lambda x, y: (x*0, y)))
    model.evaluate(data['validation'].take(100))

#### Simple Conv

In [None]:
%%time

# Three rounds on the same model instance: obtain data from sample_data, fit on
# the training split, evaluate on the validation split.
# NOTE(review): a bare `sample_data` is not defined in the visible notebook — confirm.
model = simple_conv_model(H)
for i in range(3):
    data = sample_data(H)
    model.fit(data['train'])
    model.evaluate(data['validation'])

#### Custom Conv

In [None]:
# NOTE(review): the "Fit Model" cell writes its TensorBoard logs under
# /scr-ssd/tflogs, while this points at a relative `logs` directory — confirm
# which one is intended. --host '0.0.0.0' makes TensorBoard listen on all
# network interfaces.
%load_ext tensorboard
%tensorboard --logdir logs --host '0.0.0.0'

### Constant Prediction

In [2]:
H = initialize.load_hypes()

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by a trusted source.
with open('/scr-ssd/mimic/initial_data.pkl', 'rb') as f:
    data = pickle.load(f)

# Replace each split's unpickled entry with the result of data_pipeline.build.
for k in ['train', 'validation']:
    data[k] = data_pipeline.build(H, data[k], k)

Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Range in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Pack in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GatherV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Transpose in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BroadcastTo in device /job:localhost/replica:0/task:0/device:CPU:0


In [10]:
%%time

# Average the per-batch label means over n training batches, where n is
# 10**5 // batch_size (i.e. about 10**5 examples).
n = 10**5
n //= H['batch_size']
y_ = tf.constant(0, dtype='float32')
for x, y in data['train'].take(n):
    y_ += tf.reduce_mean(y, axis=0)
# NOTE(review): divides by n even if take(n) yielded fewer than n batches.
y_ /= n
print(y_.numpy())

68.937294
CPU times: user 18min 14s, sys: 32 s, total: 18min 46s
Wall time: 6min 47s


In [11]:
# Baseline model that ignores its input and always outputs y_: slice an element
# out of the input, multiply it by zero, and add y_ (computed in the previous cell).
# NOTE(review): x[:1, 0, 0] keeps only the first example along the batch axis,
# so the output presumably relies on broadcasting against the labels in the
# loss — confirm.
x = tf.keras.layers.Input(shape=(H['window_size'], len(H['input_sigs_train'])))
z = x[:1, 0, 0]
z *= tf.constant(0, dtype='float32')
z += y_

const_model = tf.keras.models.Model(inputs=x, outputs=z)
# Only a loss is configured; the model is evaluated, never trained.
const_model.compile(loss='mean_absolute_error')
const_model.evaluate(data['validation'].take(n))

Executing op __inference_keras_scratch_graph_18972 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_keras_scratch_graph_18977 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_keras_scratch_graph_18982 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_keras_scratch_graph_18987 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_keras_scratch_graph_18992 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_19061 in device /job:localhost/replica:0/task:0/device:CPU:0


18.136708411254883

In [None]:
%%time

# Same averaging of per-batch label means as the earlier mean-label cell, but
# over a fixed 5000 batches and with a two-element accumulator.
n = 5000
y_ = tf.constant([0, 0], dtype='float32')
for x, y in data['train'].take(n):
    y_ += tf.reduce_mean(y, axis=0)
# NOTE(review): divides by n even if take(n) yielded fewer than n batches.
y_ /= n
print(y_.numpy())

In [None]:
# NOTE(review): `data_util` is never imported in the visible notebook, so this
# raises NameError on a fresh kernel — confirm the module it should come from.
data_util.calculate_training_speed(H, 5000, seconds_to_train=110)

In [None]:
# Hand-entered speed figures, one row per configuration that was tried.
df = pandas.DataFrame(
    [
        (16, 3, 'flac',   10,   14),
        (16, 6, 'flac',   10,   22),
        (16, 3, 'flac',   50,   65),
        (16, 3, 'flac',   1000, 107),
        (16, 3, 'flac',   1000, 182),
        (16, 3, 'zlib',   10,   24),
        (16, 3, 'zlib',   50,   93),
        (16, 3, 'zlib',   100,  112),
        (16, 3, 'serial', 10,   68),
        (16, 3, 'serial', 50,   178),
        (16, 3, 'serial', 100,  210),
        (16, 3, 'serial', 1000, 311),
        (16, 3, 'chunks', 10,   248),
        (16, 3, 'memory', -1,   345),
        (16, 6, 'memory', -1,   459),
    ],
    columns=['CPU count', 'GPU count', 'Format', 'Windows', 'Speed (days/hr)'],
)
# Displayed slowest-first; df itself keeps the entry order above.
df.sort_values('Speed (days/hr)')

### Dummy Data in Memory

In [None]:
n_batches = 1000

# Random inputs in [-1, 1) and random labels in [40, 200). The leading axis has
# n_batches entries, so from_tensor_slices yields one
# (batch_size, ...)-shaped x/y pair per dataset element.
# NOTE(review): this reads H['input_sigs'], while other cells read
# H['input_sigs_train'] — confirm the key exists in H.
x = tf.random.uniform(
    shape = [n_batches, H['batch_size'], H['window_size'], len(H['input_sigs'])],
    minval = -1,
    maxval = 1,
)
y = tf.random.uniform(shape=[n_batches, H['batch_size'], 2], minval=40, maxval=200)
dummy_data = tf.data.Dataset.from_tensor_slices((x, y))
model = conv_model.build(H)

# Time a fit over the in-memory tensors.
%time model.fit(dummy_data, shuffle=False)