In [24]:
import os
import re
import time
import keras
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Seed passed to np.random.seed() before the items are shuffled.
RND_SEED = 777

# Directory that is scanned for the per-message .npy files.
MSGS_DIR = 'out/msgs'

MSG_SHAPE_IN = (16, 16, 64)
# MSG_SHAPE_OUT = (96, 2048, 16)
# Shape every loaded message is reshaped to before it is stored:
# MSG_SHAPE_IN with one trailing axis of length 1, i.e. (16, 16, 64, 1).
MSG_SHAPE_OUT = MSG_SHAPE_IN + (1,)

# Training data is written in numbered files; '%d' is the file index.
TRAIN_SAMPLES_PER_FILE = 10000
TRAIN_FILE = 'out/train-msgs-%d.mem'
TRAIN_YS_FILE = 'out/train-ys-%d.npy'
TRAIN_PATS_FILE = 'out/train-pats-%d.npy'

# The validation set is a single file triple holding VALIDATION_SIZE items.
VALIDATION_SIZE = 1000
VALIDATION_FILE = 'out/val-msgs.mem'
VALIDATION_YS_FILE = 'out/val-ys.npy'
VALIDATION_PATS_FILE = 'out/val-pats.npy'

# One-hot feature vector per patient id:
#   '1' -> [1., 0., 0.], '2' -> [0., 1., 0.], '3' -> [0., 0., 1.]
PATIENT_FEATURES = {
    str(k + 1): [float(j == k) for j in range(3)]
    for k in range(3)
}

In [26]:
# Display MSG_SHAPE_OUT with an extra leading axis of length 1 prepended.
(1,) + MSG_SHAPE_OUT

(1, 16, 16, 64, 1)

In [27]:
# Seed NumPy's global random generator; the shuffle of the training items
# further down draws from it.
np.random.seed(RND_SEED)

In [28]:
# list files in directories
def list_files(src_dirs):
    """Recursively collect the paths of all files below one or more directories.

    Args:
        src_dirs: A single directory path, or a list/tuple of directory paths.

    Returns:
        A flat list of file paths, each built by joining the walked directory
        path with the file name. Roots are processed in the order given; inside
        a root, sub-directories are visited and files are listed in sorted
        order, so the result does not depend on the order in which the
        filesystem enumerates entries. A root that does not exist contributes
        nothing.
    """
    # Anything that is not already a list/tuple is treated as a single root.
    if not isinstance(src_dirs, (list, tuple)):
        src_dirs = [src_dirs]

    found = []

    for root in src_dirs:
        for dirpath, dirnames, filenames in os.walk(root):
            # Sorting dirnames in place fixes the order in which os.walk
            # descends into the sub-directories (top-down traversal).
            dirnames.sort()
            found.extend(os.path.join(dirpath, name) for name in sorted(filenames))

    return found

In [29]:
# create list of fids+ys for training
# Each record is [fid, y, patient]:
#   fid     - the file name without '.npy', in the form '<a>_<b>_<c>'
#   y       - the third numeric field of the name, as a float
#   patient - the first numeric field of the name, kept as a string
train_items = []

for f in tqdm(list_files(MSGS_DIR)):

    # Match the WHOLE base name. The fid is later turned back into a path
    # (MSGS_DIR + '/' + fid + '.npy'), so a pattern that only matched the tail
    # of a longer name (e.g. 'abc12_3_1.npy' -> '12_3_1') would produce a fid
    # that points at a different or non-existent file.
    m = re.match(r'(\d+)_(\d+)_(\d+)\.npy$', os.path.basename(f))

    if m is not None:
        fields = m.groups()
        fid = "%s_%s_%s" % fields
        y = float(fields[-1])
        patient = fields[0]
        train_items.append([fid, y, patient])

# In-place shuffle using NumPy's global random generator.
np.random.shuffle(train_items)

100%|██████████| 12168/12168 [00:00<00:00, 523938.15it/s]


In [30]:
# create dataset files
print 'Total training items:', len(train_items), '\n'; time.sleep(0.5)

Total training items: 6042 



In [31]:
i = 0
validation_set_created = False

while len(train_items):

    if validation_set_created:
        i += 1
        data_f = TRAIN_FILE%(i)
        ys_f = TRAIN_YS_FILE%(i)
        pats_f = TRAIN_PATS_FILE%(i)
        portion = train_items[:TRAIN_SAMPLES_PER_FILE]
        train_items = train_items[TRAIN_SAMPLES_PER_FILE:]
    else:
        portion = train_items[:VALIDATION_SIZE]
        train_items = train_items[VALIDATION_SIZE:]
        data_f = VALIDATION_FILE
        ys_f = VALIDATION_YS_FILE
        pats_f = VALIDATION_PATS_FILE
        validation_set_created = True
    
    samples = np.memmap(
        data_f, 
        dtype=np.float32, 
        mode='w+', 
        shape=(len(portion),) + MSG_SHAPE_OUT
    )
    
    ys = np.zeros([len(portion), 2], dtype=np.float32)
    pats = np.zeros([len(portion), 3], dtype=np.float32)
    
    s = 0

    for x in tqdm(portion):
        fid = x[0]
        y = [x[1], 1. - x[1]]
        p = PATIENT_FEATURES[x[2]]
        d = np.load(MSGS_DIR + '/' + fid + '.npy').reshape(MSG_SHAPE_OUT)
        samples[s] = d
        ys[s] = y
        pats[s] = p
        s += 1
        
    print 'flushing... '
    samples.flush()
    
    np.save(ys_f, ys)
    np.save(pats_f, pats)

    print 'Created ', data_f, ys_f, pats_f; time.sleep(0.5)

100%|██████████| 1000/1000 [00:00<00:00, 4665.38it/s]


flushing... 
Created  out/val-msgs.mem out/val-ys.npy out/val-pats.npy


100%|██████████| 5042/5042 [00:01<00:00, 4795.73it/s]


flushing... 
Created  out/train-msgs-1.mem out/train-ys-1.npy out/train-pats-1.npy


In [32]:
# Number of label rows in the first training file, times 16
# (NOTE(review): the meaning of the factor 16 is not evident here - confirm).
len(np.load('out/train-ys-1.npy')) * 16

80672

In [33]:
# Same figure computed from `samples`, the memmap left over from the last
# iteration of the dataset-writing loop.
len(samples) * 16

80672

In [None]:
# Read the validation data file back as a flat float32 array, restore the
# per-sample layout and show the resulting shape. np.fromfile() needs at least
# the file argument; called without it, it raises TypeError.
np.fromfile(VALIDATION_FILE, dtype=np.float32).reshape((-1,) + MSG_SHAPE_OUT).shape