In [1]:
import os
import re
import time
import keras
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [27]:
# Seed for numpy's random generator (used when shuffling the training items).
RND_SEED = 777

# Directory holding one .npy bottleneck array per sample.
BOTTLENECKS_DIR = 'out/bottlenecks'

# Length of one sample row: 3 one-hot patient values followed by the flattened
# bottleneck array, which must therefore contain 16 * 32 values.
SAMPLE_SIZE = 3 + 16 * 32

# Training samples are split over numbered files of at most this many rows.
TRAIN_SAMPLES_PER_FILE = 10000
TRAIN_FILE = 'out/train-samples-%d.mem'
# NOTE: despite the .npy extension the labels are written with ndarray.tofile(),
# i.e. as raw float32 bytes without an .npy header.
TRAIN_YS_FILE = 'out/train-ys-%d.npy'

# The first VALIDATION_SIZE shuffled training items form the validation split.
VALIDATION_SIZE = 604
VALIDATION_FILE = 'out/val-samples.mem'
VALIDATION_YS_FILE = 'out/val-ys.mem'

# Unlabelled (eval) samples are written to numbered files as well. Both names
# are used by the eval-writing cell; the file pattern matches the path that
# cell reported ('out/eval-samples-1.mem').
# NOTE(review): the per-file size is an assumption mirroring
# TRAIN_SAMPLES_PER_FILE; the recorded run wrote all 6126 eval rows to a single
# file, so the original value was at least 6126 - confirm.
EVAL_SAMPLES_PER_FILE = 10000
EVAL_FILE = 'out/eval-samples-%d.mem'

# One-hot encoding of the patient id parsed from the file name.
PATIENTS_ENCODING = {
    '1': [1.,0.,0.],
    '2': [0.,1.,0.],
    '3': [0.,0.,1.]
}

In [3]:
# Seed numpy's global random generator with the configured seed.
np.random.seed(seed=RND_SEED)

In [4]:
# list files in directories
def list_files(src_dirs):
    """Recursively collect the paths of all files below one or more directories.

    Args:
        src_dirs: a single directory path, or a list/tuple of directory paths.

    Returns:
        A list of file paths built as ``dirpath + '/' + filename``. Results are
        grouped by source directory in the order given; inside each directory
        tree the traversal is sorted by name, so the returned order does not
        depend on the filesystem's listing order.
    """
    # Anything that is not already a list/tuple is treated as one directory.
    if not isinstance(src_dirs, (list, tuple)):
        src_dirs = [src_dirs]

    found = []

    for src_dir in src_dirs:
        for dirpath, dirnames, filenames in os.walk(src_dir):
            # Sorting dirnames in place makes the (top-down) walk descend into
            # sub-directories in name order.
            dirnames.sort()
            found.extend(dirpath + '/' + name for name in sorted(filenames))

    return found

In [23]:
# create list of fids+ys for training
# A file name must end in "<patient>_<number>.npy" (no label -> eval item) or
# "<patient>_<number>_<y>.npy" with a single-digit label y (-> training item).
fid_pattern = re.compile(r'((\d+)_(\d+)(?:_(\d))?)\.npy$')

train_items = []
eval_items = []
skipped_files = []

for f in tqdm(list_files(BOTTLENECKS_DIR)):

    m = fid_pattern.search(f)
    if m is None:
        # Not a bottleneck file (e.g. a stray hidden/OS file): remember it and
        # move on instead of failing with an IndexError.
        skipped_files.append(f)
        continue

    # group 1: file id (name without extension), group 2: patient id,
    # group 4: label digit, None when the name carries no label.
    fid, patient, label = m.group(1), m.group(2), m.group(4)

    if label is None:
        # test
        eval_items.append({'fid': fid, 'patient': patient})
    else:
        # train
        train_items.append({'fid': fid, 'y': float(label), 'patient': patient})

if skipped_files:
    print('Skipped %d file(s) not matching the bottleneck name pattern' % len(skipped_files))

# Shuffle with a dedicated generator seeded from RND_SEED so that re-running
# this cell yields the same order (and hence the same validation split) no
# matter how much of the global random state earlier cells have consumed.
np.random.RandomState(RND_SEED).shuffle(train_items)

100%|██████████| 12168/12168 [00:00<00:00, 355156.13it/s]


In [None]:
# create dataset files
# Each message is formatted into a single string so that the output is the same
# under the Python 2 print statement and the Python 3 print function.
print('Total training items: %d \n' % len(train_items))
print('Total eval items: %d \n' % len(eval_items))
# Short pause after printing; presumably to keep this text from interleaving
# with the progress bars of the following cells - confirm.
time.sleep(0.5)

In [7]:
# Plan the output files: the first VALIDATION_SIZE items form the validation
# split, the remaining items go to numbered training files holding at most
# TRAIN_SAMPLES_PER_FILE rows each. train_items is only sliced, never consumed,
# so this cell can be re-run without rebuilding the list.
outputs = []
if train_items:
    outputs.append((VALIDATION_FILE, VALIDATION_YS_FILE, train_items[:VALIDATION_SIZE]))

remaining = train_items[VALIDATION_SIZE:]
for i, start in enumerate(range(0, len(remaining), TRAIN_SAMPLES_PER_FILE), 1):
    outputs.append((
        TRAIN_FILE % i,
        TRAIN_YS_FILE % i,
        remaining[start:start + TRAIN_SAMPLES_PER_FILE],
    ))

for data_f, ys_f, portion in outputs:

    # Samples are written straight to disk through a float32 memory map with
    # one row of SAMPLE_SIZE values per item.
    samples = np.memmap(
        data_f,
        dtype=np.float32,
        mode='w+',
        shape=(len(portion), SAMPLE_SIZE)
    )

    # Two label columns per item: [y, 1 - y].
    ys = np.zeros([len(portion), 2], dtype=np.float32)

    for s, x in enumerate(tqdm(portion)):

        patient = PATIENTS_ENCODING[x['patient']]
        bottlenecks = np.load(BOTTLENECKS_DIR + '/' + x['fid'] + '.npy')

        # Row layout: one-hot patient encoding followed by the flattened bottlenecks.
        samples[s] = np.concatenate((patient, bottlenecks.flatten())).astype(np.float32)
        ys[s] = [x['y'], 1. - x['y']]

    print('flushing... ')
    samples.flush()
    # Drop the reference so the memory map is released before the next file.
    del samples
    # Labels are stored as raw float32 bytes (no .npy header), whatever the
    # extension of ys_f says.
    ys.tofile(ys_f)
    print('Created  %s %s' % (data_f, ys_f))
    time.sleep(0.5)

100%|██████████| 604/604 [00:00<00:00, 5873.03it/s]


flushing... 
Created  out/val-msgs.mem out/val-ys.mem


100%|██████████| 5438/5438 [00:00<00:00, 5882.08it/s]


flushing... 
Created  out/train-msgs-1.mem out/train-ys-1.npy


In [28]:
# Write the unlabelled (eval) samples to numbered files of at most
# EVAL_SAMPLES_PER_FILE rows and record the file id of every row, in order.
# EVAL_FILE and EVAL_SAMPLES_PER_FILE must be defined in the configuration cell.
# eval_items is only sliced, never consumed, so this cell can be re-run.

# Destination of the row -> file id lookup saved at the end of this cell.
EVAL_FIDS_FILE = 'out/eval-fids.npy'

eval_fids = []

for i, start in enumerate(range(0, len(eval_items), EVAL_SAMPLES_PER_FILE), 1):

    data_f = EVAL_FILE % i
    portion = eval_items[start:start + EVAL_SAMPLES_PER_FILE]

    # Samples are written straight to disk through a float32 memory map with
    # one row of SAMPLE_SIZE values per item.
    samples = np.memmap(
        data_f,
        dtype=np.float32,
        mode='w+',
        shape=(len(portion), SAMPLE_SIZE)
    )

    for s, x in enumerate(tqdm(portion)):

        fid = x['fid']
        patient = PATIENTS_ENCODING[x['patient']]

        bottlenecks = np.load(BOTTLENECKS_DIR + '/' + fid + '.npy')
        # Row layout: one-hot patient encoding followed by the flattened bottlenecks.
        samples[s] = np.concatenate((patient, bottlenecks.flatten())).astype(np.float32)
        eval_fids.append(fid)

    print('flushing... ')
    samples.flush()
    # Drop the reference so the memory map is released before the next file.
    del samples
    print('Created  %s' % data_f)
    time.sleep(0.5)

# Convert and save once, after all files are written: converting inside the
# loop would turn eval_fids into an array and break .append() on the next file.
# Entry k is the file id of row k across the eval files taken in order.
eval_fids = np.array(eval_fids, dtype=str)
np.save(EVAL_FIDS_FILE, eval_fids)
print('Created  %s' % EVAL_FIDS_FILE)

100%|██████████| 6126/6126 [00:01<00:00, 3445.38it/s]


flushing... 
Created  out/eval-samples-1.mem


In [None]:
# Read the validation labels back as a sanity check. They were written with
# ndarray.tofile() as raw float32 values, two per row ([y, 1 - y]), so the
# dtype and row width have to be supplied again when reading.
val_ys_readback = np.fromfile(VALIDATION_YS_FILE, dtype=np.float32).reshape(-1, 2)
val_ys_readback.shape