In [1]:
import os
import re
import time
import keras
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Seed passed to np.random.seed() before the training items are shuffled.
RND_SEED = 12345

# Last two dimensions of every waveform memmap written by this notebook:
# shape = (items, N_CHANNELS, N_SAMPLES).
# NOTE(review): presumably samples-per-channel and channel count of one
# recording -- confirm against whatever produced the files in WAVEFORMS_DIR.
N_SAMPLES = 240000
N_CHANNELS = 16

# Directory that is scanned for '<a>_<b>_<c>.npy' waveform files.
WAVEFORMS_DIR = 'out/waveforms'
# Maximum number of items stored in a single training file pair.
SAMPLES_PER_FILE = 1375
# Output templates; %d is filled with the 1-based index of the training file.
TRAIN_WAVEFORMS_FILE = 'out/train-waveforms-%d.mem'
# NOTE(review): despite the '.npy' suffix this file is written with
# ndarray.tofile(), i.e. raw float32 values with no .npy header.
TRAIN_YS_FILE = 'out/train-ys-%d.npy'
# Number of shuffled items taken off the front for the validation set.
VALIDATION_SIZE = 642
# Output paths of the validation waveforms and labels.
VALIDATION_WAVEFORMS_FILE = 'out/val-waveforms.mem'
VALIDATION_YS_FILE = 'out/val-ys.mem'

In [3]:
# Seed NumPy's global RNG so that the np.random.shuffle() of the training
# items further down gives the same permutation for the same input order.
np.random.seed(RND_SEED)

In [4]:
# list files in directories
def list_files(src_dirs):
    """Recursively list every file found below one or more directories.

    Args:
        src_dirs: a single directory path, or a list/tuple of directory
            paths.

    Returns:
        A flat list of file paths. Each path is the directory reported by
        os.walk joined with the file name. Directories are processed in the
        order given; within a directory tree the order is whatever os.walk
        yields.
    """
    # Wrap a bare path so both call styles share the loop below. isinstance
    # (rather than an exact type comparison) also accepts tuples and list
    # subclasses instead of handing the whole container to os.walk.
    if not isinstance(src_dirs, (list, tuple)):
        src_dirs = [src_dirs]

    files = []

    for d in src_dirs:
        for dirpath, _dirnames, filenames in os.walk(d):
            files.extend(os.path.join(dirpath, name) for name in filenames)

    return files

In [5]:
# create list of fids+ys for training
#
# Every file under WAVEFORMS_DIR whose name ends in '<a>_<b>_<c>.npy' becomes
# one [fid, y] entry: fid is the '<a>_<b>_<c>' stem and y is the third number
# converted to float. Files that do not match the pattern are skipped.
train_items = []

# os.walk reports directory entries in arbitrary (filesystem-dependent) order.
# Sorting first gives the shuffle below a fixed starting order, so the same
# RNG state always produces the same permutation and therefore the same split.
for f in tqdm(sorted(list_files(WAVEFORMS_DIR))):

    m = re.search(r'(\d+)_(\d+)_(\d+)\.npy$', f)

    if m is not None:
        fid = "%s_%s_%s" % m.groups()
        y = float(m.group(3))
        train_items.append([fid, y])

# In-place shuffle driven by NumPy's global RNG.
np.random.shuffle(train_items)

100%|██████████| 12168/12168 [00:00<00:00, 430940.56it/s]


In [6]:
# create dataset files
#
# The shuffled `train_items` are split into one validation portion (the first
# VALIDATION_SIZE items) followed by training portions of at most
# SAMPLES_PER_FILE items each. Every portion produces two files: a float32
# memmap of shape (items, N_CHANNELS, N_SAMPLES) holding the waveforms, and a
# raw float32 dump of the (items, 2) label matrix.

# Single-string print(...) calls emit the same text under Python 2 and 3.
print('Total training items: %d \n' % len(train_items)); time.sleep(0.5)

# Plan every output file up front by slicing. `train_items` itself is left
# untouched, so re-running this cell regenerates the same files instead of
# finding an already-consumed (empty) list.
portions = []

if len(train_items):
    portions.append((
        VALIDATION_WAVEFORMS_FILE,
        VALIDATION_YS_FILE,
        train_items[:VALIDATION_SIZE],
    ))

i = 0  # 1-based index of the most recently planned training file

for start in range(VALIDATION_SIZE, len(train_items), SAMPLES_PER_FILE):
    i += 1
    portions.append((
        TRAIN_WAVEFORMS_FILE % (i),
        TRAIN_YS_FILE % (i),
        train_items[start:start + SAMPLES_PER_FILE],
    ))

for waveforms_f, ys_f, portion in portions:

    # mode='w+' creates (or overwrites) the backing file on disk.
    waves = np.memmap(
        waveforms_f,
        dtype=np.float32,
        mode='w+',
        shape=(len(portion), N_CHANNELS, N_SAMPLES)
    )

    ys = np.zeros([len(portion), 2], dtype=np.float32)

    for w, (fid, label) in enumerate(tqdm(portion)):
        # NOTE(review): assumes each stored array is (N_CHANNELS, N_SAMPLES);
        # NumPy would silently broadcast a smaller compatible shape -- confirm.
        waves[w] = np.load(WAVEFORMS_DIR + '/' + fid + '.npy')
        # Column 0 is the label parsed from the file name, column 1 is its
        # complement (1 - label).
        ys[w] = [label, 1. - label]

    print('flushing... ')
    waves.flush()
    # Drop the reference so the mapping is released before the next (large)
    # file is mapped.
    del waves

    # ndarray.tofile writes the raw values with no header, so these label
    # files (including the ones named '*.npy') must be read back with
    # np.fromfile / np.memmap and a known dtype and shape, not np.load.
    ys.tofile(ys_f)
    print('Created  %s %s' % (waveforms_f, ys_f)); time.sleep(0.5)

Total training items: 6042 



100%|██████████| 642/642 [00:42<00:00, 15.17it/s]


flushing... 
Created  out/val-waveforms.mem out/val-ys.mem


100%|██████████| 1375/1375 [01:53<00:00, 12.06it/s]


flushing... 
Created  out/train-waveforms-1.mem out/train-ys-1.npy


100%|██████████| 1375/1375 [01:52<00:00, 12.24it/s]


flushing... 
Created  out/train-waveforms-2.mem out/train-ys-2.npy


100%|██████████| 1375/1375 [01:55<00:00, 11.70it/s]


flushing... 
Created  out/train-waveforms-3.mem out/train-ys-3.npy


100%|██████████| 1275/1275 [01:41<00:00, 12.60it/s]


flushing... 
Created  out/train-waveforms-4.mem out/train-ys-4.npy
