In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split

In [2]:
import lib

In [3]:
# Single seed shared by the RNGs seeded here; it is also passed to train_test_split later on.
RND = 123
np.random.seed(RND)  # numpy's global RNG
import random
random.seed(RND)  # stdlib RNG

In [5]:
# Identifier of this run; it becomes the last path component of both output directories.
RUN = 'D'

MODELS_ROOT = '/d3/caches/kaggle-mls-v5/models/'
TFB_ROOT = '/tmp-persistent/mls5/'

# model checkpoints are written here
MODELS_DIR = MODELS_ROOT + RUN
# TensorBoard logs are written here
TFB_DIR = TFB_ROOT + RUN

In [6]:
# fraction of the labelled samples held out for validation
VAL_SIZE = 0.1

# directory holding one cached '<file>.msgs.mem' array per input file
MSGS_CACHE_DIR = 'out/msgs-256-processed'

# signal parameters
N_CHANNELS = 16
N_SAMPLES = 240000
SAMPLING_RATE = 400

# spectrogram parameters
N_MELS = 256
N_FFT = 512
DESIRED_MSG_W = 256
# hop length derived from the signal length and the desired spectrogram width
HOP_LEN = N_SAMPLES // (DESIRED_MSG_W - 1) + 1

# set to True to run the cache-warming cell
WARM_CACHE = False

# Echo the derived hop length as a sanity check.
print 'HOP_LEN', HOP_LEN

HOP_LEN 942


In [7]:
# Create whichever of the output directories does not exist yet.
for out_dir in (MSGS_CACHE_DIR, MODELS_DIR, TFB_DIR):
    if os.path.isdir(out_dir):
        continue
    os.makedirs(out_dir)

In [8]:
# Load the list of input files; rows are indexed by the 'file' column.
input_df = pd.read_csv('out/input_files.csv', index_col='file')

In [9]:
# Split the input files into train/test sets: rows whose 'class' is -1 go to
# test_df, every other row goes to train_df.
is_test_row = input_df['class'] == -1
train_df = input_df[~is_test_row]
test_df = input_df[is_test_row]

In [10]:
# Parallel collections describing the labelled files, all in train_df row order.
patient_cols = ['patient_1', 'patient_2', 'patient_3']

# file names (the frame's index)
X_trainval_files = train_df.index.tolist()

# one row per file, one float32 column per patient_* column
stacked_patients = np.vstack([train_df[col] for col in patient_cols])
X_trainval_patients = stacked_patients.T.astype(np.float32)

# class value of every file as float32
y_trainval = train_df['class'].values.astype(np.float32)

In [11]:
def gen_batch(X_files=None, X_patients=None, y=None, start_ix=0, n_samples=1, \
              silent=True, compute_means=False, means_per_ch=None, div_by=None):
    """Assemble one batch of spectrogram tensors, patient features and one-hot labels.

    Samples are taken from consecutive positions of ``X_files`` starting at
    ``start_ix``; positions wrap around the end of the list.

    For every file the per-channel spectrograms are read from
    ``MSGS_CACHE_DIR/<file>.msgs.mem`` when that file exists. Otherwise they are
    computed with ``lib.read_mat`` / ``lib.compute_msg`` (presumably a
    mel-spectrogram per channel -- confirm in ``lib``), optionally normalised,
    and written to the cache as float32.

    Parameters
    ----------
    X_files : sequence of str
        File names; each must be an index label of ``input_df``.
    X_patients : 2-D array
        Patient features, one row per entry of ``X_files``.
    y : sequence
        One label per entry of ``X_files``; a truthy label becomes ``[1., 0.]``,
        a falsy one ``[0., 1.]``.
    start_ix : int
        Position of the first sample of the batch.
    n_samples : int
        Number of samples in the batch.
    silent : bool
        When False the sample loop is wrapped in a tqdm progress bar.
    compute_means : bool
        When True, also return the per-channel mean of every sample.
    means_per_ch : array or None
        Subtracted (broadcast over the channel axis) from a freshly computed sample.
    div_by : number or None
        Divisor applied to a freshly computed sample, after the subtraction.

    Returns
    -------
    tuple
        ``(X_msgs_batch, X_patients_batch, y_batch, means)`` with shapes
        ``(n_samples, N_MELS, DESIRED_MSG_W, N_CHANNELS)``,
        ``(n_samples, X_patients.shape[1])``, ``(n_samples, 2)`` and
        ``(n_samples, N_CHANNELS)``; ``means`` is None unless ``compute_means``.

    Notes
    -----
    ``means_per_ch`` / ``div_by`` only affect files that are not cached yet: a
    cached file is returned exactly as stored, so the cache content is fixed by
    the arguments of the call that first created it.
    """
    X_msgs_batch = np.zeros((n_samples, N_MELS, DESIRED_MSG_W, N_CHANNELS), dtype=np.float32)
    X_patients_batch = np.zeros((n_samples, X_patients.shape[1]), dtype=np.float32)
    y_batch = np.zeros([n_samples, 2], dtype=np.float32)

    means = np.zeros([n_samples, N_CHANNELS], dtype=np.float32) if compute_means else None

    r = range(n_samples) if silent else tqdm(range(n_samples))

    for i in r:
        # wrap around so a batch may run past the end of the file list
        ii = (i + start_ix) % len(X_files)

        mat_f = X_files[ii]
        mat_cache_fp = MSGS_CACHE_DIR + '/' + mat_f + '.msgs.mem'

        # check if msgs are cached
        if not os.path.isfile(mat_cache_fp):

            mat_fp = input_df.loc[mat_f, 'path']
            waves = lib.read_mat(mat_fp)

            msgs = np.zeros((N_CHANNELS, N_MELS, DESIRED_MSG_W), dtype=np.float32)

            for ch in range(N_CHANNELS):
                msgs[ch] = lib.compute_msg(waves[ch], \
                      desired_msg_w=DESIRED_MSG_W, hop_length=HOP_LEN, \
                      n_fft=N_FFT, n_mels=N_MELS, sr=SAMPLING_RATE)

            # move channel axis: (ch, h, w) -> (h, w, ch)
            sample = np.transpose(msgs, (1, 2, 0))

            # Normalise this sample only. Touching the whole batch here would
            # re-normalise every sample already stored in it.
            if means_per_ch is not None:
                sample = np.subtract(sample, means_per_ch)

            if div_by is not None:
                sample = np.divide(sample, div_by)

            # assigning into the float32 batch also guarantees the dtype that
            # the cache reader below expects
            X_msgs_batch[i] = sample

            # save to cache
            X_msgs_batch[i].tofile(mat_cache_fp)

        else:

            X_msgs_batch[i] = np.fromfile(mat_cache_fp, dtype=np.float32).\
                reshape((N_MELS, DESIRED_MSG_W, N_CHANNELS))

        # per-channel mean of this sample (channels are the last axis)
        if compute_means:
            means[i] = np.mean(X_msgs_batch[i].reshape(-1, N_CHANNELS), axis=0)

        X_patients_batch[i] = X_patients[ii]
        y_batch[i] = [1., 0.] if y[ii] else [0., 1.]

    return X_msgs_batch, X_patients_batch, y_batch, means

In [12]:
# warm cache
# When WARM_CACHE is set, gen_batch is called once per file listed in input_df so that
# every file missing from MSGS_CACHE_DIR gets computed, normalised and written there.
# Files that are already cached are only read back.
if WARM_CACHE:
    
    means = []
    
    # per-channel means saved by an earlier run of the commented-out lines at the end of this cell
    means_per_ch = np.load('out/means_per_ch.npy')

    X_files = input_df.index.tolist()
    X_patients = np.vstack((input_df['patient_1'], input_df['patient_2'], input_df['patient_3']))\
        .T.astype(np.float32)
    # placeholder labels: gen_batch needs a y argument, but its label output is not used here
    y = np.zeros((len(X_files), 1), dtype=np.float32)
    
    start = 0
    stop = len(X_files)

    # one file per call, so normalisation and caching are done sample by sample
    for i in tqdm(xrange(start, stop)):
        _msgs, _patients, _ys, _means = \
            gen_batch(X_files, X_patients, y, start_ix=i, n_samples=1, \
                      silent=True, compute_means=True, \
                     means_per_ch=means_per_ch, div_by=255.)
#         means.append(_means[0])

#     means_per_ch = np.mean(means, axis=0)
#     np.save('out/means_per_ch.npy', means_per_ch)

In [13]:
# Mini-batch size and number of training epochs, in that order.
TRAIN_N_PER_BATCH, TRAIN_N_EPOCHS = 16, 111

In [14]:
# position in the training set of the next batch produced by train_generator
start_ix = 0

# training data generator
def train_generator():
    """Yield training batches forever, walking through the training set in order.

    Every item is ``([spectrograms, patient_features], one_hot_labels)`` built
    by ``gen_batch`` from ``TRAIN_N_PER_BATCH`` consecutive samples. The
    module-level cursor ``start_ix`` is advanced by the number of samples just
    consumed and wraps at the end of the training set, so consecutive batches
    neither overlap nor skip samples.

    NOTE(review): reads the globals ``X_train_files``, ``X_train_patients`` and
    ``y_train``; ``X_train_files`` is not assigned in any cell visible here and
    must be defined before the generator is used.
    """
    global start_ix

    while True:

        b = gen_batch(
            X_train_files, X_train_patients, y_train,
            start_ix=start_ix, n_samples=TRAIN_N_PER_BATCH
        )

        # move on by exactly one batch; gen_batch wraps indices the same way,
        # so the modulo only keeps the cursor from growing without bound
        start_ix = (start_ix + TRAIN_N_PER_BATCH) % len(X_train_files)

        yield [b[0], b[1]], b[2]

In [15]:
# create model

In [16]:
import keras
from keras.models import *
from keras.layers import *

Using TensorFlow backend.


In [17]:
# Spectrogram input: (height, width, channels).
input_msg = Input(shape=(N_MELS, DESIRED_MSG_W, N_CHANNELS), name='input_msg')

# Six stages, each made of two 3x3 'same'-padded ReLU convolutions followed by a
# 2x2 max-pooling with stride 2; only the number of filters changes per stage.
x = input_msg
for n_filters in (64, 128, 256, 512, 512, 512):
    for conv_ix in range(2):
        x = Convolution2D(n_filters, 3, 3, border_mode='same', activation='relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)

# flattened convolutional features
conv_out = Flatten()(x)

In [18]:
# Second model input: a 3-value vector, matching the three patient_* columns fed as X_*_patients.
input_patient = Input(shape=(3,), name='input_patient')

In [19]:
# Concatenate the flattened convolutional features with the patient vector ...
x = merge([conv_out, input_patient], mode='concat')
# ... and classify with a single 2-unit softmax layer (one unit per one-hot label slot).
x = Dense(2, activation='softmax')(x)

In [20]:
# Two-input model; the input order here is the order in which arrays must be passed to fit/predict.
model = Model(input=[input_msg, input_patient], output=[x])

In [21]:
# NOTE(review): the output is a 2-way softmax trained on one-hot targets; 'categorical_crossentropy'
# is the conventional loss for that setup -- confirm 'binary_crossentropy' is intended.
model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

In [22]:
# Print the layer-by-layer summary of the model.
model.summary()
# del model

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_msg (InputLayer)           (None, 256, 256, 16)  0                                            
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 256, 256, 64)  9280        input_msg[0][0]                  
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 256, 256, 64)  36928       convolution2d_1[0][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 128, 128, 64)  0           convolution2d_2[0][0]            
___________________________________________________________________________________________

In [23]:
from keras.utils.visualize_util import plot
# Write a diagram of the model graph, including tensor shapes, to model.png.
plot(model, to_file='model.png', show_shapes=True)

In [24]:
from keras.callbacks import *

#### Callbacks

In [25]:
from sklearn import metrics
import gc

# validation AUC values, appended to by MyCallback in chronological order
scores = []

def score_auc():
    """Return the ROC AUC of ``model`` on the complete validation set.

    Column 0 of the one-hot labels is used as the ground truth and column 0 of
    the predictions as the score. Reads the globals ``model``, ``X_val_msgs``,
    ``X_val_patients`` and ``y_val``.
    """
    n_val = len(X_val_msgs)
    val_inputs = [X_val_msgs[:n_val], X_val_patients[:n_val]]
    predicted = model.predict(val_inputs, verbose=False)
    expected = y_val[:n_val]
    return metrics.roc_auc_score(expected[:, 0], predicted[:, 0])

class MyCallback(keras.callbacks.Callback):
    def _validate(self):
        s = score_auc()
        scores.append(s)
        print "\n\n AUC = %.5f\n"%s; time.sleep(.5)
    def on_train_begin(self, epoch, logs={}):
        self._validate()
    def on_epoch_end(self, epoch, logs={}):
        self._validate()
        gc.collect()

In [26]:
from sklearn.utils import shuffle
def cb_shuffle_train_data(batch, logs):
    """Reorder the three global training collections with one shared permutation.

    ``X_train_files``, ``X_train_patients`` and ``y_train`` are rebound to their
    shuffled versions, so row ``k`` of each still describes the same sample.
    Both arguments are accepted for the callback signature and ignored.
    """
    global X_train_files, X_train_patients, y_train
    reordered = shuffle(X_train_files, X_train_patients, y_train)
    X_train_files, X_train_patients, y_train = reordered

#### Train

In [None]:
# generate trainval set
# Loads every labelled sample into memory in a single batch (with a progress bar).
# NOTE: this rebinds its own inputs. y_trainval goes in as the 1-D class vector and comes
# back as the (n, 2) one-hot array built by gen_batch, and X_trainval_patients is replaced
# by the batch copy -- so this cell is not meant to be run twice without re-running the
# cell that builds the original arrays.
X_trainval_msgs, X_trainval_patients, y_trainval, _means = \
    gen_batch(X_trainval_files, X_trainval_patients, y_trainval, \
                n_samples=len(X_trainval_files), silent=False)

 34%|███▍      | 1621/4764 [00:22<00:42, 73.33it/s]

In [None]:
# Size of the float32 spectrogram tensor: 4 bytes per element, reported in units of 2**30 bytes.
print 'Memory for trainval set: %.2fG' % (X_trainval_msgs.size * 4. / pow(2, 30))

In [None]:
# Hold out a VAL_SIZE fraction of the samples for validation. The three arrays are split
# with the same row assignment, and the fixed seed makes the split repeatable.
(X_train_msgs, X_val_msgs,
 X_train_patients, X_val_patients,
 y_train, y_val) = train_test_split(
    X_trainval_msgs, X_trainval_patients, y_trainval,
    test_size=VAL_SIZE, random_state=RND)

In [33]:
# Training uses the split arrays from here on: drop the name bound to the full spectrogram
# tensor and trigger a garbage collection.
del X_trainval_msgs
import gc
gc.collect()

In [None]:
# train

# One checkpoint file per epoch; the file name records the epoch number and the
# training/validation loss and accuracy.
checkpoint_path = MODELS_DIR + \
    '/e{epoch:02d}-l={loss:.5f}-vl={val_loss:.5f}-a={acc:.5f}-va={val_acc:.5f}.h5'

fit_callbacks = [
    MyCallback(),
    TensorBoard(log_dir=TFB_DIR, histogram_freq=0),
    ModelCheckpoint(
        checkpoint_path,
        monitor='val_acc', verbose=0, mode='auto',
        save_best_only=False, save_weights_only=False
    ),
]

hist = model.fit(
    [X_train_msgs, X_train_patients],
    y_train,
    batch_size=TRAIN_N_PER_BATCH,
    nb_epoch=TRAIN_N_EPOCHS,
    validation_data=([X_val_msgs, X_val_patients], y_val),
    shuffle=True,
    verbose=True,
    callbacks=fit_callbacks
)

Train on 3573 samples, validate on 1191 samples
Epoch 1/111

In [34]:
# Overwrite, in place, all patient features of every sample with column 0 of that
# sample's one-hot label.
# NOTE(review): this puts the target into the model inputs -- it looks like a sanity-check
# experiment; confirm it is not applied to data used for real training.
n_rows = len(X_trainval_patients)
X_trainval_patients[:] = y_trainval[:n_rows, :1]

In [36]:
# Inspect the first 16 rows of the patient features.
X_trainval_patients[:16]

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 1.,  1.,  1.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]], dtype=float32)

In [37]:
# Inspect the first 16 one-hot label rows.
y_trainval[:16]

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]], dtype=float32)

In [21]:
for l in model.layers[:16]:
    print l.name
    l.trainable = False

input_msg
convolution2d_1
convolution2d_2
maxpooling2d_1
convolution2d_3
convolution2d_4
maxpooling2d_2
convolution2d_5
convolution2d_6
maxpooling2d_3
convolution2d_7
convolution2d_8
maxpooling2d_4
convolution2d_9
convolution2d_10
maxpooling2d_5
