In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split

In [2]:
import lib

In [3]:
import random

# One shared seed for every global random generator used by the notebook.
RND = 123

# Seed NumPy's global generator first, then the stdlib one.
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(RND)

In [4]:
# Label of the current training run; it names the per-run output directories.
RUN = 'C'

# Directory that receives the model checkpoints of this run.
MODELS_DIR = os.path.join('/d3/caches/kaggle-mls-v5/models/', RUN)

# Directory that receives the TensorBoard logs of this run.
TFB_DIR = os.path.join('/tmp-persistent/mls5/', RUN)

In [5]:
# Fraction of the labelled files held out for validation.
VAL_SIZE = 0.1

# Directory holding the per-file arrays cached by gen_batch.
MSGS_CACHE_DIR = 'out/msgs-256'

# Raw signal description: sampling rate, samples per recording, channel count.
SAMPLING_RATE, N_SAMPLES, N_CHANNELS = 400, 240000, 16

# Parameters forwarded to lib.compute_msg.
N_FFT, N_MELS = 512, 256
DESIRED_MSG_W = 256

# Hop length chosen so that N_SAMPLES spans DESIRED_MSG_W frames.
HOP_LEN = N_SAMPLES // (DESIRED_MSG_W - 1) + 1

# When True, the warm-cache cell pushes every input file through gen_batch.
WARM_CACHE = True

print(' '.join(['HOP_LEN', str(HOP_LEN)]))

HOP_LEN 942


In [6]:
# Create any output directory that does not exist yet.
for out_dir in (MSGS_CACHE_DIR, MODELS_DIR, TFB_DIR):
    if os.path.isdir(out_dir):
        continue
    os.makedirs(out_dir)

In [7]:
# Load the list of input files; the 'file' column becomes the index, so rows
# can be looked up by file name.
input_df = pd.read_csv('out/input_files.csv', index_col='file')

In [8]:
# Split the input files into labelled (train) and unlabelled (test) rows;
# a class of -1 marks a test row.
_is_test_row = input_df['class'] == -1
train_df = input_df[~_is_test_row]
test_df = input_df[_is_test_row]

In [9]:
# File names of all labelled recordings.
X_trainval_files = train_df.index.tolist()

# One row per file, one column per patient_* field.
_patient_columns = [train_df[col] for col in ('patient_1', 'patient_2', 'patient_3')]
X_trainval_patients = np.vstack(_patient_columns).T.astype(np.float32)

# Class labels as float32.
y_trainval = train_df['class'].values.astype(np.float32)

In [82]:
def _compute_msgs(mat_f):
    """Build the (N_MELS, DESIRED_MSG_W, N_CHANNELS) float32 array for one input file.

    Reads the raw signal via lib.read_mat and runs lib.compute_msg on each channel.
    """
    mat_fp = input_df.loc[mat_f, 'path']
    waves = lib.read_mat(mat_fp)

    msgs = np.zeros((N_CHANNELS, N_MELS, DESIRED_MSG_W), dtype=np.float32)
    for ch in range(N_CHANNELS):
        msgs[ch] = lib.compute_msg(waves[ch],
                                   desired_msg_w=DESIRED_MSG_W, hop_length=HOP_LEN,
                                   n_fft=N_FFT, n_mels=N_MELS, sr=SAMPLING_RATE)

    # move the channel axis last: (ch, h, w) -> (h, w, ch)
    return np.transpose(msgs, (1, 2, 0))


def _load_msgs(mat_f):
    """Return the (N_MELS, DESIRED_MSG_W, N_CHANNELS) array for one file, using the disk cache."""
    msgs_shape = (N_MELS, DESIRED_MSG_W, N_CHANNELS)
    mat_cache_fp = MSGS_CACHE_DIR + '/' + mat_f + '.msgs.mem'

    # check if msgs are cached
    if os.path.isfile(mat_cache_fp):
        cached = np.fromfile(mat_cache_fp, dtype=np.float32)
        if cached.size == int(np.prod(msgs_shape)):
            return cached.reshape(msgs_shape)
        # A cache file of the wrong size (e.g. left behind by an interrupted
        # write) is unusable: drop it and recompute below.
        os.remove(mat_cache_fp)

    msgs_t = np.ascontiguousarray(_compute_msgs(mat_f), dtype=np.float32)

    # Write to a temporary name and rename, so an interrupted run never leaves
    # a truncated file under the final cache name.
    tmp_fp = mat_cache_fp + '.tmp'
    msgs_t.tofile(tmp_fp)
    os.rename(tmp_fp, mat_cache_fp)

    return msgs_t


def gen_batch(X_files=None, X_patients=None, y=None, start_ix=0, n_samples=1, silent=True):
    """Assemble one batch of `n_samples` consecutive samples starting at `start_ix`.

    Indices wrap around the end of `X_files`. Per-file arrays are read from the
    cache in MSGS_CACHE_DIR and are computed and cached on a miss.

    Parameters:
        X_files: sequence of file names (index values of `input_df`).
        X_patients: 2-D array with one row of patient features per file.
        y: per-file labels; a truthy label becomes [1, 0], a falsy one [0, 1].
        start_ix: index of the first sample of the batch.
        n_samples: number of samples in the batch.
        silent: when False, show a tqdm progress bar.

    Returns:
        (X_msgs_batch, X_patients_batch, y_batch, means) with shapes
        (n_samples, N_MELS, DESIRED_MSG_W, N_CHANNELS), (n_samples, n_patient_features),
        (n_samples, 2) and (n_samples, N_CHANNELS); `means[i]` is the per-channel
        mean of sample i.
    """
    X_msgs_batch = np.zeros((n_samples, N_MELS, DESIRED_MSG_W, N_CHANNELS), dtype=np.float32)
    X_patients_batch = np.zeros((n_samples, X_patients.shape[1]), dtype=np.float32)
    y_batch = np.zeros([n_samples, 2], dtype=np.float32)
    means = np.zeros([n_samples, N_CHANNELS], dtype=np.float32)

    r = range(n_samples) if silent else tqdm(range(n_samples))

    for i in r:
        ii = (i + start_ix) % len(X_files)

        X_msgs_batch[i] = _load_msgs(X_files[ii])

        # Per-channel mean of THIS sample only, computed for cached and freshly
        # computed files alike; accumulate in float64 to limit rounding error.
        means[i] = X_msgs_batch[i].reshape(-1, N_CHANNELS).mean(axis=0, dtype=np.float64)

        X_patients_batch[i] = X_patients[ii]
        y_batch[i] = [1., 0.] if y[ii] else [0., 1.]

    return X_msgs_batch, X_patients_batch, y_batch, means

In [None]:
# warm cache: push every input file (train and test) through gen_batch one at
# a time, collecting the per-channel means it returns
if WARM_CACHE:

    X_files = input_df.index.tolist()
    _all_patient_columns = (input_df['patient_1'], input_df['patient_2'], input_df['patient_3'])
    X_patients = np.vstack(_all_patient_columns).T.astype(np.float32)
    # placeholder labels: gen_batch needs something to index
    y = np.zeros((len(X_files), 1), dtype=np.float32)

    start, stop = 0, len(X_files)
    means = []

    for i in tqdm(xrange(start, stop)):
        _single = gen_batch(X_files, X_patients, y, start_ix=i, n_samples=1, silent=True)
        # element 3 of the returned tuple holds the means, one row per sample
        means.append(_single[3][0])

 16%|█▌        | 1038/6672 [02:43<23:41,  3.96it/s]

In [90]:
# Average the per-file channel means collected by the warm-cache loop: one value per channel.
np.mean(means, axis=0)

array([ 85.0632782 ,  86.75190735,  81.09435272,  82.0920639 ,
        83.20228577,  85.52071381,  83.13315582,  81.75543213,
        82.01451874,  75.06038666,  79.26768494,  78.29553223,
        81.36355591,  81.65673065,  79.51850128,  77.41757202], dtype=float32)

In [None]:
# Scratch: fetch a single-sample batch for inspection. Relies on X_files,
# X_patients, y and the loop variable `i` left behind by the warm-cache cell.
b = gen_batch(X_files, X_patients, y, start_ix=i, n_samples=1, silent=True)

In [63]:
# Scratch: first sample of the batch's first element (the X_msgs_batch tensor).
mm = b[0][0]

In [81]:
# Scratch: .T reverses the axes so the last axis of `mm` comes first; each
# leading slice is then flattened and averaged.
np.mean(mm.T.reshape(mm.T.shape[0], -1), axis=1)

array([  93.77540588,   94.47314453,   95.14610291,   96.586586  ,
         99.23521423,   97.10848999,   93.72658539,   98.5067215 ,
         83.79309845,  126.31234741,   86.83683777,   84.59996796,
         90.30253601,   93.17021942,   86.09326935,   86.47769165], dtype=float32)

In [45]:
# Scratch: bring the last axis of the sample to the front, then average each
# leading slice.
a = np.transpose(b[0][0], (2, 0, 1))
print(a.shape)
print(a.reshape(a.shape[0], -1).mean(axis=1))

(16, 256, 256)
[ 99.34836578  95.00201416  93.32743835  88.01261902  91.46595001
  97.46484375  95.62434387  97.9276886   95.93054199  36.29848862
  88.26551056  84.56757355  86.61559296  90.9962616   85.22241974
  82.38331604]


In [12]:
# Split the labelled data into train and validation sets; VAL_SIZE is the
# validation fraction and RND fixes the shuffle.
(X_train_files, X_val_files,
 X_train_patients, X_val_patients,
 y_train, y_val) = train_test_split(
    X_trainval_files, X_trainval_patients, y_trainval,
    test_size=VAL_SIZE, random_state=RND)

In [13]:
# Number of samples per training batch.
TRAIN_N_PER_BATCH = 16

# Samples per epoch: the training-set size rounded down to a whole number of batches.
TRAIN_N_SAMPLES_PER_EPOCH = (len(X_train_files) // TRAIN_N_PER_BATCH) * TRAIN_N_PER_BATCH

# Number of epochs to train for.
TRAIN_N_EPOCHS = 111

print(' '.join(['TRAIN_N_SAMPLES_PER_EPOCH', str(TRAIN_N_SAMPLES_PER_EPOCH),
                'of', str(len(X_train_files))]))

TRAIN_N_SAMPLES_PER_EPOCH 4272 of 4287


In [14]:
# Generate the validation set in one batch.
# gen_batch returns four values (msgs, patients, one-hot labels, per-channel
# means); the means are not needed here but must still be unpacked, otherwise
# the assignment raises ValueError.
# NOTE: this rebinds X_val_patients and y_val (y_val becomes a two-column
# one-hot array), so the cell is not safe to run twice in a row.
X_val_msgs, X_val_patients, y_val, _val_means = \
    gen_batch(X_val_files, X_val_patients, y_val, n_samples=len(X_val_files), silent=False)

100%|██████████| 477/477 [00:16<00:00, 28.29it/s]


In [15]:
print 'Memory for val set: %.2fG' % (X_val_msgs.size * 4. / pow(2, 30))

Memory for val set: 7.45G


In [16]:
# Index of the first sample of the next training batch; advanced by train_generator.
start_ix = 0


# training data generator
def train_generator():
    """Yield training batches forever as ([msgs, patients], labels).

    Each batch holds TRAIN_N_PER_BATCH consecutive samples of the current
    X_train_files / X_train_patients / y_train globals (read afresh on every
    iteration, so rebinding them takes effect on the next batch). The position
    is kept in the global `start_ix`; gen_batch wraps it around the end of the
    training set.
    """
    global start_ix

    while True:

        b = gen_batch(
            X_train_files, X_train_patients, y_train,
            start_ix=start_ix, n_samples=TRAIN_N_PER_BATCH
        )

        # Advance by one batch so consecutive batches do not overlap. Stepping
        # by TRAIN_N_SAMPLES_PER_EPOCH made successive batches share a sample
        # and left part of the training set unvisited in each epoch.
        start_ix += TRAIN_N_PER_BATCH

        # element 3 of the tuple (per-channel means) is not fed to the model
        yield [b[0], b[1]], b[2]

In [17]:
# create model

In [18]:
import keras
from keras.models import *
from keras.layers import *

Using TensorFlow backend.


In [19]:
# Input tensor, one sample laid out as (N_MELS, DESIRED_MSG_W, N_CHANNELS).
input_msg = Input(shape=(N_MELS, DESIRED_MSG_W, N_CHANNELS), name='input_msg')

# Convolutional trunk: each stage is one 3x3 'same' ReLU convolution followed
# by a max-pool whose stride equals its window. Entries are (filters, pool window).
_CONV_STAGES = [
    (96, (2, 4)),
    (256, (2, 4)),
    (512, (2, 2)),
    (512, (2, 2)),
    (512, (2, 2)),
]

x = input_msg
for _n_filters, _pool in _CONV_STAGES:
    x = Convolution2D(_n_filters, 3, 3, border_mode='same', activation='relu')(x)
    x = MaxPooling2D(_pool, strides=_pool)(x)

# Flatten the final feature maps into one vector per sample.
conv_out = Flatten()(x)

In [20]:
# Second model input: a 3-element vector per sample (the patient_1..3 columns
# are what gen_batch is given as X_patients).
input_patient = Input(shape=(3,), name='input_patient')

In [21]:
# Concatenate the flattened conv features with the patient vector, then
# classify with a two-unit softmax layer.
_merged_features = merge([conv_out, input_patient], mode='concat')
x = Dense(2, activation='softmax')(_merged_features)

In [22]:
# Two-input model: (input_msg, input_patient) -> x, the softmax output defined above.
model = Model(input=[input_msg, input_patient], output=[x])

In [23]:
# Train with binary cross-entropy and RMSprop; accuracy is reported as an extra metric.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [24]:
# Print the layer-by-layer summary of the model.
model.summary()
# del model

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_msg (InputLayer)           (None, 256, 1024, 16) 0                                            
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 256, 1024, 96) 13920       input_msg[0][0]                  
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 128, 256, 96)  0           convolution2d_1[0][0]            
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 128, 256, 256) 221440      maxpooling2d_1[0][0]             
___________________________________________________________________________________________

In [25]:
# Render the model graph, including tensor shapes, to model.png.
from keras.utils.visualize_util import plot
plot(model, to_file='model.png', show_shapes=True)

#### Callbacks

In [26]:
from sklearn import metrics
import gc

# Validation AUC history, appended to by MyCallback.
scores = []


def score_auc():
    """Return the ROC AUC of the global `model` on the whole validation set.

    Column 0 of the one-hot labels and column 0 of the predictions are compared.
    """
    val_inputs = [X_val_msgs, X_val_patients]
    y_pred = model.predict(val_inputs, verbose=False)
    return metrics.roc_auc_score(y_val[:, 0], y_pred[:, 0])

class MyCallback(keras.callbacks.Callback):
    """Keras callback that scores the validation set before training and after every epoch."""

    def _validate(self):
        """Compute the validation AUC, append it to the global `scores` list and print it."""
        s = score_auc()
        scores.append(s)
        print("\n\n AUC = %.5f\n" % s)
        # short pause after printing (presumably to keep the message from
        # interleaving with the progress output; confirm before removing)
        time.sleep(.5)

    def on_train_begin(self, logs=None):
        """Score the untrained model once; Keras passes only `logs` to this hook."""
        self._validate()

    def on_epoch_end(self, epoch, logs=None):
        """Score the model after each epoch, then force a garbage collection."""
        self._validate()
        gc.collect()

In [27]:
from sklearn.utils import shuffle
def cb_shuffle_train_data(batch, logs):
    """Reshuffle the global training arrays in unison.

    Both arguments are ignored; they exist so the function can be used as a
    LambdaCallback hook.
    """
    global X_train_files, X_train_patients, y_train
    reshuffled = shuffle(X_train_files, X_train_patients, y_train)
    X_train_files, X_train_patients, y_train = reshuffled

#### Train

In [None]:
from keras.callbacks import *

In [None]:
# train

# Checkpoint file name template; the placeholders are filled in by ModelCheckpoint.
_checkpoint_fp = MODELS_DIR + \
    '/e{epoch:02d}-l={loss:.5f}-vl={val_loss:.5f}-a={acc:.5f}-va={val_acc:.5f}.h5'

_training_callbacks = [
    # validation AUC before training and after every epoch
    MyCallback(),
    # reshuffle the training arrays at the start of every epoch
    LambdaCallback(on_epoch_begin=cb_shuffle_train_data),
    TensorBoard(log_dir=TFB_DIR, histogram_freq=0),
    # save the full model after every epoch (not only the best one)
    ModelCheckpoint(
        _checkpoint_fp,
        monitor='val_acc', verbose=0, save_best_only=False,
        save_weights_only=False, mode='auto'
    ),
]

hist = model.fit_generator(
    train_generator(),
    samples_per_epoch=TRAIN_N_SAMPLES_PER_EPOCH,
    nb_epoch=TRAIN_N_EPOCHS,
    validation_data=([X_val_msgs, X_val_patients], y_val),
    verbose=True,
    max_q_size=10,
    nb_worker=1,
    pickle_safe=False,
    callbacks=_training_callbacks
)



 AUC = 0.53111

Epoch 1/111

 AUC = 0.50000

Epoch 2/111

 AUC = 0.50000

Epoch 26/111

 AUC = 0.50000

Epoch 27/111

 AUC = 0.50000

Epoch 28/111

 AUC = 0.50000

Epoch 29/111

 AUC = 0.50000

Epoch 30/111

In [None]:
# train
# hist = model.fit([X_val_msgs, X_val_patients], y_val, batch_size=1, nb_epoch=10)