In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split

In [2]:
# Single seed for the whole notebook: applied to NumPy's global RNG and to
# Python's `random` module here, and reused as `random_state` for the
# train/validation split further down.
RND = 111
np.random.seed(RND)
import random
random.seed(RND)

In [3]:
# Run identifier; it is appended to both output locations so that each run
# writes to its own directory.
RUN = 'B'
# Model checkpoints written by ModelCheckpoint during training.
MODELS_DIR = '/d3/caches/kaggle-mls-v6/models/' + RUN
# TensorBoard log directory.
TFB_DIR = '/tmp-persistent/mls6/' + RUN

In [4]:
# Number of recordings per training batch produced by train_generator().
TRAIN_N_PER_BATCH = 8
# Number of epochs passed to fit_generator (nb_epoch).
TRAIN_N_EPOCHS = 111

In [5]:
# Number of recordings per validation batch produced by val_generator().
VAL_N_PER_BATCH = 8
# Number of recordings held out for validation; used both as the split size
# and as nb_val_samples in fit_generator.
VAL_SIZE = 400

In [6]:
# Directory holding one raw float32 dump per recording, named '<file>.mem'.
WAVES_DIR = 'out/waveforms'

# NOTE(review): presumably Hz; not referenced by the code visible in this notebook.
SAMPLING_RATE = 400
# Samples per channel; N_CHANNELS * N_SAMPLES is the length of one flattened recording.
N_SAMPLES = 240000
N_CHANNELS = 16

# Number of rows each flattened recording is reshaped into before it is fed to
# the network (16 * 240000 / 9600 = 400 values per row).
N_TIMESTEPS = 9600

In [7]:
# Create the checkpoint and TensorBoard directories if they are missing.
for out_dir in (MODELS_DIR, TFB_DIR):
    if os.path.isdir(out_dir):
        continue
    os.makedirs(out_dir)

In [8]:
# Load the list of input files, indexed by file name. Columns (see the preview
# below): class, path and the patient_1..patient_3 indicator columns.
input_df = pd.read_csv('out/input_files.csv', index_col='file')

In [9]:
input_df.head()

Unnamed: 0_level_0,class,path,patient_1,patient_2,patient_3
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2_1288_0.mat,0,/datasets/kaggle/mls/train_2/2_1288_0.mat,0.0,1.0,0.0
3_725_0.mat,0,/datasets/kaggle/mls/train_3/3_725_0.mat,0.0,0.0,1.0
1_8_0.mat,0,/datasets/kaggle/mls/train_1/1_8_0.mat,1.0,0.0,0.0
2_1965_0.mat,0,/datasets/kaggle/mls/train_2/2_1965_0.mat,0.0,1.0,0.0
2_135_0.mat,0,/datasets/kaggle/mls/train_2/2_135_0.mat,0.0,1.0,0.0


In [10]:
# Split the input files into train/test sets: rows whose class is -1 go to the
# test set, every other row is available for training/validation.
train_df = input_df[input_df['class'] != -1]
test_df = input_df[input_df['class'] == -1]

In [11]:
# File names (the DataFrame index) of all labelled recordings.
X_trainval_files = train_df.index.tolist()
# Patient indicator columns stacked into an (n_files, 3) float32 matrix.
X_trainval_patients = np.vstack((train_df['patient_1'], train_df['patient_2'], train_df['patient_3']))\
    .T.astype(np.float32)
# Class label per file as float32 (gen_batch treats 1. as the positive class).
y_trainval = np.array(train_df['class'], dtype=np.float32)

In [12]:
def gen_batch(X_files=None, X_patients=None, y=None, start_ix=0, n_samples=1,
              silent=True, noise_scale=0.1):
    """Build one batch of model inputs and one-hot labels from the wave dumps.

    Samples are taken from consecutive positions starting at ``start_ix`` and
    wrap around (modulo ``len(X_files)``), so callers can keep incrementing
    ``start_ix`` indefinitely.

    Parameters
    ----------
    X_files : sequence of str
        File names; each recording is read from ``WAVES_DIR/<name>.mem`` as
        raw float32 values.
    X_patients : 2-D array
        One row of patient features per entry in ``X_files``.
    y : sequence
        One label per entry in ``X_files``; either a scalar or an array whose
        first element is the label.
    start_ix : int
        Position of the first sample of the batch.
    n_samples : int
        Number of samples in the batch.
    silent : bool
        When False, wrap the loop in a tqdm progress bar.
    noise_scale : float
        Width of the uniform noise added to every wave value; noise is drawn
        from ``[-noise_scale / 2, noise_scale / 2)``. A falsy value (0, None)
        disables the noise and draws no random numbers.

    Returns
    -------
    (X_waves_batch, X_patients_batch, y_batch) : tuple of float32 arrays
        Shapes ``(n_samples, N_TIMESTEPS, N_CHANNELS * N_SAMPLES // N_TIMESTEPS)``,
        ``(n_samples, X_patients.shape[1])`` and ``(n_samples, 2)``.
        Labels are one-hot: ``[1, 0]`` when the label equals 1, else ``[0, 1]``.
    """
    # Floor division keeps the dimension an int under true division as well
    # (np.zeros rejects float dimensions); the value is unchanged on Python 2.
    n_features = N_CHANNELS * N_SAMPLES // N_TIMESTEPS

    X_waves_batch = np.zeros((n_samples, N_TIMESTEPS, n_features), dtype=np.float32)
    X_patients_batch = np.zeros((n_samples, X_patients.shape[1]), dtype=np.float32)
    y_batch = np.zeros([n_samples, 2], dtype=np.float32)

    r = range(n_samples) if silent else tqdm(range(n_samples))

    for i in r:
        # Wrap around so start_ix may grow without bound.
        ii = (i + start_ix) % len(X_files)

        mat_f = X_files[ii]

        # waves: flat float32 dump of the whole recording
        wave_f = WAVES_DIR + '/' + mat_f + '.mem'
        waves = np.fromfile(wave_f, dtype=np.float32)

        # add noise (one uniform draw per wave value, same count as before)
        if noise_scale:
            waves += (np.random.random_sample(waves.shape) - .5) * noise_scale

        # Consecutive chunks of the flat recording become the rows of the sample.
        X_waves_batch[i] = waves.reshape(N_TIMESTEPS, -1)

        # patients
        X_patients_batch[i] = X_patients[ii]

        # ys: accept either a scalar label or an array whose first item is the label
        label = y[ii][0] if isinstance(y[ii], np.ndarray) else y[ii]
        y_batch[i] = [1., 0.] if label == 1. else [0., 1.]

    return X_waves_batch, X_patients_batch, y_batch

In [13]:
# Hold out VAL_SIZE recordings for validation. Files, patient features and
# labels are split together so the rows stay aligned; RND makes the split repeatable.
X_train_files, X_val_files, X_train_patients, X_val_patients, y_train, y_val = \
    train_test_split(X_trainval_files, X_trainval_patients, y_trainval, random_state=RND, test_size=VAL_SIZE)

In [14]:
# Samples per epoch, rounded down to a whole number of training batches.
# Floor division keeps this an int under true division too (same value on Python 2).
TRAIN_N_PER_EPOCH = (len(X_train_files) // TRAIN_N_PER_BATCH) * TRAIN_N_PER_BATCH

In [15]:
TRAIN_N_PER_EPOCH

4360

In [16]:
# Position of the next training sample; shared module-level state so that a
# newly created generator resumes where the previous one stopped.
start_ix_train = 0


def train_generator():
    """Yield ``([waves, patients], labels)`` training batches forever.

    Each batch holds TRAIN_N_PER_BATCH samples built by gen_batch; the global
    cursor ``start_ix_train`` is advanced by the batch length before the batch
    is yielded.
    """
    global start_ix_train

    while True:
        waves, patients, labels = gen_batch(
            X_train_files, X_train_patients, y_train,
            start_ix=start_ix_train, n_samples=TRAIN_N_PER_BATCH
        )
        start_ix_train += len(waves)
        yield [waves, patients], labels

In [17]:
# Position of the next validation sample; shared module-level state so that a
# newly created generator resumes where the previous one stopped.
start_ix_val = 0


def val_generator():
    """Yield ``([waves, patients], labels)`` validation batches forever.

    Each batch holds VAL_N_PER_BATCH samples built by gen_batch; the global
    cursor ``start_ix_val`` is advanced by the batch length before the batch
    is yielded.
    """
    global start_ix_val

    while True:
        waves, patients, labels = gen_batch(
            X_val_files, X_val_patients, y_val,
            start_ix=start_ix_val, n_samples=VAL_N_PER_BATCH
        )
        start_ix_val += len(waves)
        yield [waves, patients], labels

In [18]:
# create model

In [19]:
# Smoke test: build a single validation sample so the batch shapes can be inspected.
b = gen_batch(X_val_files, X_val_patients, y_val, start_ix=0, n_samples=1)

In [20]:
# Sanity check: the second axis of the wave batch should equal N_TIMESTEPS,
# i.e. this ratio should display 1.0.
b[0].shape[1] / float(N_TIMESTEPS)

1.0

In [21]:
import keras
from keras.models import *
from keras.layers import *
from keras.callbacks import *

Using TensorFlow backend.


In [22]:
# Total number of values in one flattened recording (all channels).
l = N_CHANNELS * N_SAMPLES
# Number of rows each recording is cut into (the model's time axis).
n_steps = N_TIMESTEPS
# Values per row (the model's feature axis); floor division keeps it an int.
step_l = l // n_steps
print(step_l)

400


In [23]:
# Convolutional feature extractor over the (n_steps, step_l) wave input:
# five stages, each made of two identical 'same'-padded ReLU convolutions
# followed by 4x max-pooling, then flatten + dropout.
model1 = Sequential()

conv_stages = [(64, 40), (128, 10), (128, 10), (256, 10), (256, 10)]  # (n_filters, kernel_width)
for stage_no, (n_filters, kernel_width) in enumerate(conv_stages):
    for conv_no in range(2):
        # Only the very first layer of the network declares the input shape.
        if stage_no == 0 and conv_no == 0:
            shape_kwargs = {'input_shape': (n_steps, step_l)}
        else:
            shape_kwargs = {}
        model1.add(Convolution1D(n_filters, kernel_width, border_mode='same',
                                 activation='relu', **shape_kwargs))
    model1.add(MaxPooling1D(pool_length=4))

model1.add(Flatten())
model1.add(Dropout(0.5))

In [24]:
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution1d_1 (Convolution1D)  (None, 9600, 64)      1024064     convolution1d_input_1[0][0]      
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 9600, 64)      163904      convolution1d_1[0][0]            
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 2400, 64)      0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 2400, 128)     82048       maxpooling1d_1[0][0]             
___________________________________________________________________________________________

In [25]:
# Second model input: the 3 patient indicator values (patient_1..patient_3).
input_patient = Input(shape=(3,), name='input_patient')

In [26]:
# Concatenate the flattened convolutional features with the patient vector,
# then classify with a 2-unit softmax (matching gen_batch's two-column labels).
x = merge([model1.output, input_patient], mode='concat')
x = Dense(2, activation='softmax')(x)

In [27]:
model1.input

<tf.Tensor 'convolution1d_input_1:0' shape=(?, 9600, 400) dtype=float32>

In [28]:
# Full two-input model; the input order [waves, patients] matches what the
# train/validation generators yield.
model = Model(input=[model1.input, input_patient], output=[x])

In [29]:
# Binary cross-entropy over the two softmax outputs, optimizer selected by
# name ('SGD'), and accuracy reported as an extra metric.
model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

In [30]:
model.summary()
# del model

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution1d_input_1 (InputLayer(None, 9600, 400)     0                                            
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D)  (None, 9600, 64)      1024064     convolution1d_input_1[0][0]      
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 9600, 64)      163904      convolution1d_1[0][0]            
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D)    (None, 2400, 64)      0           convolution1d_2[0][0]            
___________________________________________________________________________________________

In [31]:
# Write a diagram of the model graph (with tensor shapes) to model.png.
from keras.utils.visualize_util import plot
plot(model, to_file='model.png', show_shapes=True)

#### Callbacks

In [32]:
from sklearn import metrics
import gc

# AUC history, one entry appended per MyCallback validation pass.
scores = []

def score_auc(batch_size=None):
    """Return the ROC AUC of `model` over the whole validation set.

    The validation recordings are loaded in batches through gen_batch (the
    previous version indexed an undefined in-memory array, `X_val_msgs`, and
    read the 1-D `y_val` as if it were one-hot). Column 0 of gen_batch's
    labels / of the model output is the positive ("class == 1") column.

    NOTE: gen_batch adds its default random noise to the waves, exactly as
    val_generator() does, so the score is computed on noised inputs.

    Parameters
    ----------
    batch_size : int or None
        Recordings per prediction batch; defaults to VAL_N_PER_BATCH.

    Returns
    -------
    float
        Area under the ROC curve for the positive class.
    """
    if batch_size is None:
        batch_size = VAL_N_PER_BATCH

    n_val = len(X_val_files)
    y_true = []
    y_pred = []
    for s in range(0, n_val, batch_size):
        # The last batch may be shorter so that every file is scored exactly once.
        n = min(batch_size, n_val - s)
        X_waves, X_pat, y_batch = gen_batch(
            X_val_files, X_val_patients, y_val,
            start_ix=s, n_samples=n
        )
        y_p = model.predict([X_waves, X_pat], verbose=False)
        y_true.append(y_batch[:, 0])
        y_pred.append(y_p[:, 0])

    return metrics.roc_auc_score(np.concatenate(y_true), np.concatenate(y_pred))

class MyCallback(keras.callbacks.Callback):
    """Keras callback that reports validation AUC before training and after every epoch.

    Each score is appended to the module-level `scores` list and printed.
    """
    def _validate(self):
        # Compute, record and print the current validation AUC.
        s = score_auc()
        scores.append(s)
        print("\n\n AUC = %.5f\n" % s)
        # Short pause after printing, kept from the original implementation.
        time.sleep(.5)
    def on_train_begin(self, logs=None):
        # Keras calls this hook with `logs` only; the former extra `epoch`
        # parameter silently received the logs dict.
        self._validate()
    def on_epoch_end(self, epoch, logs=None):
        self._validate()
        # Explicit garbage-collection pass after each epoch.
        gc.collect()

#### Train

In [33]:
# Checkpoint file name pattern; the placeholders are filled from the epoch
# number and the logged metrics when each checkpoint is written.
checkpoint_path = MODELS_DIR + \
    '/e{epoch:02d}-l={loss:.5f}-vl={val_loss:.5f}-a={acc:.5f}-va={val_acc:.5f}.h5'

training_callbacks = [
    # MyCallback() (AUC reporting) stays disabled, as before.
    TensorBoard(log_dir=TFB_DIR, histogram_freq=0),
    # save_best_only=False: a full-model checkpoint is written after every epoch.
    ModelCheckpoint(
        checkpoint_path,
        monitor='val_acc', verbose=0, save_best_only=False,
        save_weights_only=False, mode='auto'
    ),
]

# Train from the endless batch generators defined above.
model.fit_generator(
    generator=train_generator(),
    samples_per_epoch=TRAIN_N_PER_EPOCH,
    nb_epoch=TRAIN_N_EPOCHS,
    validation_data=val_generator(),
    nb_val_samples=VAL_SIZE,
    callbacks=training_callbacks
)

Epoch 1/111

KeyboardInterrupt: 

In [41]:
# Scratch timing experiment: how long does it take to draw one random value per
# wave sample, 100 times over? The computed arrays are discarded.
for i in tqdm(range(100)):
    np.random.random_sample(N_CHANNELS*N_SAMPLES) / 2. - 1.

100%|██████████| 100/100 [00:04<00:00, 23.56it/s]


In [51]:
# Scratch: preview the noise expression used in gen_batch, i.e. uniform values
# in [-0.05, 0.05).
(np.random.random_sample(N_CHANNELS*N_SAMPLES) - .5) * 0.1

array([-0.04380665, -0.03585813, -0.0310039 , ..., -0.0230558 ,
       -0.04071133,  0.0098319 ])

In [52]:
# Scratch: small array for checking element-wise addition of random noise.
a = np.array([1.,2.,3.])

In [53]:
a

array([ 1.,  2.,  3.])

In [58]:
# Scratch: adding a random vector of the same length perturbs each element independently.
np.random.random_sample(3) + a

array([ 1.42748417,  2.33823995,  3.14727188])