In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split

In [14]:
import keras
from keras.models import *
from keras.layers import *
from keras.callbacks import *

In [15]:
import lib

In [4]:
RND = 123
np.random.seed(RND)
import random
random.seed(RND)

In [5]:
RUN = 'E'
MODELS_DIR = '/d3/caches/kaggle-mls-v5/models/' + RUN
TFB_DIR = '/tmp-persistent/mls5/' + RUN

In [12]:
TRAIN_N_PER_BATCH = 16
TRAIN_N_EPOCHS = 111

In [6]:
VAL_SIZE = 0.1

WAVES_CACHE_DIR = 'out/waves'

SAMPLING_RATE = 400
N_SAMPLES = 240000
N_CHANNELS = 16

WARM_CACHE = True

In [8]:
# make sure output dirs exist
for v in [WAVES_CACHE_DIR, MODELS_DIR, TFB_DIR]: 
    if not os.path.isdir(v): 
        os.makedirs(v)

In [9]:
# load inout files list
input_df = pd.read_csv('out/input_files.csv', index_col='file')

In [10]:
# split inout files into train/test sets
train_df = input_df[input_df['class'] != -1]
test_df = input_df[input_df['class'] == -1]

In [11]:
X_trainval_files = train_df.index.tolist()
X_trainval_patients = np.vstack((train_df['patient_1'], train_df['patient_2'], train_df['patient_3']))\
    .T.astype(np.float32)
y_trainval = np.array(train_df['class'], dtype=np.float32)

In [11]:
def gen_batch(X_files=None, X_patients=None, y=None, start_ix=0, n_samples=1, \
              silent=True, compute_means=False, means_per_ch=None, div_by=None):
    
    X_msgs_batch = np.zeros((n_samples, N_MELS, DESIRED_MSG_W, N_CHANNELS), dtype=np.float32)
    X_patients_batch = np.zeros((n_samples, X_patients.shape[1]), dtype=np.float32)        
    y_batch = np.zeros([n_samples, 2], dtype=np.float32)
    
    if compute_means:
        means = np.zeros([n_samples, N_CHANNELS], dtype=np.float32)
    else:
        means = None
        
    r = range(n_samples) if silent else tqdm(range(n_samples))

    for i in r:
        ii  = (i + start_ix) % len(X_files)
        
        mat_f = X_files[ii]
        mat_cache_fp = MSGS_CACHE_DIR + '/' + mat_f + '.msgs.mem'
        
        # check if msgs are chached
        if not os.path.isfile(mat_cache_fp):

            mat_fp = input_df.ix[mat_f]['path']
            waves = lib.read_mat(mat_fp)
            
            msgs = np.zeros((N_CHANNELS, N_MELS, DESIRED_MSG_W), dtype=np.float32)

            for ch in range(16):
                msgs[ch] = lib.compute_msg(waves[ch], \
                      desired_msg_w=DESIRED_MSG_W, hop_length=HOP_LEN, \
                      n_fft=N_FFT, n_mels=N_MELS, sr=SAMPLING_RATE)
                            
            # move channel axis, shape is now: (ix, h, w, ch)
            msgs_t = np.swapaxes(msgs, 0, 1)
            msgs_t = np.swapaxes(msgs_t, 1, 2)

            X_msgs_batch[i] = msgs_t
            
            if not means_per_ch is None:
                X_msgs_batch = np.subtract(X_msgs_batch, means_per_ch)

            if not div_by is None:
                X_msgs_batch = np.divide(X_msgs_batch, div_by)
                
            # save to cache
            X_msgs_batch[i].tofile(mat_cache_fp)
            
        else:
            
            X_msgs_batch[i] = np.fromfile(mat_cache_fp, dtype=np.float32).\
                reshape((N_MELS, DESIRED_MSG_W, N_CHANNELS))
                
        # compute means
        if compute_means:
            means[i] = np.mean(X_msgs_batch.T.reshape(N_CHANNELS, -1), axis=1)

        X_patients_batch[i] = X_patients[ii]
        y_batch[i] = [1., 0.] if y[ii] else [0., 1.]
            
    return X_msgs_batch, X_patients_batch, y_batch, means

In [None]:
# warm cache
if WARM_CACHE:
    
    X_files = input_df.index.tolist()
    X_patients = np.vstack((input_df['patient_1'], input_df['patient_2'], input_df['patient_3']))\
        .T.astype(np.float32)
    y = np.zeros((len(X_files), 1), dtype=np.float32)
    
    start = 0
    stop = len(X_files)

#     for i in tqdm(xrange(start, stop)):
#         _msgs, _patients, _ys, _means = \
#             gen_batch(X_files, X_patients, y, start_ix=i, n_samples=1, \
#                       silent=True, compute_means=True, \
#                      means_per_ch=means_per_ch, div_by=255.)


 94%|█████████▍| 6266/6672 [23:26<01:39,  4.07it/s]

In [16]:
# create model
waves = lib.read_mat('/datasets/kaggle/mls/train_1/1_101_1.mat')

In [21]:
model = Sequential()

In [22]:
model.add(Convolution1D(128, 10, activation='relu', input_shape=(N_CHANNELS, N_SAMPLES)))

In [23]:
model.add(Flatten())

In [31]:
model.add(Dense(1, activation='softmax'))

In [32]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution1d_1 (Convolution1D)  (None, 7, 128)        307200128   convolution1d_input_1[0][0]      
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 896)           0           convolution1d_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             897         flatten_1[0][0]                  
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)             2                                            
Total params: 307201027
___________________________________________________________________

In [33]:
model.compile(loss='MAE', optimizer='SGD')

In [34]:
model.predict([waves])

Exception: Error when checking : expected convolution1d_input_1 to have 3 dimensions, but got array with shape (16, 240000)

In [28]:
ww = np.array([waves])

In [29]:
ww.shape

(1, 16, 240000)

In [30]:
model.predict(ww)

array([[-889.23266602]], dtype=float32)