## Import python modules

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from threading import Thread

import os
import sys
import glob
print("Imported!")

Imported!


## Set Hyperparameters

In [12]:
epochs = 1
batch_size = 2**6
feature_size = 2
valid_steps = 2

total_size = 629_145_480
seg_size = 150_000
bin_size = 2**12 # data comes in bins of 2^12 contiguous rows (ADC with 12-bit resolution)
bins_per_seg = (seg_size + bin_size - 1) // bin_size # ceil(seg_size / bin_size)
case_size = bins_per_seg * bin_size

total_cases = total_size // case_size
total_batches = (total_cases + batch_size - 1) // batch_size

withheld_cases = 2**8
withheld_batches = (withheld_cases + batch_size - 1) // batch_size
withheld_size = withheld_cases * case_size
withheld_percent = 100 * withheld_size / total_size

valid_cases = valid_steps * batch_size
valid_batches = (valid_cases + batch_size - 1) // batch_size
valid_size = valid_cases * case_size
valid_percent = 100 * valid_size / total_size

train_size = total_size - valid_size - withheld_size
train_cases = total_cases - valid_cases - withheld_cases
train_batches = (train_cases + batch_size - 1) // batch_size
train_percent = 100 * train_size / total_size

print("----- Hyperparameters -----")
print("epochs:", epochs)
print("batch_size:", batch_size)
print("feature_size:", feature_size)
print("valid_steps:", valid_steps)
print("---------------------------")
print("seg_size:", seg_size)
print("bin_size:", bin_size)
print("bins_per_seg:", bins_per_seg)
print("case_size:", case_size)
print("total_cases:", total_cases)
print("total_batches:", total_batches)
print("withheld_cases:", withheld_cases)
print("withheld_batches:", withheld_batches)
print("valid_cases:", valid_cases)
print("valid_batches:", valid_batches)
print("train_cases:", train_cases)
print("train_batches:", train_batches)
print("---------------------------")
print("total_size:       {0:9d}".format(total_size))
print("withheld_size:    {0:9d}".format(withheld_size))
print("train_size:       {0:9d}".format(train_size))
print("valid_size:       {0:9d}".format(valid_size))
print("---------------------------")
print("withheld_percent: {0:8.2f}%".format(withheld_percent))
print("valid_percent:    {0:8.2f}%".format(valid_percent))
print("train_percent:    {0:8.2f}%".format(train_percent))
print("---------------------------")

----- Hyperparameters -----
epochs: 1
batch_size: 64
feature_size: 2
valid_steps: 2
---------------------------
seg_size: 150000
bin_size: 4096
bins_per_seg: 37
case_size: 151552
total_cases: 4151
total_batches: 65
withheld_cases: 256
withheld_batches: 4
valid_cases: 128
valid_batches: 2
train_cases: 3767
train_batches: 59
---------------------------
total_size:       629145480
withheld_size:     38797312
train_size:       570949512
valid_size:        19398656
---------------------------
withheld_percent:     6.17%
valid_percent:        3.08%
train_percent:       90.75%
---------------------------


## Load data

In [4]:
print("Reading data...")
data = pd.read_csv('../input/train.csv', dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32})[:-withheld_size]
print("Done reading data!")

Reading data...
Done reading data!


## Data Augmentation

In [15]:
# A vector with 1's marking the end of a bin
bin_end = np.zeros((1, bin_size), dtype=np.float32)
bin_end[0,-1] = 1.0

def create_features(df):
    df_size = df.shape[0]
    x = np.zeros((df_size, 2))
    x[:, 0] = (df.acoustic_data.values - 4.51946757) / 10.7357072
    x[:, 1] = np.resize(bin_end, (1, df_size))
    return x

def batch_gen(validation=False):
    samples = np.zeros((batch_size, seg_size, feature_size))
    targets = np.zeros((batch_size, 1, 1))
    while True:
        start_case = train_cases if validation else 0
        end_case = train_cases + valid_cases if validation else train_cases
        cases = np.random.randint(start_case, end_case, size=batch_size)

        for i, case in enumerate(cases):
            start_row = case * case_size
            end_row = start_row + seg_size
            seg = data[start_row:end_row]
            samples[i] = create_features(seg)
            targets[i] = seg.tail(1).time_to_failure.values
        yield samples, targets

train = batch_gen()
valid = batch_gen(validation=True)

## Data visualization

In [17]:
print(next(valid))
# rows = np.random.randint(0, train_size, size=batch_size)
# print(rows, data.shape)
# data[rows[0]:rows[0]+seg_size]

(array([[[ 0.32420152,  0.        ],
        [ 0.04476022,  0.        ],
        [ 0.04476022,  0.        ],
        ...,
        [-0.23468108,  0.        ],
        [-0.32782817,  0.        ],
        [-0.32782817,  0.        ]],

       [[ 0.51049572,  0.        ],
        [ 0.41734862,  0.        ],
        [-0.23468108,  0.        ],
        ...,
        [ 0.23105443,  0.        ],
        [ 0.60364282,  0.        ],
        [ 0.23105443,  0.        ]],

       [[ 0.78993702,  0.        ],
        [ 0.69678992,  0.        ],
        [ 0.32420152,  0.        ],
        ...,
        [-0.51412237,  0.        ],
        [-0.14153397,  0.        ],
        [-0.23468108,  0.        ]],

       ...,

       [[ 0.23105443,  0.        ],
        [ 0.04476022,  0.        ],
        [-0.14153397,  0.        ],
        ...,
        [ 0.13790733,  0.        ],
        [-0.14153397,  0.        ],
        [-0.14153397,  0.        ]],

       [[-0.14153397,  0.        ],
        [ 0.32420152,  0. 

In [None]:
# Define model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

cb = [ModelCheckpoint("dcnn-model.hdf5", monitor='val_loss', save_weights_only=False, period=3)]

# 25, 25, 16, 5, 3
model = Sequential()
model.add(Conv1D(4, 4, activation='relu'  , dilation_rate=1, strides=4, input_shape=(seg_size, feature_size), padding="valid"))
model.add(Conv1D(8, 4, activation='relu'  , dilation_rate=1, strides=4, input_shape=(None, 4), padding="valid"))
model.add(Conv1D(16, 5, activation='relu'  , dilation_rate=1, strides=5, input_shape=(None, 8), padding="valid"))
model.add(Conv1D(32, 5, activation='relu'  , dilation_rate=1, strides=5, input_shape=(None, 16), padding="valid"))
model.add(MaxPooling1D(15, strides=15))
model.add(Conv1D(64, 5 , activation='relu'  , dilation_rate=1, strides=5, input_shape=(None, 32), padding="valid"))
model.add(Conv1D(1, 5 , activation='linear', dilation_rate=1, strides=5, input_shape=(None, 64), padding="valid"))

model.summary()

# Compile and fit model
model.compile(optimizer=adam(lr=0.0005), loss="mae")

history = model.fit_generator(train,
                              steps_per_epoch=train_batches,
                              epochs=epochs,
                              verbose=2,
                              callbacks=cb,
                              validation_data=valid,
                              validation_steps=valid_steps)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_14 (Conv1D)           (None, 37500, 4)          36        
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 9375, 8)           136       
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 1875, 16)          656       
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 375, 32)           2592      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 25, 32)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 5, 64)             10304     
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 1, 1)              321       
Total para

In [9]:
# Visualize accuracies
import matplotlib.pyplot as plt

def perf_plot(history, what = 'loss'):
    x = history.history[what]
    val_x = history.history['val_' + what]
    epochs = np.asarray(history.epoch) + 1

    plt.plot(epochs, x, 'bo', label = "Training " + what)
    plt.plot(epochs, val_x, 'b', label = "Validation " + what)
    plt.title("Training and validation " + what)
    plt.xlabel("Epochs")
    plt.legend()
    plt.show()
    return None

perf_plot(history)

ModuleNotFoundError: No module named 'matplotlib'

In [51]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})

# Load each test data, create the feature matrix, get numeric prediction
for i, seg_id in enumerate(submission.index):
    seg = pd.read_csv('../input/test/' + seg_id + '.csv', dtype={'acoustic_data': np.float32})
    x = create_features(seg)
    predict = model.predict(np.expand_dims(x, 0))
    print('\r', i, seg_id, submission.shape[0], predict, end = '')
    submission.time_to_failure[i] = predict

submission.head()

# Save
submission.to_csv('submission.csv')


 18 seg_010eab 2624 [[[0.01918388]]]]

KeyboardInterrupt: 