In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from statistics import mean

import os

import random
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm_pandas

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Build the training feature table `X`: train.csv is read in chunks of `step`
# rows (up to `stop` rows in total) and every consecutive window of `seg_len`
# raw samples is summarised into one row of statistics, labelled with the
# time_to_failure of the window's last sample.
step = 100000000   # rows read from train.csv per chunk
stop = 100000000   # total number of rows to process
seg_len = 5000     # raw samples summarised into one feature row
raw_columns = ['acoustic_data', 'time_to_failure']
feature_columns = ['mean','std','99quat','50quat','25quat','1quat','time_to_failure']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a frame cell-by-cell through `.loc` reallocates repeatedly and
# does not reliably keep the requested float32 dtype.
feature_rows = []
for i in tqdm(range(0, stop, step)):
    # header=0 + names: the first line left after `skiprows` is consumed as the
    # header and replaced by `raw_columns`. For i > 0 that line is a data row
    # that the previous chunk already covered, so nothing is lost, and the
    # dtype mapping (keyed by column name) applies to every chunk.
    train_df = pd.read_csv("../input/train.csv",
                           skiprows = i,
                           nrows = step,
                           header = 0,
                           names = raw_columns,
                           dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32}
                          )
    segments = train_df.shape[0] // seg_len   # a trailing partial window is dropped
    for segment in range(segments):
        start = segment * seg_len
        end = start + seg_len
        x = train_df.acoustic_data[start:end]
        q99, q50, q25, q01 = np.quantile(x, [0.99, 0.5, 0.25, 0.01])
        feature_rows.append((
            np.mean(x),
            np.std(x),
            q99,
            q50,
            q25,
            q01,
            # Target of the window = time_to_failure at its last sample.
            train_df.time_to_failure.values[end - 1],
        ))
    del train_df
    gc.collect()

X = pd.DataFrame(feature_rows, columns = feature_columns, dtype = np.float32)
    

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [3]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on every column of X except the last one (the target)
# and write the scaled values back into those same columns.
scaler = MinMaxScaler()
unscaled_features = X.iloc[:, :-1]
X.iloc[:, :-1] = scaler.fit_transform(unscaled_features)

In [4]:
def getTrainBatch(dfl,seg_len,batch_size):
    """Sample a random batch of fixed-length windows from a feature table.

    Parameters
    ----------
    dfl : pandas.DataFrame
        Table whose last column is the target and whose remaining columns are
        the input features. Rows are taken in their stored (positional) order.
    seg_len : int
        Number of consecutive rows in each sampled window.
    batch_size : int
        Number of windows to draw (with replacement).

    Returns
    -------
    x : numpy.ndarray, shape (batch_size, seg_len, n_features)
        Feature values of each window; n_features is ``dfl.shape[1] - 1``.
    y : numpy.ndarray, shape (batch_size, 1)
        Target value taken from the last row of each window.

    Raises
    ------
    ValueError
        If ``dfl`` has fewer than ``seg_len`` rows.
    """
    n_rows = dfl.shape[0]
    if n_rows < seg_len:
        raise ValueError(
            'need at least seg_len=%d rows to sample a window, got %d' % (seg_len, n_rows)
        )
    n_features = dfl.shape[1] - 1

    # Work on the raw array and slice by position, so the result does not
    # depend on the frame carrying a default 0..n-1 index.
    values = dfl.values

    x = np.empty([batch_size,seg_len,n_features])
    y = np.empty([batch_size,1])

    # randint's upper bound is exclusive: "+ 1" makes the final window
    # (the one ending on the last row) eligible as well.
    starts = np.random.randint(n_rows - seg_len + 1, size=batch_size)
    for i, start in enumerate(starts):
        window = values[start:start + seg_len]
        x[i,:,:] = window[:, :-1]
        y[i] = window[-1, -1]
    return x,y

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, CuDNNLSTM
from keras.optimizers import Adam
from keras.losses import mean_squared_error
from keras.callbacks import History

Using TensorFlow backend.


In [6]:
# Sequence model: two stacked CuDNN LSTM layers (64 units each) over inputs of
# 30 time steps x 6 features, followed by dropout and a single linear output.
model = Sequential()

# The first LSTM returns the full sequence so the second one can consume it.
model.add(CuDNNLSTM(64 ,return_sequences=True ,input_shape=(30, 6)))
model.add(CuDNNLSTM(64))
model.add(Dropout(rate=0.5))
model.add(Dense(1, activation='linear'))

# summary() prints the layer table itself and returns None; wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 30, 64)            18432     
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 64)                33280     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 51,777
Trainable params: 51,777
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Optimise mean absolute error with Adam; MAE is additionally tracked as a metric.
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mae'],
)

In [8]:
# Per-epoch training and validation losses, accumulated over all rounds.
loss = []
val_loss = []
for j in tqdm(range(101)):
    # Each round trains for 10 epochs on a freshly sampled batch of 1024
    # windows (30 rows each), holding out 10% of it for validation.
    x_train, y_train = getTrainBatch(X, 30, batch_size=1024)
    history = model.fit(
        x_train,
        y_train,
        batch_size=16,
        epochs=10,
        validation_split=0.1,
        verbose=0,
    )
    loss.extend(history.history['loss'])
    val_loss.extend(history.history['val_loss'])

    # Every 10th round, report the mean of the 10 most recent epoch losses.
    if not j % 10:
        recent_loss = mean(loss[-10:])
        recent_val_loss = mean(val_loss[-10:])
        print('loss :',recent_loss,' val_loss :',recent_val_loss)

    # Release the batch arrays before the next round allocates new ones.
    del x_train, y_train
    gc.collect()

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

Instructions for updating:
Use tf.cast instead.
loss : 3.43531976401094  val_loss : 3.098019009886436
loss : 3.2454973077152762  val_loss : 3.05819864203629
loss : 2.0904933627860682  val_loss : 1.8587029714607497
loss : 2.1327819793542746  val_loss : 1.9976456167628465
loss : 1.9521911596112869  val_loss : 1.9889341762633
loss : 2.034992509200182  val_loss : 1.5764679394879388
loss : 1.9156632992977947  val_loss : 1.8855964896748367
loss : 1.9024245364658219  val_loss : 2.0461536794032864
loss : 1.74318339306938  val_loss : 1.7156517761424908
loss : 1.657940808750778  val_loss : 1.7494098979292565
loss : 1.6577117397394294  val_loss : 1.559677595883897



In [9]:
# predicting the submission
def predictSubmission(seg_id):
    """Predict time to failure for one test segment.

    Reads ``../input/test/<seg_id>.csv``, summarises each consecutive window of
    5000 ``acoustic_data`` samples into six statistics (mean, std and the
    99/50/25/1 percent quantiles), scales them with the module-level ``scaler``
    and feeds the resulting sequence to the module-level ``model``.

    Parameters
    ----------
    seg_id : str
        Segment identifier; also the CSV file name without extension.

    Returns
    -------
    The single predicted value (element ``[0][0]`` of ``model.predict``'s output).
    """
    feature_columns = ['mean','std','99quat','50quat','25quat','1quat']
    test_df = pd.read_csv('../input/test/' + seg_id + '.csv')
    seg_len = 5000
    segments = test_df.shape[0] // seg_len   # a trailing partial window is dropped

    # Collect one tuple per window and build the frame once, instead of
    # enlarging it one cell at a time through `.loc`.
    feature_rows = []
    for segment in range(segments):
        start = segment * seg_len
        x = test_df.acoustic_data[start:start + seg_len]
        q99, q50, q25, q01 = np.quantile(x, [0.99, 0.5, 0.25, 0.01])
        feature_rows.append((np.mean(x), np.std(x), q99, q50, q25, q01))
    X_test = pd.DataFrame(feature_rows, columns = feature_columns, dtype = np.float32)

    # One sample of shape (windows, features); for a file of 150000 rows this
    # is the (1, 30, 6) input the model was built for.
    scaled = scaler.transform(X_test)
    y = model.predict(scaled.reshape(1, segments, len(feature_columns)))
    return y[0][0]

In [10]:
# Register `progress_apply` on pandas objects, backed by the notebook progress bar.
tqdm_pandas(tqdm())

# Predict every segment listed in the sample submission and write the result.
submission = pd.read_csv('../input/sample_submission.csv')
segment_ids = submission.loc[:,'seg_id']
submission.loc[:,'time_to_failure'] = segment_ids.progress_apply(predictSubmission)
submission.to_csv('submission_10.csv',index=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))





In [11]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_00030f,7.289006
1,seg_0012b5,6.249526
2,seg_00184e,8.097557
3,seg_003339,11.046974
4,seg_0042cc,6.546412
