In [16]:
import numpy as np
import scipy as sp
import pandas as pd
import os

import random

import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.precision = 15

import gc
import warnings
warnings.filterwarnings("ignore")

from fastai.tabular import * 
from tqdm import tqdm_notebook
from fastai.callbacks import *

In [66]:
%%time
train = pd.read_csv('../input/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32},nrows=6e5)

CPU times: user 2min, sys: 9.55 s, total: 2min 10s
Wall time: 2min 40s


In [67]:
# Adjusted values at or beyond these bounds are pooled into the first / last bin.
# Deliberately not called `min` / `max`: those names would replace the Python
# built-ins for every later cell in the kernel.
CLIP_MIN = -100
CLIP_MAX = 100
# Number of bins (feature columns) produced per sequence.
spread = 110


def get_counts(sequence):
    """Summarise a 1-D signal as `spread` binned sums.

    Each sample is first shifted by its first difference
    (``x[i] + (x[i] - x[i-1])``; the first sample is shifted by 1).  The
    shifted values are then grouped into bins and every bin accumulates
    ``value * occurrences``, i.e. the sum of the shifted values that fall
    into it (not a plain count, despite the function name).

    Binning:
      * values <= CLIP_MIN go to bin 0,
      * values >= CLIP_MAX go to the last bin,
      * everything else goes to bin ``int(value / 2) + spread // 2``
        (division truncates toward zero, so -1, 0 and 1 share the centre bin).

    Parameters
    ----------
    sequence : array-like of numbers
        The raw signal.  Integer input of any width is widened to int64
        before differencing.

    Returns
    -------
    list
        ``spread`` numbers, one per bin.
    """
    sequence = np.asarray(sequence)
    if np.issubdtype(sequence.dtype, np.integer):
        # np.diff keeps the input dtype, so narrow integers (the training
        # column is loaded as int16) could wrap around silently.
        sequence = sequence.astype(np.int64)

    diffs = np.append(np.array([1]), np.diff(sequence))
    sequence = sequence + diffs

    counts = [0] * spread
    centre = spread // 2
    values, occurrences = np.unique(sequence, return_counts=True)
    for val, count in zip(values, occurrences):
        r = count * val
        if val <= CLIP_MIN:
            counts[0] += r
        elif val >= CLIP_MAX:
            counts[-1] += r
        else:
            counts[int(val / 2) + centre] += r

    return counts

In [68]:
# Stride between consecutive window starts and the window length, in samples.
interval = 75000
window = 150000

# Only use start positions that leave room for a complete window.  Truncated
# trailing windows would produce bin sums on a different scale from the rest,
# because every feature is a sum over the window's samples.
starts = range(0, len(train) - window + 1, interval)
if len(starts) == 0:
    raise ValueError(f"need at least {window} rows of training data, got {len(train)}")

acoustic = train.acoustic_data.values
time_to_failure = train.time_to_failure.values

counts = [get_counts(acoustic[i:i+window]) for i in tqdm_notebook(starts)]
# Label each window with the time_to_failure of its LAST sample so the target
# belongs to the same span the features describe.
# NOTE(review): assumes test segments are scored at their final sample - confirm
# against the competition's data description.
ttfs = [time_to_failure[i + window - 1] for i in starts]
# The raw frame is large; release it (and the views into it) once the features exist.
del train, acoustic, time_to_failure

labels = ["D"+str(i) for i in range(0,len(counts[0]))]

df = pd.DataFrame(counts, columns=labels)
ttf_df = pd.DataFrame(ttfs, columns=["expected"])
df = df.join(ttf_df)

HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))

In [65]:
# Only the last expression of a cell is rendered, so the head has to be
# displayed explicitly or it is silently discarded.
display(df.head(3))
# First 50 feature values of the fourth row.
df.iloc[3].iloc[:50]

D0    -180265.0
D1          0.0
D2          0.0
D3          0.0
D4          0.0
D5          0.0
D6      -2466.0
D7      -2688.0
D8       -950.0
D9       -739.0
D10     -4872.0
D11     -2288.0
D12      -348.0
D13     -4208.0
D14      -246.0
D15     -4733.0
D16     -2184.0
D17     -2682.0
D18     -1874.0
D19     -5832.0
D20     -3430.0
D21     -1500.0
D22     -2442.0
D23     -4378.0
D24     -2204.0
D25     -6420.0
D26      -291.0
D27     -6062.0
D28     -6893.0
D29     -2132.0
D30     -4427.0
D31    -10458.0
D32      -599.0
D33     -9366.0
D34     -9156.0
D35    -11561.0
D36     -2866.0
D37    -15304.0
D38    -12122.0
D39    -14489.0
D40    -22563.0
D41    -15180.0
D42    -10040.0
D43    -46188.0
D44     -3970.0
D45    -49061.0
D46    -32058.0
D47    -30088.0
D48    -50700.0
D49    -57427.0
Name: 3, dtype: float64

In [69]:
# Working directory handed to the DataBunch below.
path = "../tmp"
# exist_ok makes re-running the cell harmless while still surfacing real
# failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs(path, exist_ok=True)

# Test Data

In [None]:
# Build one feature row per test segment file, using the same columns as the
# training frame (everything except the trailing "expected" label column).
tpath = "../input/test"
files = os.listdir(tpath)
test_id = []
test_rows = []
for f in tqdm_notebook(files):
    seg = pd.read_csv(f'{tpath}/{f}')
    test_rows.append(get_counts(seg.acoustic_data.values))
    # Appended in lock-step with test_rows, so test_id[k] names row k of test_df.
    test_id.append(f.replace(".csv", ""))

# Assemble the frame once instead of enlarging it row by row with .loc,
# which re-allocates on every insert.
test_df = pd.DataFrame(test_rows, columns=df.columns.values[:-1], dtype=np.float64)

In [70]:
# Validation rows: a short leading run taken from each of roughly 100 evenly
# spaced blocks of df.
num = len(df)
# Stride between blocks.  The floor of 5 keeps the range() step non-zero for
# frames with fewer than 100 rows (a step of 0 raises ValueError) and caps the
# hold-out at about one row in five for small frames.  Unchanged for 500+ rows.
interval = max(int(num/100), 5)
values = int(num/(5*100))
# Rows held out per block.  Identical to the original `values - 1` for frames
# of 1000+ rows; the floor of 1 stops smaller frames from ending up with an
# empty validation set.
# NOTE(review): `values - 1` holds out one row fewer per block than `values`
# suggests - confirm whether that gap is intentional.
per_block = max(values - 1, 1)
valid_idx = [
    i + j
    for i in range(0, len(df)-values, interval)
    for j in range(0, per_block)
]

In [71]:
# Targets of the validation rows, as a float64 array in valid_idx order.
valid_ttfs = df["expected"].iloc[valid_idx].to_numpy(dtype=np.float64)

In [72]:
# test_df is attached here so that best_learn.get_preds(DatasetType.Test) in the
# submission section has a test set to predict on.
data = TabularDataBunch.from_df(path, df, "expected", valid_idx=valid_idx, test_df=test_df, procs=[Normalize])

In [73]:
%%time

best_learn = None
best_mae = 9999

for i in range(0, 99):
    learn = tabular_learner(data=data, layers=[200,100], metrics=mae, ps=0.5, y_range=(-1,15))
    learn.callbacks = [SaveModelCallback(learn, every='improvement', mode='min', name='best')]
    learn.fit_one_cycle(20, 1e-2)
    gc.collect()

    preds = learn.get_preds(DatasetType.Valid)[0].numpy().flatten()
    new_mae = np.abs(valid_ttfs-preds).mean()
    if new_mae < best_mae or not best_learn:
        best_learn = learn
        best_mae = new_mae
    print(f'Run {i} - Best MAE: {best_mae}')

Run 2 - Best MAE: 2.044030682653189


epoch,train_loss,valid_loss,mean_absolute_error,time
0,9.528108,7.310853,2.124905,00:01
1,8.529163,7.220847,2.137577,00:01
2,8.232252,7.674183,2.148491,00:01
3,8.047798,7.550503,2.10186,00:01
4,8.245251,7.101229,2.103573,00:01


Better model found at epoch 0 with val_loss value: 7.310853004455566.
Better model found at epoch 1 with val_loss value: 7.220847129821777.
Better model found at epoch 4 with val_loss value: 7.101229190826416.


Exception in thread Thread-1702:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 158, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 256, in rebuild_storage_fd
    fd = df.detach()
  File "/opt/conda/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/opt/conda/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authk

KeyboardInterrupt: 

# Submission

In [None]:
# Predict on the test set with the best learner and flatten to a 1-D array.
test_outputs = best_learn.get_preds(DatasetType.Test)[0]
preds = test_outputs.numpy().flatten()

In [None]:
# Segment ids are the test file names with the ".csv" text removed.
tpath = "../input/test"
files = [name.replace(".csv","") for name in os.listdir(tpath)]
files[:3]

In [None]:
# Use test_id rather than a fresh directory listing: test_id was appended in the
# same loop iteration that produced each test_df row, so it cannot drift from
# the feature rows the way a second os.listdir() call could.
# NOTE(review): presumes get_preds returns predictions in test_df row order - confirm.
assert len(test_id) == len(preds), "number of predictions does not match number of test segments"
results = pd.DataFrame({"seg_id":test_id, "time_to_failure":preds})
results.head()

In [None]:
# Write the two-column results frame to disk without the index column.
submission_file = 'submission.csv'
results.to_csv(submission_file, index=False)