# Prototype 1

## Load Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import sys

In [2]:
# Show what the competition input directory contains.
print(os.listdir("../input"))

print("Reading data...")
# Any results you write to the current directory are saved as output.
# Passing `chunksize` makes `dataset` something that is iterated chunk by
# chunk (2**21 rows each) further down, rather than one big DataFrame.
chunksize = 1 << 21
dataset = pd.read_csv(
    "../input/train.csv",
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float64,
    },
    chunksize=chunksize,
)

def update_progress(progress, title=""):
    """Redraw a one-line, 50-character text progress bar on stdout.

    Args:
        progress: Completion percentage; one '#' is drawn per 2 percent.
            Values outside 0..100 are clamped for the bar itself but are
            still printed unchanged as the trailing percentage.
        title: Text printed in front of the bar.

    The line starts with a carriage return and is written without a trailing
    newline, so successive calls overwrite each other in place.
    """
    bar_width = 50
    # Derive the padding from the filled count so the bar is always exactly
    # `bar_width` characters wide. Truncating both halves independently made
    # the bar one character short for odd percentages and let it grow beyond
    # its brackets for out-of-range values.
    filled = max(0, min(bar_width, int(progress / 2)))
    bar = '#' * filled + ' ' * (bar_width - filled)
    print('\r{0} [{1}] {2}%'.format(title, bar, progress), end='')


# dfs[a][b] is the b-th fragment DataFrame of the a-th outer segment
# (the outer segments are labelled "Earthquake" in the progress line below).
dfs = []
loaded_bytes = 0.0
for chunk_no, chunk in enumerate(dataset):
    # A rise in time_to_failure opens a new outer segment within this chunk.
    segment_key = (chunk.time_to_failure.diff() > 0).cumsum()
    for segment_no, segment in chunk.groupby(segment_key):
        # Inside a segment, a drop of more than 0.0001 opens a new fragment.
        fragment_key = (segment.time_to_failure.diff() < -0.0001).cumsum()
        frags = [piece for _key, piece in segment.groupby(fragment_key)]
        for piece in frags:
            loaded_bytes += float(piece.memory_usage(index=True, deep=True).sum())

        # The first segment of a chunk is appended to the previous chunk's
        # last segment when time_to_failure did not rise across the boundary.
        continues_previous = False
        if segment_no == 0 and dfs:
            previous_end = dfs[-1][-1].time_to_failure.iloc[-1]
            continues_previous = previous_end >= segment.time_to_failure.iloc[0]

        if continues_previous:
            dfs[-1].extend(frags)
        else:
            dfs.append(frags)
    # NOTE(review): 11324618640.0 is a hard-coded denominator — presumably the
    # total in-memory size of the full training set in bytes; confirm.
    update_progress(int(100.0 * loaded_bytes / 11324618640.0), "Earthquake: {0:3d} Fragment: {1:8d} ".format(len(dfs), len(dfs[-1])))

['test', 'train.csv', 'sample_submission.csv']
Reading data...
Earthquake:  17 Fragment:     1751  [##################################################] 100%

In [None]:
# Print the (rows, columns) shape of every fragment, one outer segment after another.
for fragments in dfs:
    for fragment_df in fragments:
        print(fragment_df.shape)

# Data Visualization

In [4]:
%matplotlib inline
# df_concat = pd.concat(dfs[1])
# df_concat.plot(kind='line',x='time_to_failure',y='acoustic_data', figsize=(100,20))

dfs[1].plot(kind='line',x='time_to_failure',y='acoustic_data', figsize=(100,20))

print(dfs[1][-1].shape[0], len(dfs[0]), len(dfs[1]))

In [None]:
def create_features(frag_id, frag, X, n_coeffs=11):
    """Write summary and spectral features of one fragment into row `frag_id` of `X`.

    Args:
        frag_id: Row label in `X` that receives the features.
        frag: Object whose ``frag['acoustic_data'].values`` is the signal
            array (for example a DataFrame with an ``acoustic_data`` column).
        X: DataFrame that is updated in place through ``X.loc``.
        n_coeffs: Number of leading FFT magnitudes to store, as columns
            ``A0`` .. ``A{n_coeffs - 1}``. The default of 11 yields
            ``A0`` .. ``A10``.

    Raises:
        IndexError: If the signal has fewer than `n_coeffs` samples. The
            statistical columns have already been written at that point.
    """
    xc = frag['acoustic_data'].values
    zc = np.fft.fft(xc)
    xc_abs = np.abs(xc)

    # Plain and absolute-value statistics of the raw signal.
    X.loc[frag_id, 'mean'] = xc.mean()
    X.loc[frag_id, 'abs_mean'] = xc_abs.mean()
    X.loc[frag_id, 'std'] = xc.std()
    X.loc[frag_id, 'var'] = xc.var()
    X.loc[frag_id, 'max'] = xc.max()
    X.loc[frag_id, 'min'] = xc.min()
    X.loc[frag_id, 'abs_max'] = xc_abs.max()

    # Magnitudes of the first `n_coeffs` FFT bins, one column per bin.
    for k in range(n_coeffs):
        X.loc[frag_id, 'A{}'.format(k)] = abs(zc[k])

In [None]:
# Average per-fragment statistics over runs of up to 25 fragments.
# A run is closed whenever the global fragment counter `n` completes a
# multiple of 25, and also at the last fragment of each entry of `dfs`.
last_time = []
stats_25 = {name: [] for name in ("mean", "std", "min", "max")}
stats_rolling = {name: [] for name in ("mean", "std", "min", "max")}
n = 0
for segment in tqdm(dfs):
    final_idx = len(segment) - 1
    for idx, fragment in enumerate(segment):
        # mean/min/max are taken over |acoustic_data|; std over the signed signal.
        magnitude = fragment.acoustic_data.abs()
        stats_25["mean"].append(magnitude.mean())
        stats_25["std"].append(fragment.acoustic_data.std())
        stats_25["min"].append(magnitude.min())
        stats_25["max"].append(magnitude.max())
        n += 1
        if n % 25 == 0 or idx == final_idx:
            for name in list(stats_25):
                pending = stats_25[name]
                stats_rolling[name].append(sum(pending) / len(pending))
                stats_25[name] = []
            # time_to_failure at the very end of the run that was just closed.
            last_time.append(fragment.time_to_failure.iloc[-1])

rolling_mean = np.array(stats_rolling["mean"])
rolling_std = np.array(stats_rolling["std"])
rolling_min = np.array(stats_rolling["min"])
rolling_max = np.array(stats_rolling["max"])
last_time = np.array(last_time)
print("Done!")

In [None]:

# Overlay the rolling acoustic statistics and time_to_failure on stacked y-axes.
fig, ax1 = plt.subplots(figsize=(20, 10))
# NOTE(review): the title speaks of 150,000-sample chunks; confirm that this
# still matches how rolling_mean / rolling_std / rolling_max are aggregated.
fig.suptitle('Mean for chunks of 150,000 samples of training data', fontsize=14)

ax2 = ax1.twinx()
ax3 = ax2.twinx()
ax4 = ax2.twinx()
# All three twin axes draw their scale on the right-hand edge; move two of
# them outwards so ticks and labels are not printed on top of each other.
ax3.spines['right'].set_position(('outward', 60))
ax4.spines['right'].set_position(('outward', 120))
ax1.set_xlabel('index')
ax1.set_ylabel('Acoustic data')
ax2.set_ylabel('Time to failure')
ax3.set_ylabel('Acoustic data std')
ax4.set_ylabel('Acoustic data max')

window_size = 100
window_num = None  # set to 0, 1, 2, ... to zoom into that window of `window_size` points
if window_num is None:
    # Plot the whole series. `end = None` keeps the final point; the earlier
    # hard-coded `end = -1` silently sliced it off.
    start, end = 0, None
else:
    start = window_num * window_size
    end = (window_num + 1) * window_size

# Only keep points whose rolling mean is below 8 (presumably to stop large
# spikes from flattening the rest of the curve — confirm the threshold).
threshold_filter = rolling_mean < 8
p1 = sns.lineplot(data=rolling_mean[threshold_filter][start:end], ax=ax1, color='orange')
p2 = sns.lineplot(data=last_time[threshold_filter][start:end], ax=ax2, color='gray')
p3 = sns.lineplot(data=rolling_std[threshold_filter][start:end], ax=ax3, color='green')
p5 = sns.lineplot(data=rolling_max[threshold_filter][start:end], ax=ax4, color='blue')

In [None]:
# Plot ten half-overlapping windows of one test file's acoustic signal.
test_dir = "../input/test/"
# os.listdir returns entries in arbitrary order, so sort before taking the
# first one; otherwise a different file may be picked from run to run.
test_file = test_dir + sorted(os.listdir(test_dir))[0]
print(test_file)
dataset_test = pd.read_csv(test_file)
window = 2**15  # rows per plot
for i in range(10):
    # Window centres advance by 2**14 rows, i.e. half a window per step.
    shift = 2**14 + i * 2**14
    start = shift - window//2
    end = shift + window//2
    dataset_test[start:end].plot(kind='line',y='acoustic_data', figsize=(400,5))

## Fragment Stats
```
print(count, frag_duration, frag_gap_ave, frag_gap_ave/diff_ave, diff_ave)

4096 -4.504500000335199e-06 -0.026599999999999957 24181818.18169926 -1.1000000000054078e-09
count  frag_duration  frag_gap_ave frag_gap_ave/diff_ave diff_ave
4096   -4.5045e-06    -0.0266      24181818.181818       -1.1e-09


print(frag_gap_ave, frag_duration_ave, 25*frag_gap_ave/frag_duration_ave, diff_ave_cum/n)
-0.02655007691140222 -4.504480206630345e-06 147353.72170312778 -1.0999924151498534e-09
```

In [None]:
# Per-fragment timing statistics, printed for every fragment and averaged over
# the fragments that are full-size and have a negative gap:
#   frag_duration  time_to_failure of the last row minus that of the first row
#   frag_gap_ave   mean time_to_failure of this fragment minus the mean of the
#                  fragment `lookback` positions earlier
#   diff_ave       mean row-to-row time_to_failure step inside the fragment
lookback = 25
full_fragment_rows = 4096

# Means of the previous `lookback` fragments, newest first. One fixed-length
# list replaces 25 hand-shifted variables. It is seeded with zeros, so the
# first `lookback` fragments are compared against 0.
recent_aves = [0] * lookback

n = 0
frag_duration_cum = 0
frag_gap_cum = 0
diff_ave_cum = 0
for frag in dfs:
    for df in frag:
        count = len(df.index)
        head = df.head(1).time_to_failure.values[0]
        tail = df.tail(1).time_to_failure.values[0]
        ave = df.time_to_failure.mean()
        diff_ave = df.tail(full_fragment_rows - 1).time_to_failure.diff().mean()
        frag_duration = tail-head
        frag_gap_ave = ave - recent_aves[-1]
        # Only full-size fragments with a negative gap enter the averages.
        if frag_gap_ave < 0 and count == full_fragment_rows:
            n += 1
            frag_gap_cum += frag_gap_ave
            frag_duration_cum += frag_duration
            diff_ave_cum += diff_ave
        print(count, frag_duration, frag_gap_ave, frag_gap_ave/diff_ave, diff_ave)
        # Shift the history: newest mean in front, oldest one dropped.
        recent_aves.insert(0, ave)
        recent_aves.pop()

if n == 0:
    # Without this guard the divisions below raise ZeroDivisionError.
    print("No full-size fragment with a negative gap found; averages are undefined.")
else:
    frag_gap_ave = frag_gap_cum/n
    frag_duration_ave = frag_duration_cum/n
    print(frag_gap_ave, frag_duration_ave, lookback*frag_gap_ave/frag_duration_ave, diff_ave_cum/n)
# 967,491.030451251 = 4096*236.20386485626256
# 967,489.195553758 = 4096*236.2034168832417
# 967,491.030447658 = 4096*236.20386485538532
# 1,000,133.18541318 = 
#