In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, KFold, cross_val_predict

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import xgboost
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import kurtosis, skew


# Make pandas show more decimal places
pd.options.display.precision = 15

Load the data and see what it looks like.

In [None]:
# Read the training signal with explicit compact dtypes (int16 signal, float32 target).
train = pd.read_csv(
    '../input/train.csv',
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float32,
    },
)

In [None]:
train.head(), train.shape

Create the train set and the target set from segments of *rows* samples each. 150,000 is a good choice because each test-set file has that number of rows.

In [None]:
# Create a training set of simple summary-statistic features, one row per window.

rows = 150000   # samples per window
stride = 3750   # step between consecutive window starts (windows overlap)
# Number of complete windows that fit in the training signal (0 if it is too short).
segments = max((train.shape[0] - rows) // stride + 1, 0)

# Every feature column, declared up front so the frame's layout does not depend
# on the order in which the loop happens to assign values.
feature_names = ['ave', 'std', 'max', 'min', 'q95', 'q99', 'q05', 'q01',
                 'kurtosis', 'variance', 'skew', 'median', 'mad',
                 'abs_mean', 'abs_std']


def _segment_features(x):
    """Return the summary statistics of one signal window.

    Parameters
    ----------
    x : 1-D numpy array of acoustic samples.

    Returns
    -------
    dict mapping each name in ``feature_names`` to a scalar.
    """
    centre = x.mean()
    magnitude = np.abs(x)
    q95, q99, q05, q01 = np.quantile(x, [0.95, 0.99, 0.05, 0.01])
    return {
        'ave': centre,
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'q95': q95,
        'q99': q99,
        'q05': q05,
        'q01': q01,
        'kurtosis': kurtosis(x, bias=False),
        'variance': np.var(x),
        'skew': skew(x),
        'median': np.median(x),
        # Mean absolute deviation around the window mean.
        'mad': np.mean(np.abs(x - centre)),
        'abs_mean': magnitude.mean(),
        'abs_std': magnitude.std(),
    }


# Pull the raw arrays out once; slicing numpy arrays avoids building a
# DataFrame slice for every window.
signal = train['acoustic_data'].values
time_to_failure = train['time_to_failure'].values

# Position (counted from the end of a window) of the sample whose
# time_to_failure is used as that window's target.
# NOTE(review): this is roughly the window midpoint, not its last sample -
# confirm that this is the intended label position.
target_offset = -(rows // 2 - 1)

feature_rows = []
targets = []
for segment in tqdm(range(segments)):
    start = segment * stride
    stop = start + rows
    feature_rows.append(_segment_features(signal[start:stop]))
    targets.append(time_to_failure[start:stop][target_offset])

# End index (exclusive) of the last window processed; 0 when there were none.
last_index = (segments - 1) * stride + rows if segments else 0

# Build both frames in one go instead of growing them cell by cell with .loc.
X_train = pd.DataFrame(feature_rows, columns=feature_names, dtype=np.float64)
y_train = pd.DataFrame({'time_to_failure': targets}, dtype=np.float64)

# Per-chunk variants of the same statistics (first / middle / last 50k samples
# and the final 4096 samples of each window) were tried earlier and are not
# part of the current feature set.



Take a look at what the train set looks like. Then scale it and recheck the shapes.

In [None]:
# Only the last expression of a cell is displayed automatically, so the shape
# has to be printed explicitly to show up next to the summary table.
print(X_train.shape)
X_train.describe()

In [None]:
X_train.head()

In [None]:
# Standardize the features: fit the scaler on the training features, then transform them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
X_train_scaled.shape, X_train_scaled[0:5]

## SVM
Let's train an SVM regressor (`NuSVR`) with default values as one of our baseline models.
Plot a scatter of its predictions vs. the actual values.
Print the MAE score (computed on the training data).

In [None]:
print('SVM Training')
# Fit a default NuSVR on the scaled features, then predict on those same training rows.
svm = NuSVR().fit(X_train_scaled, y_train.values.flatten())
svm_pred = svm.predict(X_train_scaled)

In [None]:
# Scatter of actual vs. predicted time to failure for the SVM on the training rows.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), svm_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Reference diagonal (perfect prediction). Flat x/y lists draw exactly one line;
# nested (x, y) tuples are read as 2-D input and draw the diagonal twice.
# 'C1' keeps the colour the line was previously rendered in.
plt.plot([0, 20], [0, 20], color='C1')
plt.title('SVM')
plt.show()

In [None]:
# Mean absolute error of the SVM predictions against the training targets.
svm_score = mean_absolute_error(
    y_true=y_train.values.flatten(),
    y_pred=svm_pred,
)
print(f'Score: {svm_score:0.3f}')

## Random Forest
Let's train a Random Forest model (100 trees, MAE split criterion) as one of our baseline models. Plot a scatter of its predictions vs. the actual values. Print the MAE score (computed on the training data).

In [None]:
print('Random Forest Training')
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so a Restart & Run All reproduces the same forest and score.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' - confirm against the installed version.
rf = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=42)
rf.fit(X_train_scaled, y_train.values.flatten())
# Predictions on the training rows themselves (no hold-out split here).
rf_pred = rf.predict(X_train_scaled)

In [None]:
# Scatter of actual vs. predicted time to failure for the Random Forest on the training rows.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), rf_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Reference diagonal (perfect prediction). Flat x/y lists draw exactly one line;
# nested (x, y) tuples are read as 2-D input and draw the diagonal twice.
# 'C1' keeps the colour the line was previously rendered in.
plt.plot([0, 20], [0, 20], color='C1')
plt.title('Random Forest')
plt.show()

In [None]:
# Mean absolute error of the Random Forest predictions against the training targets.
rf_score = mean_absolute_error(
    y_true=y_train.values.flatten(),
    y_pred=rf_pred,
)
print(f'Score: {rf_score:0.3f}')

## XGBoost
Let's train an XGBoost model with mostly default values as one of our baseline models.
Plot a scatter of its predictions vs. the actual values.
Print the MAE score (computed on the training data).

In [None]:
print('XGBoost Training')
# Gradient-boosted regressor; MAE is the evaluation metric and four worker threads are used.
# NOTE(review): "reg:linear" may be reported as deprecated by newer xgboost
# releases - confirm against the installed version.
xgb = xgboost.XGBRegressor(
    objective="reg:linear",
    eval_metric='mae',
    n_jobs=4,
)
xgb.fit(X_train_scaled, y_train.values.flatten())
# Predictions on the training rows themselves (no hold-out split here).
xgb_pred = xgb.predict(X_train_scaled)

In [None]:
# Scatter of actual vs. predicted time to failure for XGBoost on the training rows.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), xgb_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Reference diagonal (perfect prediction). Flat x/y lists draw exactly one line;
# nested (x, y) tuples are read as 2-D input and draw the diagonal twice.
# 'C1' keeps the colour the line was previously rendered in.
plt.plot([0, 20], [0, 20], color='C1')
plt.title('XGBoost')
plt.show()

In [None]:
# Mean absolute error of the XGBoost predictions against the training targets.
xgb_score = mean_absolute_error(
    y_true=y_train.values.flatten(),
    y_pred=xgb_pred,
)
print(f'Score: {xgb_score:0.3f}')

## Test Models

In [None]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

In [None]:
# Empty float feature frame for the test set: one row per seg_id from the
# sample submission, with the same columns as the training features.
X_test = pd.DataFrame(
    index=submission.index,
    columns=X_train.columns,
    dtype=np.float64,
)


In [None]:
print('Testing')
# Fill X_test with the same summary statistics used for training, one row per test file.
for seg_id in X_test.index:
    seg = pd.read_csv('../input/test/' + seg_id + '.csv')

    # Only complete windows of `rows` samples are usable. The count is kept in
    # its own name so the `segments` value from the training cell is not
    # overwritten.
    full_windows = seg.shape[0] // rows
    if full_windows == 0:
        # File shorter than one window: the row is left as NaN.
        continue

    # Each file fills a single row, so only the last complete window can
    # contribute; compute its features once instead of once per chunk.
    x = seg['acoustic_data'].values[(full_windows - 1) * rows:full_windows * rows]

    centre = x.mean()
    magnitude = np.abs(x)
    q95, q99, q05, q01 = np.quantile(x, [0.95, 0.99, 0.05, 0.01])
    features = {
        'ave': centre,
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'q95': q95,
        'q99': q99,
        'q05': q05,
        'q01': q01,
        'kurtosis': kurtosis(x, bias=False),
        'variance': np.var(x),
        'skew': skew(x),
        'median': np.median(x),
        # Mean absolute deviation around the window mean.
        'mad': np.mean(np.abs(x - centre)),
        'abs_mean': magnitude.mean(),
        'abs_std': magnitude.std(),
    }
    for name, value in features.items():
        X_test.loc[seg_id, name] = value

# Per-chunk variants of the same statistics (first / middle / last 50k samples
# and the final 4096 samples of each window) were tried earlier and are not
# part of the current feature set.



In [None]:
X_test_scaled = scaler.transform(X_test)

In [None]:
X_test_scaled.shape

Predict and save each model's predictions to a file. The best performer of these 3 models is the SVM Regressor. Some of the submission code is commented out so that only one file is submitted to the challenge.

In [None]:
# import the modules we'll need
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

def create_download_link(df, title = "Download CSV file", filename = "submission.csv"):
    """Return an IPython ``HTML`` link that downloads ``df`` as a CSV file.

    The frame is serialized with ``df.to_csv()``, base64-encoded and embedded
    in a ``data:`` URI, so this only works for small files (< 2MB or so).

    Parameters
    ----------
    df : object exposing ``to_csv()`` (e.g. a pandas DataFrame).
    title : text shown for the link.
    filename : name offered to the browser for the downloaded file.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'.format(
        payload=encoded_csv,
        title=title,
        filename=filename,
    )
    return HTML(anchor)

In [None]:
# Predict the test segments with the fitted SVM and write the submission file.
svm_prediction = svm.predict(X_test_scaled)
submission['time_to_failure'] = svm_prediction
submission.to_csv('svm_submission.csv')

# Last expression of the cell: renders a download link for the same frame.
create_download_link(
    submission,
    filename='svm_submission.csv',
)

In [None]:
# svm_prediction = svm.predict(X_test_scaled)
# svm_prediction
# svm_pred = svm.predict(X_test_scaled)
# svm_pred

In [None]:
# Predict the test segments with the fitted XGBoost model and write the submission file.
xgb_prediction = xgb.predict(X_test_scaled)
submission['time_to_failure'] = xgb_prediction
submission.to_csv('xgb_submission.csv')

# Last expression of the cell: renders a download link for the same frame.
create_download_link(
    submission,
    filename='xgb_submission.csv',
)

In [None]:
# # RF Predictions
# rf_prediction = rf.predict(X_test_scaled)
# submission['time_to_failure'] = rf_prediction
# submission.to_csv('rf_submission.csv')

In [None]:
# Baseline input: the raw per-sample target column of the full training frame.
# NOTE(review): this rebinds `y_train`, which the model cells above use as the
# per-segment target frame; re-running those cells after this one would pick
# up this Series instead.
y_train = train['time_to_failure']

In [None]:
y_train.mean()

In [None]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

In [None]:
submission['time_to_failure'] = y_train.mean()

In [None]:
# Constant baseline: `submission` was filled with the mean training
# time_to_failure in the previous cell; write it out as its own file.
submission.to_csv('dummy_submission.csv')


# create a link to download the dataframe
create_download_link(submission, filename = 'dummy_submission.csv')

# ↓ ↓ ↓  Yay, download link! ↓ ↓ ↓ 