In [1]:
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tsfresh.feature_extraction import feature_calculators
import librosa
import pywt

In [2]:
# Seed NumPy's global RNG so the noise vector below is reproducible,
# then draw one Gaussian sample per point of a 150_000-sample segment.
np.random.seed(1337)
noise = np.random.normal(loc = 0, scale = 0.5, size = 150_000)

def denoise_signal_simple(x, wavelet = 'db4', level = 1, uthresh = 10):
    """Denoise a 1-D signal by hard-thresholding its wavelet detail coefficients.

    The signal is decomposed with ``pywt.wavedec`` in periodization mode,
    every detail band (``coeff[1:]``) is hard-thresholded at ``uthresh``, and
    the result is rebuilt with ``pywt.waverec``. The approximation band
    (``coeff[0]``) is left untouched.

    Parameters
    ----------
    x : array-like
        Signal to denoise.
    wavelet : str
        Wavelet name passed to PyWavelets.
    level : int
        Unused. Kept only so existing callers that pass it keep working; the
        decomposition depth is whatever ``pywt.wavedec`` picks by default.
    uthresh : float
        Threshold applied to each detail band. Defaults to 10, the value that
        used to be hard-coded in the body.

    Returns
    -------
    The reconstructed signal returned by ``pywt.waverec``.
    """
    coeff = pywt.wavedec(x, wavelet, mode = 'per')
    # Build a real list instead of slice-assigning a generator: same result, clearer intent.
    coeff[1:] = [pywt.threshold(band, value = uthresh, mode = 'hard') for band in coeff[1:]]
    return pywt.waverec(coeff, wavelet, mode = 'per')

def feature_gen(z):
    """Compute the four model features for one acoustic segment.

    Parameters
    ----------
    z : numpy array
        Raw segment values. It is added element-wise to the module-level
        ``noise`` vector (150_000 samples), so it must have a compatible shape.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index 0) with the columns
        ``var_num_peaks_2_denoise_simple``, ``var_percentile_roll50_std_20``,
        ``var_mfcc_mean18`` and ``var_mfcc_mean4``.
    """
    X = pd.DataFrame(index = [0], dtype=np.float64)

    # Add the shared, seeded noise vector, then centre the segment on its median.
    z = z+noise
    z = z-np.median(z)

    den_sample_simple = denoise_signal_simple(z)
    # Pass the signal by keyword: librosa >= 0.10 made mfcc's arguments
    # keyword-only, and `y=` is also accepted by older releases.
    # The sampling rate is left at librosa's default.
    mfcc = librosa.feature.mfcc(y = z)
    # Average each MFCC coefficient over all frames (axis 1).
    mfcc_mean = mfcc.mean(axis = 1)
    # 20th percentile of the rolling (window 50) standard deviation;
    # the leading NaNs produced by the rolling window are dropped first.
    percentile_roll50_std_20 =  np.percentile(pd.Series(z).rolling(50).std().dropna().values,20)

    X['var_num_peaks_2_denoise_simple'] = feature_calculators.number_peaks(den_sample_simple,2)
    X['var_percentile_roll50_std_20'] = percentile_roll50_std_20
    X['var_mfcc_mean18'] = mfcc_mean[18]
    X['var_mfcc_mean4'] = mfcc_mean[4]

    return X

def parse_sample(sample, start):
    """Turn one segment of the training frame into a single-row feature frame.

    Only segments of exactly 150_000 rows are processed; any other length
    yields an empty frame that carries the expected column names. ``start``
    is accepted but not used: the recorded start is the index label of the
    segment's first row.
    """
    # Guard clause: incomplete segments produce an empty, correctly-labelled frame.
    if len(sample) != 150_000:
        return pd.DataFrame(columns=['var_num_peaks_2_denoise_simple', 'var_percentile_roll50_std_20',
       'var_mfcc_mean18', 'var_mfcc_mean4', 'start', 'target'])

    features = feature_gen(sample['acoustic_data'].values)
    # Index label of the first row, i.e. where this segment begins in the source frame.
    features['start'] = sample.index[0]
    # Target is the time-to-failure at the last row of the segment.
    features['target'] = sample['time_to_failure'].values[-1]
    return features

def sample_train_gen(df, segment_size = 150_000, indices_to_calculate = (0,)):
    """Extract one feature row per requested segment of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw rows; it is sliced positionally.
    segment_size : int
        Number of rows per slice. Note that ``parse_sample`` only processes
        slices of exactly 150_000 rows, so other sizes yield no feature rows.
    indices_to_calculate : iterable of int-like
        Positional start offsets of the segments within ``df``. The default
        is an immutable tuple rather than a shared mutable list.

    Returns
    -------
    pandas.DataFrame
        One row per segment that produced features, with the columns of the
        first parsed segment. Segments that parse to an empty frame are
        skipped. If no indices are requested, an empty frame is returned
        instead of failing on ``result[0]``.
    """
    starts = [int(i) for i in indices_to_calculate]
    result = [parse_sample(df[s:s + segment_size], s) for s in starts]
    if not result:
        # Nothing was requested, so there is no parsed frame to take column names from.
        return pd.DataFrame()

    data = [r.values[0] for r in result if not r.empty]
    return pd.DataFrame(data, columns = result[0].columns)

# Load the data in chunks and concatenate the per-chunk feature dataframes

In [6]:
# Read the training CSV in chunks of 200 segments (150_000 rows each)
# rather than loading the whole file at once.
segments_per_chunk = 200
chunksize = 150_000 * segments_per_chunk
CsvFileReader = pd.read_csv(
    'train.csv',
    header = 0,
    chunksize = chunksize,
)

In [7]:
# Start from an empty, correctly-labelled frame so `df` is defined even if
# the reader yields no chunks.
df = pd.DataFrame(columns=['var_num_peaks_2_denoise_simple', 'var_percentile_roll50_std_20',
       'var_mfcc_mean18', 'var_mfcc_mean4', 'start', 'target'])

# Segment start offsets are the same for every chunk, so compute them once
# and without materialising a chunksize-long array first.
indices_to_calculate = np.arange(0, chunksize, 150_000)

# Collect the per-chunk feature frames in a list and concatenate the list,
# instead of copying and re-concatenating onto an empty object-dtype frame.
chunk_features = []
for raw in CsvFileReader:
    X = sample_train_gen(raw, indices_to_calculate=indices_to_calculate)
    if not X.empty:
        chunk_features.append(X)
    if chunk_features:
        df = pd.concat(chunk_features)
    print(len(df), len(X))
    # Rewritten after every chunk so partial results survive an interrupted run.
    df.to_csv('extract_train_full.csv')

200 200
400 200
600 200
800 200
1000 200
1200 200
1400 200
1600 200
1800 200
2000 200
2200 200
2400 200
2600 200
2800 200
3000 200
3200 200
3400 200
3600 200
3800 200
4000 200
4194 194


In [5]:
# Display the extracted feature table.
# NOTE(review): the recorded output has an `Unnamed: 0` column and this cell's
# execution count (5) is lower than the extraction cells (6, 7), so the output
# was presumably produced from a frame reloaded from CSV in a cell not shown —
# confirm by re-running the notebook top to bottom.
df

Unnamed: 0,var_num_peaks_2_denoise_simple,var_percentile_roll50_std_20,var_mfcc_mean18,var_mfcc_mean4,start,target
0,6210.0,2.697163,-1.778088,-11.369058,0.0,1.430797
1,6116.0,2.701290,-2.464462,-11.027536,150000.0,1.391499
2,6748.0,2.780605,-2.721902,-12.873970,300000.0,1.353196
3,6259.0,2.718767,-2.215434,-11.228475,450000.0,1.313798
4,6230.0,2.718618,-2.786946,-11.586154,600000.0,1.274400
5,6436.0,2.744226,-2.519710,-11.439426,750000.0,1.236097
6,6413.0,2.728869,-2.575175,-11.269781,900000.0,1.196798
7,5866.0,2.671478,-2.283799,-10.390144,1050000.0,1.158496
8,7254.0,2.827005,-2.513442,-12.496606,1200000.0,1.119097
9,6730.0,2.748357,-2.686752,-12.052642,1350000.0,1.079699
