In [165]:
import csv
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

import pywt
import biosppy.signals.ecg as ecg
import neurokit2 as nk

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import scipy.stats as stats

In [166]:
# Data import ---------------------------------------------------------------------------------------------------
# Every CSV is loaded with its `id` column as the DataFrame index.

y_train_raw, X_train_raw, X_test_raw = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/y_train.csv', 'data/X_train.csv', "data/X_test.csv")
)

In [168]:
def compute_wavelets(beat, wavelet='db1', level=3):
    """Return the approximation coefficients of a multilevel DWT of one beat.

    Parameters
    ----------
    beat : 1-D array-like
        Samples of a single heartbeat.
    wavelet : str or pywt.Wavelet, default 'db1'
        Wavelet used for the decomposition. A name is converted to a
        ``pywt.Wavelet`` instance; an instance is used as-is.
    level : int, default 3
        Decomposition level passed to ``pywt.wavedec``.

    Returns
    -------
    The first entry of the coefficient list produced by ``pywt.wavedec``.
    """
    if not isinstance(wavelet, pywt.Wavelet):
        wavelet = pywt.Wavelet(wavelet)

    coeffs = pywt.wavedec(beat, wavelet, level=level)

    # wavedec lists the approximation coefficients first, followed by the
    # detail coefficients; only the approximation is used as a feature.
    return coeffs[0]


def generate_wavelet_features(ecg_signal):
    """Build one row of aggregated wavelet features for a single ECG recording.

    The recording is stripped of NaN padding, passed through
    ``nk.ecg_invert`` (inversion is not forced), segmented into heartbeats
    with biosppy, and every beat is transformed by ``compute_wavelets``.
    The per-beat coefficient vectors are then reduced column-wise with
    mean, std, min and max.

    Parameters
    ----------
    ecg_signal : pandas.Series
        One recording; NaN entries are dropped before processing.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``[0]``) with columns ``wav_<stat><i>`` for
        ``stat`` in mean/std/min/max and ``i`` in 1..23. If any processing
        step raises an ``Exception`` the row is all-NaN and a
        ``RuntimeWarning`` describing the cause is emitted.
        ``KeyboardInterrupt`` and ``SystemExit`` are NOT swallowed, so a long
        extraction loop can still be interrupted.
    """
    sampling_rate = 300  # handed to every neurokit2 / biosppy call below
    # Number of wavelet coefficients expected per beat. The column assignment
    # below fails (and the row becomes NaN) if compute_wavelets yields another
    # count.
    # NOTE(review): 23 presumably equals the length of the level-3 db1
    # approximation of a biosppy heartbeat template - confirm before changing
    # the sampling rate or the wavelet settings.
    n_coeffs = 23

    agg_fcts = ['mean', 'std', 'min', 'max']
    col_names = [
        'wav_' + fct + str(i)
        for fct in agg_fcts
        for i in range(1, n_coeffs + 1)
    ]

    try:
        signal = ecg_signal.dropna().to_numpy(dtype='float32')

        # inversion of flipped signals (force=False leaves the decision to neurokit2)
        signal, _ = nk.ecg_invert(signal, sampling_rate=sampling_rate, force=False, show=False)
        signal = pd.Series(signal)

        # extract heartbeats
        r_peaks = ecg.engzee_segmenter(signal, sampling_rate)['rpeaks']
        beats = ecg.extract_heartbeats(signal, r_peaks, sampling_rate)['templates']
        if len(beats) == 0:
            # Without beats there is nothing to aggregate; report it clearly
            # instead of failing later on a column-count mismatch.
            raise ValueError("no heartbeats could be extracted from the signal")

        # one row of wavelet coefficients per beat
        wav_all_beats = pd.DataFrame([compute_wavelets(beat) for beat in beats])

        # Column-wise statistics, concatenated in the same order as agg_fcts.
        # With a single beat the std entries are NaN (sample std of one value).
        wav_aggregated = np.hstack([
            wav_all_beats.mean(),
            wav_all_beats.std(),
            wav_all_beats.min(),
            wav_all_beats.max(),
        ])

        features = pd.DataFrame(wav_aggregated).transpose()
        features.columns = col_names

    except Exception as exc:
        # Best-effort extraction: keep the row (as NaN) so the caller's row
        # count stays intact, but make the failure visible.
        warnings.warn(
            "wavelet feature extraction failed, returning NaN row: " + repr(exc),
            RuntimeWarning,
            stacklevel=2,
        )
        features = pd.DataFrame(np.nan, index=[0], columns=col_names)

    return features

In [None]:
# compute train features ------------------------------------------------------------------------------------------
# One feature row per recording in X_train_raw, written to CSV keyed by `id`.

X_features = []

for row_idx in range(X_train_raw.shape[0]):

    # show progress
    if (row_idx % 100) == 0:
        print(round(row_idx/X_train_raw.shape[0] * 100, 1), "% completed")

    # compute the features
    features = generate_wavelet_features(X_train_raw.iloc[row_idx])

    # Label the row with the recording's real id (the index loaded from the
    # CSV's `id` column) instead of its position, so the output stays aligned
    # with the other files even if the ids are not exactly 0..n-1.
    features.index = [X_train_raw.index[row_idx]]
    X_features.append(features)


X_features = pd.concat(X_features).rename_axis('id')
X_features.to_csv("data/X_train_wavelets_agg.csv")

In [None]:
# compute test features ------------------------------------------------------------------------------------------
# One feature row per recording in X_test_raw, written to CSV keyed by `id`.

X_features = []

for row_idx in range(X_test_raw.shape[0]):

    # show progress
    if (row_idx % 100) == 0:
        print(round(row_idx/X_test_raw.shape[0] * 100, 1), "% completed")

    # compute the features
    features = generate_wavelet_features(X_test_raw.iloc[row_idx])

    # Label the row with the recording's real id (the index loaded from the
    # CSV's `id` column) instead of its position, so predictions can be
    # matched back to the right recording even if ids are not exactly 0..n-1.
    features.index = [X_test_raw.index[row_idx]]
    X_features.append(features)


X_features = pd.concat(X_features).rename_axis('id')
# NOTE(review): the train features are written with an `_agg` suffix while this
# file has none - confirm which name downstream notebooks expect.
X_features.to_csv("data/X_test_wavelets.csv")