In [1]:
import csv
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

import biosppy.signals.ecg as ecg
import neurokit2 as nk

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import scipy.stats as stats

In [2]:
# Data import ---------------------------------------------------------------------------------------------------
# Each CSV is keyed by its `id` column, which becomes the DataFrame index.
# Files are read in the order: train labels, train signals, test signals.

y_train_raw, X_train_raw, X_test_raw = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/y_train.csv', 'data/X_train.csv', 'data/X_test.csv')
)

In [3]:
# Feature extractor for a single observation -----------------------------------------------------------------

def sample_features(ecg_signal, sampling_rate=300, n_points=60):
    """Summarise one ECG recording as a single-row frame of heartbeat-template features.

    The recording is stripped of missing samples, passed through NeuroKit's
    inversion check, segmented into heartbeats with BioSPPy, and the
    beat-wise mean and standard deviation of the heartbeat templates are
    sampled at ``n_points`` evenly spaced positions along the template.

    Parameters
    ----------
    ecg_signal : pandas.Series
        One recording; anything exposing ``dropna().to_numpy(dtype=...)``.
    sampling_rate : int, default 300
        Sampling frequency passed to the inversion check and the segmenter.
    n_points : int, default 60
        Number of template positions sampled per statistic.

    Returns
    -------
    pandas.DataFrame
        One row (index ``0``) with ``2 * n_points`` columns:
        ``mean0..mean{n_points-1}`` followed by ``var0..var{n_points-1}``.
        NOTE: the ``var*`` columns hold the standard deviation (``np.std``),
        not the variance; the names are kept so existing feature files and
        downstream code keep working.
        If any processing step raises, a ``RuntimeWarning`` is emitted and
        the row is filled with NaN.
    """
    mean_names = [f'mean{i}' for i in range(n_points)]
    var_names = [f'var{i}' for i in range(n_points)]
    colnames = mean_names + var_names

    try:
        # drop missing samples (presumably the trailing NaN padding of shorter recordings)
        signal = ecg_signal.dropna().to_numpy(dtype='float32')

        # flip the recording if NeuroKit judges it to be inverted
        signal, _ = nk.ecg_invert(signal, sampling_rate=sampling_rate, force=False, show=False)
        signal = pd.Series(signal)

        # extract heartbeats: one template (row) per detected R-peak
        r_peaks = ecg.engzee_segmenter(signal, sampling_rate)['rpeaks']
        beats = ecg.extract_heartbeats(signal, r_peaks, sampling_rate)['templates']

        # per-position statistics across all beats
        beat_mean = np.mean(beats, axis=0)
        beat_std = np.std(beats, axis=0)

        # Evenly spaced positions over the whole template. The end index is taken
        # from the template length instead of a hard-coded 179, so it stays valid
        # for other sampling rates (it is still 179 for 180-sample templates).
        sample_points = np.linspace(start=0, stop=beats.shape[1] - 1, num=n_points, dtype='int')

        values = np.concatenate([beat_mean[sample_points], beat_std[sample_points]])
        feats = pd.DataFrame(values.reshape(1, -1), columns=colnames)

    except Exception as err:
        # Best-effort: an unprocessable recording becomes a NaN row (to be imputed
        # later) instead of aborting the whole run. Only `Exception` is caught so
        # that KeyboardInterrupt / SystemExit still stop a long extraction loop.
        warnings.warn(
            "sample_features: feature extraction failed, returning NaN row "
            "({}: {})".format(type(err).__name__, err),
            RuntimeWarning,
        )
        feats = pd.DataFrame(np.nan, index=[0], columns=colnames)

    return feats

In [None]:
# compute train features ------------------------------------------------------------------------------------------
# Runs the single-recording extractor over every row of X_train_raw and writes
# the stacked result to CSV. This is the slow step of the notebook.

train_rows = []
n_train = X_train_raw.shape[0]

for position, (_, signal) in enumerate(X_train_raw.iterrows()):

    # show progress
    if position % 100 == 0:
        print(round(position / n_train * 100, 1), "% completed")

    # compute the features (one single-row frame per recording)
    train_rows.append(sample_features(signal))

# Exactly one feature row is produced per recording, in the same order as
# X_train_raw, so the raw frame's own `id` labels are attached directly. Using
# the loop counter as the id would mislabel rows whenever the ids in the CSV
# are not exactly 0..n-1.
X_features = pd.concat(train_rows, ignore_index=True)
X_features.index = X_train_raw.index.rename('id')
X_features.to_csv("data/X_train_sample_features.csv")

In [None]:
# compute test features ------------------------------------------------------------------------------------------
# Runs the single-recording extractor over every row of X_test_raw and writes
# the stacked result to CSV. This is the slow step of the notebook.

test_rows = []
n_test = X_test_raw.shape[0]

for position, (_, signal) in enumerate(X_test_raw.iterrows()):

    # show progress
    if position % 100 == 0:
        print(round(position / n_test * 100, 1), "% completed")

    # compute the features (one single-row frame per recording)
    test_rows.append(sample_features(signal))

# Exactly one feature row is produced per recording, in the same order as
# X_test_raw, so the raw frame's own `id` labels are attached directly. Using
# the loop counter as the id would mislabel rows whenever the ids in the CSV
# are not exactly 0..n-1.
X_features = pd.concat(test_rows, ignore_index=True)
X_features.index = X_test_raw.index.rename('id')
X_features.to_csv("data/X_test_sample_features.csv")