In [10]:
import sys
sys.path.append('../../')

from functools import partial
from aml_project import utils

import pandas as pd
import numpy as np
import neurokit2 as nk
import biosppy.signals.ecg as ecg
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
sns.set('talk')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
%matplotlib inline

In [11]:
%%time
X = pd.read_csv('data/X_train.csv')

CPU times: user 36 s, sys: 1.36 s, total: 37.3 s
Wall time: 37.9 s


In [12]:
%%time
X_test = pd.read_csv('data/X_test.csv')

CPU times: user 24.1 s, sys: 559 ms, total: 24.6 s
Wall time: 24.8 s


In [13]:
%%time
y = pd.read_csv('data/y_train.csv')

CPU times: user 1.54 ms, sys: 793 µs, total: 2.33 ms
Wall time: 2.1 ms


In [40]:
X.pop('id')
X_test.pop('id')

0          0
1          1
2          2
3          3
4          4
        ... 
3406    3406
3407    3407
3408    3408
3409    3409
3410    3410
Name: id, Length: 3411, dtype: int64

In [67]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y.y)

In [14]:
def plot_signal(data, idx, vlines=[], titles=[]):
    
    if type(idx) == int:
        idx = range(idx)
        
    width = 20
    ncols = 1
    if len(data.shape) > 2:
        ncols = 5
        width = 2
        
    nrows = len(idx) // ncols
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*width, nrows*5))
    if len(idx) > 1:
        axes = axes.flatten()
    else:
        axes = [axes]
    for i, ax in enumerate(axes):
        if len(data.shape) > 2:
            ax.imshow(data[i][0])
        else:
            measurements = data.iloc[i].dropna().to_numpy(dtype='float32')
            #measurements = measurements[0:-1:10]
            measurements /= 1000
            seconds = np.arange(0, len(measurements)) / 30
            ax.plot(seconds, measurements)
            for line in vlines:
                ax.axvline(x=(line/30))
        if len(titles) > 0:
            ax.set_title(titles[idx[i]])

In [15]:
def get_statistics(np_array):
    if len(np_array) == 0:
        return np.nan, np.nan, np.nan, np.nan, np.nan
    array_mean = np.mean(np_array)
    array_median = np.median(np_array)
    array_std = np.std(np_array)
    array_max = np.max(np_array)
    array_min = np.min(np_array)
    return array_mean, array_median, array_std, array_max, array_min

In [16]:
def create_features(X, sampling_rate=150, threshold=0.2):
    features_data = []
    for i, signal in tqdm(enumerate(X), total=len(X)):
        
        signal_no_nan = pd.DataFrame(signal).dropna().to_numpy()
        features_sample = []

        rpeaks = ecg.engzee_segmenter(signal, sampling_rate, threshold=threshold)['rpeaks']
        beats = ecg.extract_heartbeats(signal, rpeaks, sampling_rate)['templates']
        _, waves_peak = nk.ecg_delineate(signal, rpeaks, sampling_rate=sampling_rate, method="dwt", show=False)
        
        # remove nans
        waves_peak_nonan = {k:[elem for elem in v if elem is not np.nan] for k,v in waves_peak.items()}
        
        
        ppeaks = waves_peak_nonan['ECG_P_Peaks']
        qpeaks = waves_peak_nonan['ECG_Q_Peaks']
        speaks = waves_peak_nonan['ECG_S_Peaks']
        tpeaks = waves_peak_nonan['ECG_T_Peaks']
        ponsets = waves_peak_nonan['ECG_P_Onsets']
        toffsets = waves_peak_nonan['ECG_T_Offsets']

        # Amplitude values
        P_amplitudes = signal[ppeaks]
        Q_amplitudes = signal[qpeaks]
        S_amplitudes = signal[speaks]
        T_amplitudes = signal[tpeaks]
        R_amplitudes = signal[rpeaks]
        Pon_amplitudes = signal[ponsets]
        Toff_amplitudes = signal[toffsets]

        features_sample.extend(get_statistics(P_amplitudes))
        features_sample.extend(get_statistics(Q_amplitudes))
        features_sample.extend(get_statistics(S_amplitudes))
        features_sample.extend(get_statistics(T_amplitudes))
        features_sample.extend(get_statistics(R_amplitudes))
        features_sample.extend(get_statistics(Pon_amplitudes))
        features_sample.extend(get_statistics(Toff_amplitudes))

        # Timing features
        rr_interval = np.diff(rpeaks)/sampling_rate*1000 # rr interval in ms

        pp_interval = np.diff(ppeaks)/sampling_rate*1000 # rr interval in ms

        qrs_duration = (np.array(waves_peak['ECG_R_Offsets'])-np.array(waves_peak['ECG_R_Onsets']))/sampling_rate*1000 # in ms
        qrs_duration = qrs_duration[~np.isnan(qrs_duration)]

        p_duration = (np.array(waves_peak['ECG_P_Offsets'])-np.array(waves_peak['ECG_P_Onsets']))/sampling_rate*1000 # in ms
        p_duration = p_duration[~np.isnan(p_duration)]

        pr_duration = (np.array(waves_peak['ECG_R_Onsets'])-np.array(waves_peak['ECG_P_Onsets']))/sampling_rate*1000 # in ms
        pr_duration = pr_duration[~np.isnan(pr_duration)]

        rwave_peaktime = rpeaks-waves_peak['ECG_R_Onsets']
        rwave_peaktime = rwave_peaktime[~np.isnan(rwave_peaktime)]

        features_sample.extend(get_statistics(rr_interval))
        features_sample.extend(get_statistics(pp_interval))
        features_sample.extend(get_statistics(qrs_duration))
        features_sample.extend(get_statistics(p_duration))
        features_sample.extend(get_statistics(pr_duration))
        features_sample.extend(get_statistics(rwave_peaktime))

        # get hrv features  (time, frequency, non-linear)
        df_hrv_time = nk.hrv_time(rpeaks, sampling_rate=sampling_rate)
        #df_hrv_time.dropna(axis=1, inplace=True)
        features_sample.extend(df_hrv_time.values[0])
        
        df_hrv_frequency = nk.hrv_frequency(rpeaks, sampling_rate)
        #df_hrv_frequency.dropna(axis=1, inplace=True)
        features_sample.extend(df_hrv_frequency.values[0])

        features_data.append(np.array(features_sample))
    features_data = np.stack(features_data)

    return features_data

In [35]:
def create_features_simple(X, sampling_rate=300, threshold=0.2):
    features_data = []
    for i in tqdm(range(len(X))):
        
        signal = X.iloc[i].dropna()
        
        features_sample = []
        
        if len(beats) < 2:
            print(i)
            continue
        
        ts, filtered, rpeaks, templates_ts, templates, heart_rate_ts, heart_rate = ecg.ecg(signal, sampling_rate, show=False)
        rpeaks = ecg.correct_rpeaks(signal=signal, rpeaks=rpeaks, sampling_rate=sampling_rate, tol=0.1)['rpeaks']

        ramps = signal[rpeaks]
        
        if len(heart_rate) < 2:
            heart_rate = [0, 1]
        if len(heart_rate_ts) < 2:
            heart_rate_ts = [0, 1]
        

        features_sample.extend(get_statistics(ramps))
        features_sample.extend(get_statistics(rpeaks))
        features_sample.extend(get_statistics(heart_rate))
        features_sample.extend(get_statistics(heart_rate_ts))
        features_sample.append(np.sum(filtered - signal))
        
        features_sample += list(np.mean(templates, axis=0))
        features_sample += list(np.std(templates, axis=0))
        features_sample += list(np.min(templates, axis=0))
        features_sample += list(np.max(templates, axis=0))
        
        features_sample = np.array(features_sample)
        features_sample[np.isnan(features_sample)] = 0
        
        features_data.append(features_sample)
        
    return np.stack(features_data, axis=0)

In [65]:
def create_features_other(X, sampling_rate=300, threshold=0.2):
    features_data = []
    
    
    for i in tqdm(range(len(X))):
        features_sample = []
        
        signal = X.iloc[i].dropna()

        corr = signal.autocorr(lag=2)
        features_sample.append(corr)

        features_sample.append(np.average(signal))
        features_sample.append(np.ptp(signal))

        f = np.fft.fft(signal)
        array = f[0:800]
        n = 15
        indices = array.argsort()[-n:][::-1]
        
       # if len(indices) == 1:
       #     indices = [np.nan for _ in range(n)]
        features_sample.extend(indices)
        features_data.append(np.array(features_sample))

    return np.stack(features_data, axis=0)

In [69]:
%%time
features_train = create_features_other(X)

100%|██████████| 5117/5117 [00:41<00:00, 122.36it/s]

CPU times: user 40.9 s, sys: 716 ms, total: 41.6 s
Wall time: 41.8 s





In [70]:
features_train.shape

(5117, 18)

In [71]:
%%time
features_test = create_features_other(X_test)

100%|██████████| 3411/3411 [00:27<00:00, 123.34it/s]

CPU times: user 26.9 s, sys: 559 ms, total: 27.5 s
Wall time: 27.7 s





In [83]:
pd.DataFrame(features_train).to_csv('data/features_train_fft.csv')
pd.DataFrame(features_test).to_csv('data/features_test_fft.csv')

In [84]:
df_fft = pd.read_csv('data/features_test_fft.csv')
df_fft.pop('Unnamed: 0')
df_fft.shape

(3411, 18)

In [None]:
clf = XGBClassifier(learning_rate=0.05, n_estimators=300, max_depth=5)

In [129]:
%%time
clf.fit(features_train, y_train.y.values.ravel())





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=10,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [130]:
y_pred = clf.predict(features_val)

In [131]:
f1_score(y_val.y.values, y_pred, average='micro')

0.7859375

In [132]:
y_result = clf.predict(features_test)

In [136]:
utils.results_to_csv(y_result, 'results.csv')