In [3]:
import csv
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

import biosppy.signals.ecg as ecg
import neurokit2 as nk

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import scipy.stats as stats

In [4]:
# Data import ---------------------------------------------------------------------------------------------------
# Load the labels, the training signals and the test signals; each file is indexed by its 'id' column.
y_train_raw, X_train_raw, X_test_raw = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/y_train.csv', 'data/X_train.csv', "data/X_test.csv")
)

In [5]:
# FUNCTION DEFINITIONS ------------------------------------------------------------------------------------------

# loop over rows and extract features from single observations --------------------------------------------------
def generate_features(X):
    """Build the feature table for every recording (row) of ``X``.

    For each row the R-peak features (``compute_rpeaks_features``) and the
    P/Q/S/T features (``compute_PQST_features``) are computed and placed side
    by side. Progress is printed every 100 rows.

    Parameters
    ----------
    X : pandas.DataFrame
        One recording per row. The row labels (``X.index``) are used as the
        ids of the returned table.

    Returns
    -------
    pandas.DataFrame
        One row per recording, indexed by ``id``. ``inf`` / ``-inf`` values
        are replaced by ``NaN``. For an input without rows an empty frame
        with an ``id`` index is returned.
    """
    n_rows = X.shape[0]
    per_row_features = []

    # loop over the rows
    for row_idx in range(n_rows):

        # show progress
        if (row_idx % 100) == 0:
            print(round(row_idx/n_rows * 100, 1), "% completed")

        # compute the features
        signal_row = X.iloc[row_idx]
        rpeaks_features = compute_rpeaks_features(signal_row)
        PQST_features = compute_PQST_features(signal_row)

        # merge the features and add the id.
        # Use the row's own label rather than the loop counter so the features
        # stay aligned with the labels even when the ids are not 0..n-1.
        features_single_obs = pd.concat([rpeaks_features, PQST_features], axis=1)
        df_id = pd.DataFrame({"id": [X.index[row_idx]]})
        per_row_features.append(pd.concat([df_id, features_single_obs], axis=1))

    # pd.concat refuses an empty list, so handle "no rows" explicitly
    if not per_row_features:
        return pd.DataFrame(index=pd.Index([], name="id"))

    X_features = pd.concat(per_row_features)
    X_features = X_features.replace([np.inf, -np.inf], np.nan)  # (some values are inf, -inf)
    X_features = X_features.set_index('id')

    return X_features




# Compute R-peaks-related features of a single signal ------------------------------------------------------------
def compute_rpeaks_features(ecg_signal):
    """Compute the R-peak related features of a single ECG recording.

    The recording is stripped of missing samples, passed through
    ``nk.ecg_invert`` and cleaned with ``nk.ecg_clean``. R peaks are detected
    on the cleaned trace, which is also the trace the amplitudes are read
    from and the trace ``compute_PQST_features`` detects its peaks on.

    Two groups of features are produced, each with its own NaN fallback:

    * ``R_amplitude_*``: summary statistics of the cleaned signal at the
      R-peak positions.
    * ``R_ECG_Rate_Mean`` / ``R_HRV_*``: the interval-related output of
      ``nk.bio_process`` + ``nk.bio_analyze``, prefixed with ``R_``.

    Parameters
    ----------
    ecg_signal : pandas.Series
        One recording; missing samples are dropped before processing.

    Returns
    -------
    pandas.DataFrame
        Amplitude and HRV features side by side. If inversion, cleaning or
        peak detection fails, a single all-NaN row with the full column
        schema is returned instead.

    Notes
    -----
    Only ``Exception`` is caught, so ``KeyboardInterrupt`` still stops a
    long-running extraction instead of being turned into NaN rows.
    NOTE(review): features cached with the previous version (peaks detected
    on the raw signal) may differ slightly; recompute train and test together.
    """
    amplitude_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    colnames_amplitude = ['R_amplitude_' + stat for stat in amplitude_stats]

    colnames_hrv = ['R_ECG_Rate_Mean', 'R_HRV_MeanNN', 'R_HRV_SDNN', 'R_HRV_SDANN1',
                'R_HRV_SDNNI1', 'R_HRV_SDANN2', 'R_HRV_SDNNI2', 'R_HRV_SDANN5',
                'R_HRV_SDNNI5', 'R_HRV_RMSSD', 'R_HRV_SDSD', 'R_HRV_CVNN', 'R_HRV_CVSD',
                'R_HRV_MedianNN', 'R_HRV_MadNN', 'R_HRV_MCVNN', 'R_HRV_IQRNN',
                'R_HRV_Prc20NN', 'R_HRV_Prc80NN', 'R_HRV_pNN50', 'R_HRV_pNN20',
                'R_HRV_MinNN', 'R_HRV_MaxNN', 'R_HRV_HTI', 'R_HRV_TINN', 'R_HRV_ULF',
                'R_HRV_VLF', 'R_HRV_LF', 'R_HRV_HF', 'R_HRV_VHF', 'R_HRV_LFHF',
                'R_HRV_LFn', 'R_HRV_HFn', 'R_HRV_LnHF', 'R_HRV_SD1', 'R_HRV_SD2',
                'R_HRV_SD1SD2', 'R_HRV_S', 'R_HRV_CSI', 'R_HRV_CVI',
                'R_HRV_CSI_Modified', 'R_HRV_PIP', 'R_HRV_IALS', 'R_HRV_PSS',
                'R_HRV_PAS', 'R_HRV_GI', 'R_HRV_SI', 'R_HRV_AI', 'R_HRV_PI',
                'R_HRV_C1d', 'R_HRV_C1a', 'R_HRV_SD1d', 'R_HRV_SD1a', 'R_HRV_C2d',
                'R_HRV_C2a', 'R_HRV_SD2d', 'R_HRV_SD2a', 'R_HRV_Cd', 'R_HRV_Ca',
                'R_HRV_SDNNd', 'R_HRV_SDNNa', 'R_HRV_DFA_alpha1',
                'R_HRV_MFDFA_alpha1_Width', 'R_HRV_MFDFA_alpha1_Peak',
                'R_HRV_MFDFA_alpha1_Mean', 'R_HRV_MFDFA_alpha1_Max',
                'R_HRV_MFDFA_alpha1_Delta', 'R_HRV_MFDFA_alpha1_Asymmetry',
                'R_HRV_MFDFA_alpha1_Fluctuation', 'R_HRV_MFDFA_alpha1_Increment',
                'R_HRV_ApEn', 'R_HRV_SampEn', 'R_HRV_ShanEn', 'R_HRV_FuzzyEn',
                'R_HRV_MSEn', 'R_HRV_CMSEn', 'R_HRV_RCMSEn', 'R_HRV_CD', 'R_HRV_HFD',
                'R_HRV_KFD', 'R_HRV_LZC']

    # full fallback schema: amplitude columns first, then the HRV columns
    colnames_all = colnames_amplitude + colnames_hrv

    # drop the missing samples (the NaN padding of the recording)
    ecg_signal = ecg_signal.dropna().to_numpy(dtype='float32')

    try:
        # inversion of flipped signals (inside the guard: one unusable
        # recording must not abort the extraction of all the others)
        ecg_signal, _ = nk.ecg_invert(ecg_signal, sampling_rate=300, force=False, show=False)
        ecg_signal = pd.Series(ecg_signal)

        # compute R peaks on the cleaned trace
        clean_ecg_signal = nk.ecg_clean(ecg_signal, sampling_rate=300)
        _, rpeaks = nk.ecg_peaks(clean_ecg_signal, sampling_rate=300)

        try:
            # compute amplitude-related features
            clean_series = pd.Series(clean_ecg_signal)
            peak_positions = pd.Series(rpeaks['ECG_R_Peaks']).dropna().astype(int)  # remove invalid peak locations 'nan'
            peak_amplitudes = clean_series[peak_positions]
            # summary statistics except the first entry (count = number of peaks)
            amplitude_features = pd.DataFrame(peak_amplitudes.describe().iloc[1:]).transpose()
            amplitude_features.columns = 'R' + '_amplitude_' + amplitude_features.columns
        except Exception:
            # row of nan with the same column names
            amplitude_features = pd.DataFrame(np.nan, index=[0], columns=colnames_amplitude)

        try:
            # compute ALL the available HRV features (bio_process gets the uncleaned signal)
            processed_data, _ = nk.bio_process(ecg=ecg_signal, sampling_rate=300)
            rpeaks_features = nk.bio_analyze(processed_data, sampling_rate=300, method='interval-related')
            rpeaks_features.columns = 'R_' + rpeaks_features.columns
        except Exception:
            # row of nan with the same column names
            rpeaks_features = pd.DataFrame(np.nan, index=[0], columns=colnames_hrv)

        # combine the computed features
        features = pd.concat([amplitude_features, rpeaks_features], axis=1)

    except Exception:
        # row of nan with the complete schema
        features = pd.DataFrame(np.nan, index=[0], columns=colnames_all)

    return features




# Compute the features related to the other kind of peaks ----------------------------------------------------------
def compute_PQST_features(ecg_signal):
    """Compute the P/Q/S/T-peak and interval features of a single ECG recording.

    The recording is stripped of missing samples, passed through
    ``nk.ecg_invert``, cleaned, and delineated with
    ``nk.ecg_delineate(method="peak")``. For each of the P, Q, S and T peaks
    ``feature_extractor`` supplies HRV-style and amplitude features, and
    ``compute_interval_features`` adds statistics of the distances between
    the delineated points.

    Parameters
    ----------
    ecg_signal : pandas.Series
        One recording; missing samples are dropped before processing.

    Returns
    -------
    pandas.DataFrame
        The per-peak features followed by the interval features. If any step
        fails, a single all-NaN row with the full fallback schema
        (4 x (24 HRV + 7 amplitude) + 35 interval columns) is returned.

    Notes
    -----
    Only ``Exception`` is caught, so ``KeyboardInterrupt`` still stops a
    long-running extraction instead of being turned into NaN rows.
    """
    # Fallback schema, generated instead of typed out: for every peak type the
    # HRV columns then the amplitude columns, followed by the interval columns
    # (statistic-major, interval-minor).
    hrv_metrics = ['MeanNN', 'SDNN', 'SDANN1', 'SDNNI1', 'SDANN2', 'SDNNI2', 'SDANN5', 'SDNNI5',
                   'RMSSD', 'SDSD', 'CVNN', 'CVSD', 'MedianNN', 'MadNN', 'MCVNN', 'IQRNN',
                   'Prc20NN', 'Prc80NN', 'pNN50', 'pNN20', 'MinNN', 'MaxNN', 'HTI', 'TINN']
    summary_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    interval_names = ['Q_Ponset', 'Toffset_Q', 'SQ', 'QP', 'TS']

    colnames = []
    for peak_type in ('P', 'Q', 'S', 'T'):
        colnames += [peak_type + '_HRV_' + metric for metric in hrv_metrics]
        colnames += [peak_type + '_amplitude_' + stat for stat in summary_stats]
    colnames += [stat + name for stat in summary_stats for name in interval_names]

    # drop the missing samples (the NaN padding of the recording)
    ecg_signal = ecg_signal.dropna().to_numpy(dtype='float32')

    try:
        # inversion of flipped signals (inside the guard: one unusable
        # recording must not abort the extraction of all the others)
        ecg_signal, _ = nk.ecg_invert(ecg_signal, sampling_rate=300, force=False, show=False)
        ecg_signal = pd.Series(ecg_signal)

        # compute PQST peaks
        ecg_signal = nk.ecg_clean(ecg_signal, sampling_rate=300)
        _, rpeaks = nk.ecg_peaks(ecg_signal, sampling_rate=300)
        _, waves_peak = nk.ecg_delineate(ecg_signal, rpeaks, sampling_rate=300, method="peak")

        # compute PQST peak related features
        PQST_peak_related_features = pd.concat(
            [
                feature_extractor(waves_peak, ecg_signal, peak_type=peak_type)
                for peak_type in ('P', 'Q', 'S', 'T')
            ],
            axis=1
        )

        # compute interval related features
        interval_features = compute_interval_features(waves_peak)

        # combine the computed features
        features = pd.concat([PQST_peak_related_features, interval_features], axis=1)

    except Exception:
        # row of nan with the complete schema
        features = pd.DataFrame(np.nan, index=[0], columns=colnames)

    return features




# Compute HRV and Amplitude -related features for the other peaks ----------------------------------------------------
def feature_extractor(waves_peak, ecg_signal, peak_type):
    """Compute HRV-style and amplitude features for one kind of peak.

    Parameters
    ----------
    waves_peak : mapping
        Delineation result; the entry ``'ECG_<peak_type>_Peaks'`` is read and
        may contain ``nan`` for peaks that were not found.
    ecg_signal : array-like
        Signal the amplitudes are read from, indexed by sample position.
    peak_type : str
        Peak letter, one of {P, Q, S, T}. It selects the ``waves_peak`` entry
        and is used as the column prefix.

    Returns
    -------
    pandas.DataFrame
        ``<peak_type>_HRV_*`` columns followed by ``<peak_type>_amplitude_*``
        columns. Each group independently falls back to a single all-NaN row
        when its computation fails.

    Notes
    -----
    Only ``Exception`` is caught, so ``KeyboardInterrupt`` is not swallowed.
    """
    hrv_metrics = ['MeanNN', 'SDNN', 'SDANN1', 'SDNNI1',
                   'SDANN2', 'SDNNI2', 'SDANN5', 'SDNNI5',
                   'RMSSD', 'SDSD', 'CVNN', 'CVSD',
                   'MedianNN', 'MadNN', 'MCVNN', 'IQRNN',
                   'Prc20NN', 'Prc80NN', 'pNN50', 'pNN20',
                   'MinNN', 'MaxNN', 'HTI', 'TINN']
    amplitude_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

    # fallback column names, derived from the peak letter
    colnames_hrv = [peak_type + '_HRV_' + metric for metric in hrv_metrics]
    colnames_amplitude = [peak_type + '_amplitude_' + stat for stat in amplitude_stats]

    peak_name = 'ECG_' + peak_type + "_Peaks"

    # calculate HRV features
    try:
        peak_locations = {
            'ECG_R_Peaks': np.array(waves_peak[peak_name]), # has be to called ECG_R_Peaks that hrv_time() works
            'sampling_rate': 300
        }

        hrv_features = nk.hrv_time(peak_locations, sampling_rate=300)
        hrv_features.columns = peak_type + '_' + hrv_features.columns
    except Exception:
        hrv_features = pd.DataFrame(np.nan, index=[0], columns=colnames_hrv)

    # calculate amplitude features
    try:
        signal = pd.Series(ecg_signal)
        peak_positions = pd.Series(waves_peak[peak_name]).dropna().astype(int)  # remove invalid peak locations 'nan'
        peak_amplitudes = signal[peak_positions]
        # summary statistics except the first entry (count = number of peaks)
        amplitude_features = pd.DataFrame(peak_amplitudes.describe().iloc[1:]).transpose()
        amplitude_features.columns = peak_type + '_amplitude_' + amplitude_features.columns
    except Exception:
        # The fallback must fill amplitude_features; assigning it to
        # hrv_features would discard the HRV result and leave
        # amplitude_features undefined for the concat below.
        amplitude_features = pd.DataFrame(np.nan, index=[0], columns=colnames_amplitude)

    # combine all features
    features = pd.concat([hrv_features, amplitude_features], axis=1)

    return features




# Compute features for various Intervals --------------------------------------------------------------------------
def compute_interval_features(waves_peak):
    """Summarise the distances between delineated ECG points.

    Five per-beat differences are formed from ``waves_peak``:

    * ``Q_Ponset``  = ``ECG_Q_Peaks``   - ``ECG_P_Onsets``
    * ``Toffset_Q`` = ``ECG_T_Offsets`` - ``ECG_Q_Peaks``
    * ``SQ``        = ``ECG_S_Peaks``   - ``ECG_Q_Peaks``
    * ``QP``        = ``ECG_Q_Peaks``   - ``ECG_P_Peaks``
    * ``TS``        = ``ECG_T_Peaks``   - ``ECG_S_Peaks``

    and each is reduced to mean, std, min, 25%, 50%, 75% and max.

    Parameters
    ----------
    waves_peak : mapping
        Anything ``pandas.DataFrame`` accepts that yields the columns named
        above, with one entry per beat.

    Returns
    -------
    pandas.DataFrame
        One row with 35 columns named ``<statistic><interval>`` (for example
        ``meanQ_Ponset``), ordered statistic-major. A single all-NaN row with
        the same columns is returned when the computation fails.

    Notes
    -----
    Only ``Exception`` is caught, so ``KeyboardInterrupt`` is not swallowed.
    """
    summary_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    interval_names = ['Q_Ponset', 'Toffset_Q', 'SQ', 'QP', 'TS']

    # fallback schema; kept in its own name so the try-block cannot overwrite it
    fallback_colnames = [stat + name for stat in summary_stats for name in interval_names]

    try:
        peak_df = pd.DataFrame(waves_peak)

        interval_features = pd.DataFrame()
        interval_features['Q_Ponset'] = peak_df['ECG_Q_Peaks'] - peak_df['ECG_P_Onsets']
        interval_features['Toffset_Q'] = peak_df['ECG_T_Offsets'] - peak_df['ECG_Q_Peaks']
        interval_features['SQ'] = peak_df['ECG_S_Peaks'] - peak_df['ECG_Q_Peaks']

        interval_features['QP'] = peak_df['ECG_Q_Peaks'] - peak_df['ECG_P_Peaks']
        interval_features['TS'] = peak_df['ECG_T_Peaks'] - peak_df['ECG_S_Peaks']

        # drop the first describe() row (count); rows are statistics, columns are intervals
        describe_features = interval_features.describe().iloc[1:]

        # flatten row by row; a shape mismatch raises and triggers the NaN fallback
        feature_row = pd.DataFrame(describe_features.values.reshape(1, len(fallback_colnames)))
        feature_row.columns = [
            stat + name
            for stat in describe_features.index
            for name in describe_features.columns
        ]

    except Exception:
        feature_row = pd.DataFrame(np.nan, index=[0], columns=fallback_colnames)

    return feature_row

In [6]:
# Extract features for Training ------------------------------------------------------------------------
# Set RECOMPUTE_TRAIN to True to rebuild the features from X_train_raw and overwrite the cached CSV;
# otherwise the cached CSV is loaded.

RECOMPUTE_TRAIN = False

if RECOMPUTE_TRAIN:
    # Silence warnings only while the features are computed. catch_warnings restores the
    # previous filter state on exit (also when generate_features raises), whereas
    # warnings.resetwarnings() would discard every filter, including the default ones.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # compute features
        X_train = generate_features(X_train_raw)

    # write data
    X_train.to_csv("data/X_train2_processed.csv", index_label=False)

else:
    # read data
    # NOTE(review): the file is written with index_label=False and read back without index_col;
    # confirm that the ids come back as the index here, as in the recompute branch.
    X_train = pd.read_csv("data/X_train2_processed.csv")


# drop the features that are missing for every recording
# NOTE(review): the test features are filtered independently; verify that train and test
# end up with the same columns before fitting a model.
X_train = X_train.dropna(axis=1, how='all')

In [7]:
# Extract features for Testing ------------------------------------------------------------------------
# Set RECOMPUTE_TEST to True to rebuild the features from X_test_raw and overwrite the cached CSV;
# otherwise the cached CSV is loaded.

RECOMPUTE_TEST = False

if RECOMPUTE_TEST:
    # Silence warnings only while the features are computed. catch_warnings restores the
    # previous filter state on exit (also when generate_features raises), whereas
    # warnings.resetwarnings() would discard every filter, including the default ones.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # compute features
        X_test = generate_features(X_test_raw)

    # write data
    X_test.to_csv("data/X_test2_processed.csv", index_label=False)

else:
    # read data
    # NOTE(review): the file is written with index_label=False and read back without index_col;
    # confirm that the ids come back as the index here, as in the recompute branch.
    X_test = pd.read_csv("data/X_test2_processed.csv")

# drop the features that are missing for every recording
# NOTE(review): the training features are filtered independently; verify that train and test
# end up with the same columns before predicting.
X_test = X_test.dropna(axis=1, how='all')