In [1]:
import numpy as np
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import random
import time 

import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, GroupKFold
from sklearn.svm import LinearSVC, SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle

from make_classification import *

# Import and prepare dataset

In [2]:
# Directory holding the label file, relative to the working directory.
path_labels = '../Dataset/'

# Comma-separated label table; the first row is the header and the first
# column is used as the row index.
labels = pd.read_csv(
    Path(path_labels) / 'labels.txt',
    sep=",",
    header=0,
    index_col=0,
)
labels.head()

Unnamed: 0_level_0,3class-label,binary-label
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1
2ea4_Breathing,0,0
2ea4_Counting1,1,1
2ea4_Counting2,1,1
2ea4_Counting3,1,1
2ea4_Math,1,1


In [3]:
# Target vector: the 'binary-label' column of the label table.
y = labels.loc[:, 'binary-label']

# Class balance of the target.
y.value_counts()

0    258
1    202
Name: binary-label, dtype: int64

In [4]:
# Per-modality feature tables; the first CSV column is used as the row index.
x_ecg = pd.read_csv('Features/ecg_features.csv', sep=",", header=0, index_col=0)
x_eda = pd.read_csv('Features/eda_features.csv', sep=",", header=0, index_col=0)
x_resp = pd.read_csv('Features/resp_features.csv', sep=",", header=0, index_col=0)

# Row keys that have a label. Selecting with this list keeps only labelled
# rows and returns them in the same order as `labels` (and therefore `y`).
# A label key missing from a feature table raises KeyError.
_labelled_rows = list(labels.index)

# --------- TO FIX: Remove rows without a label --------------#
x_ecg = x_ecg.loc[_labelled_rows]
x_eda = x_eda.loc[_labelled_rows]
x_resp = x_resp.loc[_labelled_rows]
# ------------------------------------------------------------#

# Guard against silent X/y misalignment: if a feature file contains duplicated
# row keys, the selection above returns extra rows and no longer matches `y`.
for _modality, _features in (('ECG', x_ecg), ('EDA', x_eda), ('Resp', x_resp)):
    assert _features.index.equals(labels.index), (
        f"{_modality} feature rows do not match the label rows one-to-one"
    )

In [5]:
# Combined feature table for all modalities; the first CSV column is used as
# the row index.
x = pd.read_csv(
    'Features/all_physiological_features.csv',
    sep=",",
    header=0,
    index_col=0,
)
x.head()

Unnamed: 0,meanHR,minHR,maxHR,sdHR,modeHR,nNN,meanNN,SDSD,CVNN,SDNN,...,VHF_rsp,LF/HF_rsp,rLF_rsp,rHF_rsp,peakLF_rsp,peakHF_rsp,SD1_rrv,SD2_rrv,SD1SD2_rrv,apEn_rrv
2ea4_Baseline,63.43094,57.034221,79.575597,4.686314,22.541376,62.0,950.677419,42.033401,64.843179,0.068207,...,2.403554,1.025407,50.627207,49.372793,0.131387,0.175182,816.718056,333.265493,2.450653,0.105361
2ea4_Breathing,61.712465,45.87156,84.269663,11.008515,38.398103,59.666667,1002.893855,106.659486,173.316531,0.172816,...,9.995773,2.72027,73.120232,26.879768,0.117188,0.164062,1119.779881,647.574415,1.729191,0.013791
2ea4_Counting1,70.649419,58.252427,82.872928,5.750703,24.620501,69.0,855.101449,43.670598,72.095377,0.084312,...,0.923779,1.881455,65.295311,34.704689,0.086957,0.173913,560.140756,681.563788,0.821846,0.105361
2ea4_Counting2,63.913828,52.910053,79.787234,5.890798,26.877181,63.0,946.47619,58.622906,84.207389,0.088969,...,0.292025,15.878445,94.075284,5.924716,0.082192,0.164384,538.632539,525.776932,1.024451,0.105361
2ea4_Counting3,64.321357,49.751244,78.534031,5.908397,28.782788,62.0,941.032258,39.985888,90.192441,0.095844,...,1.063001,33.239731,97.079416,2.920584,0.09375,0.1875,908.089478,816.368158,1.112353,0.117783


#### Pre-processing of the data

In [6]:
# Disabled pre-processing step, kept for reference only. It is not executed.
# NOTE(review): presumably inf/NaN handling happens inside the
# make_classification helpers instead; confirm before relying on it.
#
#   # remove the inf values and columns with more than 20% missing data
#   x.replace([np.inf, -np.inf], np.nan, inplace=True)
#   nan_cols = [i for i in x.columns if x[i].isnull().sum() > 0.2*len(x)]
#   x.drop(columns=nan_cols, inplace=True)

# --------- TO FIX: Remove rows without a label --------------#
# Keeps only labelled rows, in the same order as `labels` (and therefore `y`).
x = x.loc[list(labels.index)]
# ------------------------------------------------------------#

# Guard against silent X/y misalignment: duplicated row keys in the feature
# file would make the selection above return extra rows.
assert x.index.equals(labels.index), (
    "combined feature rows do not match the label rows one-to-one"
)

# Classification

Feature selection using L1 penalty or Recursive Feature Elimination (RFE). The optimal number of features is determined using RFECV. Several models are tested: Logistic Regression, Linear SVM, Decision Trees, Random Forests, K-nearest neighbors, AdaBoost, Gradient Boosting, and a multi-layer perceptron (MLP).

The models are fitted n times, and the average scores over n repetitions are reported.

## One dataset (example: ECG features only)

In [7]:
# Feature-selection strategy passed to the make_classification helpers.
feature_selector = "l1"

# Number of splits requested from the helpers.
n_splits = 10

# Candidate classifiers. Every estimator that accepts a seed is pinned to
# random_state=0; KNeighborsClassifier takes none.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    KNeighborsClassifier(n_neighbors=3),
    AdaBoostClassifier(n_estimators=100, random_state=0),
    GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=2,
        random_state=0,
    ),
    MLPClassifier(random_state=0, max_iter=5000),
]

In [11]:
# Run every configured classifier on the ECG features only.
res = make_nclassif(
    x_ecg,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)

# Display the per-classifier summary (f1-score, accuracy, time).
avg_res(res)

Split  1/10
Split  2/10
Split  3/10
Split  4/10
Split  5/10
Split  6/10
Split  7/10
Split  8/10
Split  9/10
Split 10/10


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.65141,0.64774,0.786028
MLPClassifier,0.696248,0.70184,1.539224
RandomForestClassifier,0.696773,0.703476,0.916779


## Comparison of several datasets (example: ECG only, EDA only, respiration only, and all 3 modalities)

In [12]:
# (name, feature table) pairs to benchmark; the names are the keys later used
# to look up results in `res_benchmark`.
list_datasets = [
    ('ECG', x_ecg),
    ('EDA', x_eda),
    ('Resp', x_resp),
    ('all', x),
]

In [13]:
# Benchmark every dataset in `list_datasets` with the shared configuration.
# The result is indexed by dataset name in the cells that follow.
res_benchmark = benchmark_nrep(
    list_datasets,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
    verbose=False,
)

#### ECG results

In [14]:
# Show the f1-score / accuracy / time summary for the ECG-only benchmark results.
avg_res(res_benchmark['ECG'])

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.65141,0.64774,0.889001
MLPClassifier,0.696248,0.70184,1.677279
RandomForestClassifier,0.696773,0.703476,0.913316


#### EDA results

In [15]:
# Show the f1-score / accuracy / time summary for the EDA-only benchmark results.
avg_res(res_benchmark['EDA'])

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.572172,0.57444,0.368029
MLPClassifier,0.618292,0.629351,1.168751
RandomForestClassifier,0.616478,0.626604,0.467608


#### Respiration results

In [16]:
# Show the f1-score / accuracy / time summary for the respiration-only benchmark results.
avg_res(res_benchmark['Resp'])

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.605241,0.606783,1.72405
MLPClassifier,0.646507,0.653492,2.673042
RandomForestClassifier,0.667298,0.68312,1.813695


#### All modalities combined

In [17]:
# Show the f1-score / accuracy / time summary for the combined-modalities benchmark results.
avg_res(res_benchmark['all'])

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.648601,0.648696,5.344408
MLPClassifier,0.608135,0.604444,7.593
RandomForestClassifier,0.673769,0.675675,5.351356


# Test: random splits instead of splits grouped by subject
#### All modalities

In [8]:
# Same configuration as the grouped runs, but using the random-split helper
# on the combined feature table.
resrandom = make_nclassif_random_splits(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)

Split  1/10
Split  2/10
Split  3/10
Split  4/10
Split  5/10
Split  6/10
Split  7/10
Split  8/10
Split  9/10
Split 10/10


In [9]:
# Show the f1-score / accuracy / time summary for the random-split run on all modalities.
avg_res(resrandom)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.654291,0.628659,6.062592
MLPClassifier,0.697504,0.675912,7.683664
RandomForestClassifier,0.712399,0.717771,5.930669


#### ECG only

In [10]:
# Random-split run restricted to the ECG features, with verbose turned off.
resrandomecg = make_nclassif_random_splits(
    x_ecg,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
    verbose=False,
)

# Display the per-classifier summary (f1-score, accuracy, time).
avg_res(resrandomecg)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GradientBoostingClassifier,0.697504,0.675912,0.65476
MLPClassifier,0.707458,0.684386,6.665223
RandomForestClassifier,0.733347,0.73472,0.74166
