# WESAD Validation Notebook for FLIRT


In [5]:
# Import Packages
import pandas as pd
import numpy as np

import matplotlib; matplotlib.use('agg')
import matplotlib.pyplot as plt

import multiprocessing
from joblib import Parallel, delayed
from tqdm.autonotebook import trange
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.utils.class_weight import compute_class_weight

from datetime import datetime, timedelta, timezone

from typing import List
import lightgbm as lgb
import glob2
import os 


from sklearn.preprocessing import MinMaxScaler
from sklearn import utils, model_selection, metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from pykliep import DensityRatioEstimator 
import flirt.simple

The following function retrieves the HRV, EDA, ACC, BVP and temperature features of one subject using the FLIRT pipeline.


In [17]:
def get_features_per_subject(path, window_length, window_step_size=0.25):
    """Extract FLIRT features for one subject's Empatica archive.

    HRV, EDA, ACC, BVP and temperature feature groups are all enabled and
    FLIRT is run with ``debug=True``.

    Parameters
    ----------
    path : str
        Path of the Empatica zip archive, forwarded as ``zip_file_path``.
    window_length : int or float
        Sliding-window length forwarded to FLIRT (the callers in this
        notebook pass 60 and describe it as seconds).
    window_step_size : int or float, default 0.25
        Step between consecutive windows, forwarded to FLIRT. The default
        keeps the value that used to be hard-coded here, so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    The object returned by ``flirt.simple.get_features_for_empatica_archive``;
    the callers in this notebook use it as a timestamp-indexed DataFrame.
    """
    return flirt.simple.get_features_for_empatica_archive(
        zip_file_path=path,
        window_length=window_length,
        window_step_size=window_step_size,
        hrv_features=True,
        eda_features=True,
        acc_features=True,
        bvp_features=True,
        temp_features=True,
        debug=True,
    )

The following function reads the start and end offsets of each relevant analysis period (baseline, stress, amusement) from the subject's questionnaire file. These offsets, written as `minutes.seconds`, are added to the timestamp of the start of the recording to obtain the absolute start and end timestamps of each period for that subject.

In [18]:
def _min_sec_to_timedelta(value):
    """Turn one ``minutes.seconds`` protocol entry (e.g. 7.08) into a timedelta."""
    minutes_part, _, seconds_part = str(value).strip().partition('.')
    # A numeric round-trip drops trailing zeros ("50.30" is read back as 50.3),
    # so a single fractional digit stands for tens of seconds; a missing
    # fractional part means zero seconds.
    # NOTE(review): assumes the source file writes seconds with two digits - confirm.
    seconds_part = (seconds_part + '00')[:2]
    return timedelta(minutes=int(minutes_part), seconds=int(seconds_part))


def find_label_timestamps(csv_path, StartingTime):
    """Return the absolute start/end time and class label of each study period.

    Parameters
    ----------
    csv_path : str
        Any path of the form ``WESAD/<ID>/...``; only the second path
        component (the subject ID) is used to locate ``WESAD/<ID>/*quest.csv``.
    StartingTime : datetime-like
        Timestamp the protocol offsets are added to.

    Returns
    -------
    dict
        ``{mode: [start, end, label]}`` for every one of ``'Base'`` (label 0),
        ``'TSST'`` (label 1) and ``'Fun'`` (label 2) present in the file.
    """
    ID = csv_path.split('/', 3)[1]
    quest_file = glob2.glob('WESAD/' + ID + '/*quest.csv')[0]
    # The second line of the file holds the column names; the two rows after it
    # are the start and end offsets. Columns with a missing value are dropped.
    df_timestamp = pd.read_csv(quest_file, delimiter=';', header=1).iloc[:2, :].dropna(axis=1)
    print('===================================')
    print('Printing the timestamp for {0}'.format(ID))
    print('===================================')
    print(df_timestamp.head())

    # Start/End of experiment periods
    print('\nStart of the baseline: ' + str(df_timestamp['Base'][0]))
    print('End of the baseline: ' + str(df_timestamp['Base'][1]))
    print('Start of the fun: ' + str(df_timestamp['Fun'][0]))
    print('End of the fun: ' + str(df_timestamp['Fun'][1]))
    print('Start of the stress: ' + str(df_timestamp['TSST'][0]))
    print('End of the stress: ' + str(df_timestamp['TSST'][1]))

    # Map each period of interest to its class label
    lab_dict = {'Base': 0, 'TSST': 1, 'Fun': 2}
    labels_times_dict = {}
    for mode in df_timestamp.columns.tolist():
        print('mode', mode)
        if mode in lab_dict:
            start_offset = _min_sec_to_timedelta(df_timestamp[mode].iloc[0])
            end_offset = _min_sec_to_timedelta(df_timestamp[mode].iloc[1])
            labels_times_dict[mode] = [StartingTime + start_offset,
                                       StartingTime + end_offset,
                                       lab_dict[mode]]

    return labels_times_dict

In [30]:
import ast
from dateutil.parser import parse

def find_label_start_time(csv_path):
    """Read the recording start time from the subject's RespiBAN header.

    Parameters
    ----------
    csv_path : str
        Any path of the form ``WESAD/<ID>/...``; only the second path
        component (the subject ID) is used to locate
        ``WESAD/<ID>/*respiban.txt``.

    Returns
    -------
    pandas.Timestamp
        The header's date and time, interpreted as Europe/Berlin local time
        and converted to a timezone-naive UTC timestamp.

    Raises
    ------
    ValueError
        If the second line of the file carries no metadata.
    KeyError
        If the expected device entry is absent and the header does not
        contain exactly one device to fall back on.
    """
    ID = csv_path.split('/', 3)[1]
    header_path = glob2.glob('WESAD/' + ID + '/*respiban.txt')[0]

    # The context manager closes the file even when parsing fails below.
    with open(header_path, "r") as header_file:
        header_file.readline()                               # first line is not used
        header_line = header_file.readline().strip()[2:]     # drop the 2-character line prefix
    if not header_line:
        raise ValueError('No header metadata on line 2 of {0}'.format(header_path))

    header = ast.literal_eval(header_line)
    device_id = '00:07:80:D8:AB:58'
    if device_id not in header:
        # Fall back to the only device listed; refuse to guess between several.
        if len(header) != 1:
            raise KeyError(device_id)
        device_id = next(iter(header))
    device_info = header[device_id]

    datetime_str = device_info['date'] + " " + device_info['time']
    date_time_obj = pd.to_datetime(datetime_str).tz_localize("Europe/Berlin")
    # tz_convert(None) converts to UTC and drops the timezone information.
    utc_time = date_time_obj.tz_convert(None)

    print('===================================')
    print('Printing the timestamp for {0}'.format(ID))
    print('===================================')
    return utc_time

In [31]:
def main():
    """Build the labelled feature table for every WESAD subject.

    For each ``WESAD/**/*_readme.txt`` found, the subject's ``*_Data.zip`` is
    run through ``get_features_per_subject``; the period boundaries come from
    ``find_label_start_time`` and ``find_label_timestamps``. With ``t`` the
    (timezone-stripped) index timestamp of a row, a row is kept when
    ``t + window_length >= start`` and ``t <= end`` for any of the periods
    'Base', 'Fun' or 'TSST'. Kept rows receive the period's label only when
    ``start <= t <= end``; the remaining kept rows have no label (NaN).

    Returns
    -------
    pandas.DataFrame
        All subjects stacked, with an ``ID`` column first and a ``label``
        column; an empty DataFrame when no subject is found.
    """
    File_Path = glob2.glob('WESAD/**/*_readme.txt', recursive=True)
    window_length = 60  # in seconds
    # Informational only: the step size is not forwarded by this function.
    window_shift = 0.25  # in seconds
    window_span = timedelta(seconds=window_length)
    periods = ('Base', 'Fun', 'TSST')  # later periods win if ranges overlap

    per_subject = []
    for subject_path in File_Path:
        print(subject_path)
        ID = subject_path.split('/', 3)[1]
        print(ID)
        zip_path = glob2.glob('WESAD/' + ID + '/*_Data.zip')[0]
        print(zip_path)
        features = get_features_per_subject(zip_path, window_length)
        features.index.name = 'timedata'
        E4Time = features.index[0]
        print(E4Time)
        StartingTime = find_label_start_time(subject_path)
        print(StartingTime)
        labels_times = find_label_timestamps(subject_path, StartingTime)

        # Strip the timezone once so the index compares against the naive
        # period boundaries.
        naive_index = features.index.tz_localize(tz=None)
        keep = np.zeros(len(features), dtype=bool)
        for mode in periods:
            start, end, _ = labels_times[mode]
            keep |= (naive_index + window_span >= start) & (naive_index <= end)

        # Copy so the insert/assignments below never touch a view of `features`.
        relevant_features = features.loc[keep].copy()
        relevant_features.insert(0, 'ID', ID)

        kept_index = relevant_features.index.tz_localize(tz=None)
        for mode in periods:
            start, end, label = labels_times[mode]
            relevant_features.loc[(kept_index >= start) & (kept_index <= end), 'label'] = label

        per_subject.append(relevant_features)

    # Concatenate once instead of growing the frame inside the loop.
    df_all = pd.concat(per_subject) if per_subject else pd.DataFrame(None)

    print(df_all)

    return df_all

In [32]:
# Build the labelled feature table for all subjects and write it to CSV.
df_all = main()
df_all.to_csv('hrv_eda_acc_60_25.csv')

WESAD/S5/S5_readme.txt
S5
WESAD/S5/S5_E4_Data.zip
Reading files
Calculating HRV features
Cleaning data...
Calculate td features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1166.0), HTML(value='')))


Calculate fd features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1166.0), HTML(value='')))


Calculate stat features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1166.0), HTML(value='')))



Calculating EDA features


  return (a - mns) / sstd


HBox(children=(HTML(value='EDA features'), FloatProgress(value=0.0, max=30203.0), HTML(value='')))


Calculating ACC features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30203.0), HTML(value='')))


Calculating BVP features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30203.0), HTML(value='')))


Calculating temperature features


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30199.0), HTML(value='')))


2017-06-13 12:23:13+00:00


UnknownTimeZoneError: 'None'

# ============

In [None]:
def main():
    """Build the labelled feature table for every WESAD subject.

    This definition replaces any ``main`` defined earlier in the same kernel
    session. Here the period offsets are added to the first index timestamp
    of the subject's feature table. With ``t`` the index timestamp of a row,
    a row is kept when ``t + window_length >= start`` and ``t <= end`` for
    any of the periods 'Base', 'Fun' or 'TSST'. Kept rows receive the
    period's label only when ``start <= t <= end``; the remaining kept rows
    have no label (NaN).

    Returns
    -------
    pandas.DataFrame
        All subjects stacked, with an ``ID`` column first and a ``label``
        column; an empty DataFrame when no subject is found.
    """
    File_Path = glob2.glob('WESAD/**/*_readme.txt', recursive=True)
    window_length = 60  # in seconds
    # NOTE(review): this value is not forwarded to get_features_per_subject,
    # so it has no effect on the step size actually used - confirm intent.
    window_shift = 1  # in seconds
    window_span = timedelta(seconds=window_length)
    periods = ('Base', 'Fun', 'TSST')  # later periods win if ranges overlap

    per_subject = []
    for subject_path in File_Path:
        print(subject_path)
        ID = subject_path.split('/', 3)[1]
        print(ID)
        zip_path = glob2.glob('WESAD/' + ID + '/*_Data.zip')[0]
        print(zip_path)
        features = get_features_per_subject(zip_path, window_length)
        features.index.name = 'timedata'
        StartingTime = features.index[0]
        print(features)

        labels_times = find_label_timestamps(subject_path, StartingTime)

        feature_index = features.index
        keep = np.zeros(len(features), dtype=bool)
        for mode in periods:
            start, end, _ = labels_times[mode]
            keep |= (feature_index + window_span >= start) & (feature_index <= end)

        # Copy so the insert/assignments below never touch a view of `features`.
        relevant_features = features.loc[keep].copy()
        relevant_features.insert(0, 'ID', ID)

        kept_index = relevant_features.index
        for mode in periods:
            start, end, label = labels_times[mode]
            relevant_features.loc[(kept_index >= start) & (kept_index <= end), 'label'] = label

        per_subject.append(relevant_features)

    # Concatenate once instead of growing the frame inside the loop.
    df_all = pd.concat(per_subject) if per_subject else pd.DataFrame(None)

    print(df_all)

    return df_all

Run the labelling pipeline for all subjects and save the labelled feature table to CSV. The classifier is trained and scored (F1-score) on this table in the prediction section below.

In [None]:
# Build the labelled feature table for all subjects and write it to CSV.
df_all = main()
df_all.to_csv('hrv_eda_acc_60_1.csv')

In [11]:
# Reload the saved feature table, indexed by its 'timedata' column, and keep
# handles on the label and subject-ID columns for later cells.
data = pd.read_csv('hrv_eda_acc_60_1.csv').set_index('timedata')
label = data['label']
ID = data['ID']
print(ID.unique())

['S5' 'S2' 'S3' 'S4' 'S17' 'S10' 'S11' 'S16' 'S8' 'S6' 'S7' 'S9' 'S13'
 'S14' 'S15']


In [61]:
# Keep only the columns whose name starts with 'eda', then re-attach the
# label and subject-ID columns saved when the table was loaded.
data = data.filter(regex='^eda', axis=1).assign(label=label, ID=ID)

In [13]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0_level_0,eda_phasic_mean,eda_phasic_std,eda_phasic_min,eda_phasic_max,eda_phasic_ptp,eda_phasic_sum,eda_phasic_energy,eda_phasic_skewness,eda_phasic_kurtosis,eda_phasic_peaks,...,eda_tonic_fd_kurtosis,eda_tonic_fd_iqr,eda_tonic_mfcc_mean,eda_tonic_mfcc_std,eda_tonic_mfcc_median,eda_tonic_mfcc_skewness,eda_tonic_mfcc_kurtosis,eda_tonic_mfcc_iqr,label,ID
timedata,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-13 12:30:50+00:00,0.060865,0.115519,1e-06,0.63269,0.632689,43.822819,12.275442,2.560102,6.650678,0.0,...,629.5125,84150.945114,0.008696,0.05789,0.004423,-10.540938,287.962553,0.008475,0.0,S5
2017-06-13 12:30:51+00:00,0.061395,0.115458,1e-06,0.63269,0.632689,44.204567,12.311991,2.551351,6.623677,0.0,...,628.959261,84301.59415,0.008693,0.060275,0.004392,-11.56959,307.522451,0.007677,0.0,S5
2017-06-13 12:30:52+00:00,0.061818,0.115373,1e-06,0.63269,0.632689,44.50906,12.335295,2.546886,6.614378,0.0,...,628.515626,87722.011313,0.00869,0.062847,0.004748,-12.589045,327.802971,0.006184,0.0,S5
2017-06-13 12:30:53+00:00,0.06214,0.115281,1e-06,0.63269,0.632689,44.740868,12.348796,2.545448,6.616074,0.0,...,628.174501,92254.037249,0.008688,0.065509,0.004824,-13.562295,347.890653,0.006858,0.0,S5
2017-06-13 12:30:54+00:00,0.062371,0.115198,1e-06,0.63269,0.632689,44.907169,12.355799,2.545746,6.622771,0.0,...,627.932407,98671.770657,0.008685,0.068087,0.004637,-14.359361,365.279193,0.006545,0.0,S5


# Prediction (LDA, leave-one-subject-out cross-validation)


In [12]:
# Infinite values break some estimators, so treat them as missing.
df = data.replace([np.inf, -np.inf], np.nan)

# Drop columns with fewer than 80 % non-missing values, then drop every row
# that still contains a missing value (this includes rows without a label).
print(len(df), len(df.columns))
df = df.dropna(axis=1, thresh=int(len(df)*0.8))
df = df.dropna(axis=0)
print(len(df), len(df.columns))
print('Columns dropped: ', data.drop(df.columns, axis=1).columns.values)

# A bare `df.round(4)` used to sit here; its result was discarded, so it had
# no effect on the data and has been removed.

# Leave-one-subject-out cross-validation: each fold holds out one subject ID.
cv = model_selection.LeaveOneGroupOut()

X = df.drop(columns=['label', 'ID'])
y = df['label'].astype('int')
groups = df['ID']
print("running %d-fold CV..." % (cv.get_n_splits(X, y, groups)))

# Alternatives tried earlier and currently disabled: a LightGBM multiclass
# model (3 classes) and KLIEP density-ratio weighting of the training samples.
fold_stats = []
for train_index, test_index in cv.split(X, y, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf = LDA()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_stats.append({
        'subject': groups.iloc[test_index[0]],  # the held-out subject of this fold
        'f1': f1_score(y_test, y_pred, average="macro"),
        'accuracy': accuracy_score(y_test, y_pred)
    })

    print(metrics.classification_report(y_test, y_pred))

stats = pd.DataFrame(fold_stats)
print(stats.f1.mean())

33917 263
33851 259
Columns dropped:  ['acc_acc_x_entropy' 'acc_acc_y_entropy' 'acc_acc_z_entropy' 'bvp_entropy']
running 15-fold CV...
              precision    recall  f1-score   support

           0       0.70      0.52      0.59      1201
           1       0.92      0.95      0.94       773
           2       0.19      0.33      0.24       393

    accuracy                           0.63      2367
   macro avg       0.61      0.60      0.59      2367
weighted avg       0.69      0.63      0.65      2367

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1201
           1       0.99      0.99      0.99       701
           2       0.63      0.69      0.66       389

    accuracy                           0.88      2291
   macro avg       0.84      0.85      0.85      2291
weighted avg       0.88      0.88      0.88      2291



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.70      0.79      0.74      1201
           1       0.72      1.00      0.84       685
           2       0.00      0.00      0.00       403

    accuracy                           0.71      2289
   macro avg       0.48      0.60      0.53      2289
weighted avg       0.59      0.71      0.64      2289

              precision    recall  f1-score   support

           0       0.58      0.94      0.72      1201
           1       0.81      0.29      0.43       696
           2       0.40      0.12      0.18       393

    accuracy                           0.60      2290
   macro avg       0.60      0.45      0.44      2290
weighted avg       0.62      0.60      0.54      2290

              precision    recall  f1-score   support

           0       0.71      0.27      0.39      1178
           1       1.00      0.47      0.64       707
           2       0.23      0.89      0.37       393

    accuracy        

In [5]:
# Number of non-missing entries per column for each class label.
df.groupby('label').count()

Unnamed: 0_level_0,ID,hrv_hrv_mean_nni,hrv_hrv_median_nni,hrv_hrv_range_nni,hrv_hrv_sdsd,hrv_hrv_rmssd,hrv_hrv_nni_50,hrv_hrv_pnni_50,hrv_hrv_nni_20,hrv_hrv_pnni_20,...,temp_n_above_mean,temp_n_below_mean,temp_n_sign_changes,temp_iqr,temp_iqr_5_95,temp_pct_5,temp_pct_95,temp_entropy,temp_perm_entropy,temp_svd_entropy
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,16080,16080,16080,16080,16080,16080,16080,16080,16080,16080,...,16080,16080,16080,16080,16080,16080,16080,16080,16080,16080
1.0,10254,10254,10254,10254,10254,10254,10254,10254,10254,10254,...,10254,10254,10254,10254,10254,10254,10254,10254,10254,10254
2.0,5800,5800,5800,5800,5800,5800,5800,5800,5800,5800,...,5800,5800,5800,5800,5800,5800,5800,5800,5800,5800
