In [2]:
%load_ext autoreload
%autoreload 2

Libraries

In [7]:
import os
import pandas as pd
import numpy as np
from verbio import readers, preprocessing, temporal, features, settings
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from collections import defaultdict

Constants

In [10]:
# Participant folder names P001 ... P073.
SUBJECT_LABELS = [f'P{p:03d}' for p in range(1, 74)]
# Sessions whose windows are used for fitting vs. evaluating the classifier.
TRAIN_SESSIONS = ['TEST01','TEST02','TEST03','TEST04']
TEST_SESSIONS = ['TEST05','TEST06','TEST07','TEST08']
# Root of the raw data tree (<DATA_DIR>/<participant>/<session>/<file>).
# Set the VERBIO_DATA_DIR environment variable to run on another machine;
# the original hard-coded location is kept as the fallback.
DATA_DIR = os.environ.get('VERBIO_DATA_DIR', '/home/jason/hubbs/project_verbio/data/raw/')
# Per-session workbook names.
EDA_FILENAME = 'E4_EDA_PPT.xlsx'
HR_FILENAME = 'E4_HR_PPT.xlsx'
ANNOTATION_FILENAME = 'MANUAL_ANNOTATION_PPT.xlsx'
# Window length and stride passed to every windowing helper
# (presumably seconds -- TODO confirm against the verbio package).
WIN_LEN = 20
WIN_STRIDE = 5

Helper functions

In [12]:
def get_data(participant, session):
    """Build the feature matrix and label vector for one participant session.

    Reads the EDA, HR and annotation workbooks stored under
    ``DATA_DIR/<participant>/<session>/``, turns each into windowed features
    via ``get_eda_fx``, ``get_hr_fx`` and ``get_annotation_fx``, truncates all
    three to the shortest of them and appends the HR feature to the EDA
    feature columns.

    Args:
        participant: Participant folder name, e.g. ``'P001'``.
        session: Session folder name, e.g. ``'TEST01'``.

    Returns:
        ``None`` when any of the three workbooks is missing. Otherwise a tuple
        ``(x, y)``: ``x`` is a NumPy array holding the EDA feature columns
        followed by an ``HR`` column, and ``y`` holds the annotation labels
        truncated to the same number of rows.
    """
    session_dir = os.path.join(DATA_DIR, participant, session)
    eda_filepath = os.path.join(session_dir, EDA_FILENAME)
    hr_filepath = os.path.join(session_dir, HR_FILENAME)
    annotation_filepath = os.path.join(session_dir, ANNOTATION_FILENAME)

    # A session is only usable when all three recordings are present.
    required_files = (eda_filepath, hr_filepath, annotation_filepath)
    if not all(os.path.exists(path) for path in required_files):
        return None

    eda_df = readers.read_excel(eda_filepath)
    hr_df = readers.read_excel(hr_filepath)
    annotation_df = readers.read_excel(annotation_filepath)

    eda_fx = get_eda_fx(eda_df)
    hr_fx = get_hr_fx(hr_df)
    annotation_fx = get_annotation_fx(annotation_df)

    # The three streams can yield different window counts; keep the common prefix.
    min_len = min(len(annotation_fx), len(eda_fx), len(hr_fx))
    y = annotation_fx[:min_len]

    # assign() returns a new frame, so the HR column is never written into a
    # slice of the EDA feature frame (avoids pandas' SettingWithCopyWarning).
    x_df = eda_fx.iloc[:min_len].assign(HR=hr_fx[:min_len])

    x = x_df.to_numpy()
    return x, y
    
def get_eda_fx(eda_df):
    """Compute windowed EDA features and keep the two SCR columns.

    Args:
        eda_df: Frame with an ``'EDA'`` column and a ``settings.time_key``
            column.

    Returns:
        The ``'SCR_Peaks'`` and ``'SCR_Amplitude'`` columns of whatever
        ``features.eda_features`` returns for the module-level ``WIN_LEN`` /
        ``WIN_STRIDE`` windowing.
    """
    selected_columns = ['SCR_Peaks', 'SCR_Amplitude']
    # Hand the raw signal and its timestamps to the extractor as NumPy arrays.
    all_eda_features = features.eda_features(
        signal=eda_df['EDA'].to_numpy(),
        times=eda_df[settings.time_key].to_numpy(),
        sr=settings.e4_eda_sr,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
    )
    return all_eda_features[selected_columns]

def get_hr_fx(hr_df):
    """Reduce the heart-rate signal to one mean value per window.

    Args:
        hr_df: Frame with an ``'HR'`` column and a ``settings.time_key``
            column.

    Returns:
        NumPy array built from the output of ``preprocessing.window_timed``,
        using the mean as the per-window reduction and the module-level
        ``WIN_LEN`` / ``WIN_STRIDE``.
    """
    def window_mean(x):
        # Per-window reduction: average of the samples in the window.
        return np.mean(x)

    windowed_hr = preprocessing.window_timed(
        x=hr_df['HR'].to_numpy(),
        times=hr_df[settings.time_key].to_numpy(),
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=window_mean,
    )
    return np.array(windowed_hr)

def get_annotation_fx(annotation_df):
    """Turn the rater annotations into one integer label per window.

    The four rater columns (R1, R2, R4, R5) are averaged sample-by-sample, the
    averaged trace is windowed with ``WIN_LEN`` / ``WIN_STRIDE``, and each
    window's mean is passed through ``preprocessing.binarize`` with a
    threshold of 2.5. The label sequence is then shifted by
    ``-(WIN_LEN // WIN_STRIDE)`` positions with ``temporal.shift`` and the last
    ``WIN_LEN // WIN_STRIDE`` entries are dropped.

    Args:
        annotation_df: Frame with ``'R1'``, ``'R2'``, ``'R4'``, ``'R5'`` columns
            and a ``settings.time_key`` column.

    Returns:
        Integer NumPy array of shifted, truncated window labels.
    """
    rater_columns = ('R1', 'R2', 'R4', 'R5')
    # One row per rater, one column per sample.
    ratings = np.vstack([annotation_df[col].to_numpy() for col in rater_columns])
    timestamps = annotation_df[settings.time_key].to_numpy()
    # Average across the four raters.
    mean_rating = np.mean(ratings, axis=0)

    def label_window(x):
        # A window's label is the binarized mean rating inside it.
        return preprocessing.binarize(np.mean(x), threshold=2.5)

    windowed = preprocessing.window_timed(
        x=mean_rating,
        times=timestamps,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=label_window,
    )
    labels = np.array(windowed, dtype='int')

    # The shift is expressed in whole windows, so the stride must (nearly) divide the length.
    assert WIN_LEN % WIN_STRIDE < 0.1
    shift_len = -int(WIN_LEN//WIN_STRIDE)
    # Shift the labels (presumably earlier in time -- confirm against temporal.shift),
    # then cut off the trailing positions the shift left behind.
    shifted = temporal.shift(labels, shift_len)
    return shifted[:shift_len]

Grab raw data from VerBIO dataset for training and testing sessions

In [13]:
def _load_sessions(participant, sessions):
    """Load every session in ``sessions`` for ``participant``.

    Returns the list of ``get_data`` results in session order, or ``None`` as
    soon as one session is unavailable (``get_data`` returned ``None``), so no
    further files are read for a participant that will be discarded anyway.
    """
    loaded = []
    for session in sessions:
        session_data = get_data(participant, session)
        if session_data is None:
            return None
        loaded.append(session_data)
    return loaded


# participant label -> list of (x, y) tuples, one per session
train_dict = {}
test_dict = {}

for p in SUBJECT_LABELS:
    # A participant is kept only when all train AND all test sessions exist.
    participant_train = _load_sessions(p, TRAIN_SESSIONS)
    if participant_train is None:
        continue  # skip the test sessions entirely; the participant is invalid

    participant_test = _load_sessions(p, TEST_SESSIONS)
    if participant_test is None:
        continue

    print(f'Valid participant {p}')
    train_dict[p] = participant_train
    test_dict[p] = participant_test

Valid participant P004
Valid participant P005
Valid participant P008
Valid participant P016
Valid participant P020
Valid participant P021
Valid participant P023
Valid participant P032
Valid participant P035
Valid participant P037
Valid participant P039
Valid participant P041
Valid participant P042
Valid participant P044
Valid participant P047
Valid participant P050
Valid participant P051
Valid participant P053
Valid participant P060
Valid participant P061
Valid participant P062
Valid participant P065
Valid participant P066
Valid participant P071
Valid participant P073


Run experiment loop

In [11]:
# Leave-one-participant-out experiment: for each target participant, fit on the
# training sessions of all OTHER participants and evaluate on the target's
# test sessions.
average_f = 0.0  # running sum of the F1 scores that were counted
valid_p = 0      # number of participants contributing to the average
for target_p in train_dict:
    # Ordered list (not a set) so the training rows are stacked in the same
    # order on every run, independent of string hash randomisation.
    aux_participants = [p for p in train_dict if p != target_p]

    x_train = np.concatenate(
        [session_x for p in aux_participants for session_x, _ in train_dict[p]],
        axis=0,
    ).astype(np.float32)
    y_train = np.concatenate(
        [session_y for p in aux_participants for _, session_y in train_dict[p]],
        axis=0,
    ).astype(int)

    p_data = test_dict[target_p]
    x_test = np.concatenate([z[0] for z in p_data], axis=0).astype(np.float32)
    y_test = np.concatenate([z[1] for z in p_data], axis=0).astype(int)

    # Class balance, for reporting only.
    train_c0 = np.count_nonzero(y_train == 0)
    train_c1 = np.count_nonzero(y_train == 1)
    test_c0 = np.count_nonzero(y_test == 0)
    test_c1 = np.count_nonzero(y_test == 1)

    # Train and test model. (A former warm-start variant that fitted on the
    # undefined x_pretrain / y_pretrain was removed: it raised NameError on a
    # fresh kernel and its model was overwritten right away.)
    clf = GradientBoostingClassifier(random_state=0)  # fixed seed for reproducible runs
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # f1_score expects (y_true, y_pred); compute it once and reuse it.
    score = f1_score(y_test, y_pred)

    # NOTE(review): participants with an F1 of exactly 0 are left out of the
    # average, which biases the reported mean upward -- confirm this is intended.
    if score == 0.0:
        continue

    average_f += score
    valid_p += 1
    print(score)
    print(f'Train: c0: {train_c0} | c1: {train_c1}')
    print(f'Test: c0: {test_c0} | c1: {test_c1}')
    print(x_train.shape, y_train.shape)
    print(x_test.shape, y_test.shape)
    print('')

# Avoid ZeroDivisionError when no participant produced a non-zero score.
print('Average f1: ', average_f/valid_p if valid_p else float('nan'))

0.851063829787234
Train: c0: 2356 | c1: 1595
Test: c0: 0 | c1: 27
(3951, 4) (3951,)
(27, 4) (27,)

0.5
Train: c0: 2327 | c1: 1606
Test: c0: 16 | c1: 8
(3933, 4) (3933,)
(24, 4) (24,)

0.2666666666666667
Train: c0: 2444 | c1: 1515
Test: c0: 3 | c1: 26
(3959, 4) (3959,)
(29, 4) (29,)

0.5
Train: c0: 2338 | c1: 1631
Test: c0: 9 | c1: 4
(3969, 4) (3969,)
(13, 4) (13,)

0.19999999999999998
Train: c0: 2428 | c1: 1471
Test: c0: 3 | c1: 36
(3899, 4) (3899,)
(39, 4) (39,)



  average, "true nor predicted", 'F-score is', len(true_sum)


0.2564102564102564
Train: c0: 2297 | c1: 1624
Test: c0: 30 | c1: 12
(3921, 4) (3921,)
(42, 4) (42,)

0.5
Train: c0: 2331 | c1: 1571
Test: c0: 13 | c1: 6
(3902, 4) (3902,)
(19, 4) (19,)

0.2222222222222222
Train: c0: 2351 | c1: 1617
Test: c0: 3 | c1: 16
(3968, 4) (3968,)
(19, 4) (19,)

0.04081632653061225
Train: c0: 2406 | c1: 1491
Test: c0: 0 | c1: 48
(3897, 4) (3897,)
(48, 4) (48,)

0.36363636363636365
Train: c0: 2303 | c1: 1633
Test: c0: 23 | c1: 8
(3936, 4) (3936,)
(31, 4) (31,)

0.8260869565217392
Train: c0: 2397 | c1: 1490
Test: c0: 3 | c1: 52
(3887, 4) (3887,)
(55, 4) (55,)

0.0625
Train: c0: 2388 | c1: 1543
Test: c0: 3 | c1: 31
(3931, 4) (3931,)
(34, 4) (34,)

0.32
Train: c0: 2330 | c1: 1563
Test: c0: 3 | c1: 42
(3893, 4) (3893,)
(45, 4) (45,)

0.35294117647058826
Train: c0: 2437 | c1: 1499
Test: c0: 0 | c1: 28
(3936, 4) (3936,)
(28, 4) (28,)

0.3636363636363636
Train: c0: 2283 | c1: 1597
Test: c0: 18 | c1: 4
(3880, 4) (3880,)
(22, 4) (22,)

0.36363636363636365
Train: c0: 2385 |