In [1]:
%load_ext autoreload
%autoreload 2

Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import verbio as vb
from verbio import settings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from collections import defaultdict

Constants

In [3]:
SUBJECT_LABELS = [f'P{p:03d}' for p in range(1, 74, 1)] # Participants 001-073
TRAIN_SESSIONS = ['TEST01','TEST02','TEST03','TEST04']
TEST_SESSIONS = ['TEST05','TEST06','TEST07','TEST08']
DATA_DIR = '/home/jason/workspace/hubbs/project_verbio/data/physio/'
EDA_FILENAME = 'E4_EDA_PPT.xlsx'
BVP_FILENAME = 'E4_BVP_PPT.xlsx'
ANNOTATION_FILENAME = 'MANUAL_ANNOTATION_PPT.xlsx'
WIN_LEN = 30
WIN_STRIDE = 15
ANNOTATION_THRESHOLD = 2.5
EDA_FILTER_SIZE = 8

Helper functions

In [4]:
def get_data(participant, session):
    eda_filepath = os.path.join(DATA_DIR, participant, session, EDA_FILENAME)
    bvp_filepath = os.path.join(DATA_DIR, participant, session, BVP_FILENAME)
    annotation_filepath = os.path.join(DATA_DIR, participant, session, ANNOTATION_FILENAME)
    
    if any(not os.path.exists(x) for x in (eda_filepath, bvp_filepath, annotation_filepath)): return None
    
    vbr = vb.readers.DataReader()
    
    eda_df = vbr.read_excel(eda_filepath)
    bvp_df = vbr.read_excel(bvp_filepath)
    annotation_df = vbr.read_excel(annotation_filepath)
    
    eda_fx = get_eda_fx(eda_df)
    bvp_fx = get_bvp_fx(bvp_df)
    annotation_fx = get_annotation_fx(annotation_df)

    min_len = min(len(annotation_fx), len(eda_fx), len(bvp_fx))
    annotation_fx = annotation_fx[:min_len]
    eda_fx = eda_fx.iloc[:min_len]
    bvp_fx = bvp_fx[:min_len]
    
    x_df = pd.concat([eda_fx, bvp_fx], axis=1)
    x = x_df.to_numpy()
    y = annotation_fx
    
    return x, y
    
def get_eda_fx(eda_df):
    # Convert EDA signals to numpy
    eda_signal = eda_df['EDA'].to_numpy()
    eda_times = eda_df[vb.settings.time_key].to_numpy()
    
    # Get EDA features
    eda_fx = vb.features.eda_features(
        signal      = eda_signal, 
        times       = eda_times, 
        sr          = vb.settings.e4_eda_sr, 
        win_len     = WIN_LEN, 
        win_stride  = WIN_STRIDE,
        filter_size = EDA_FILTER_SIZE
    )[['SCR_Peaks', 'SCR_Amplitude', 'SCL']]
    
    return eda_fx

def get_bvp_fx(bvp_df):
    # Convert BVP signals to numpy
    bvp_signal = bvp_df['BVP'].to_numpy()
    bvp_times = bvp_df[vb.settings.time_key].to_numpy()
    
    # Get BVP features
    bvp_fx = vb.features.bvp_features(
        signal     = bvp_signal,
        times      = bvp_times,
        sr         = vb.settings.e4_bvp_sr,
        win_len    = WIN_LEN,
        win_stride = WIN_STRIDE
    )[['HR', 'HR_Grad']]
    
    return bvp_fx

def get_annotation_fx(annotation_df):
    # Convert annotation signals to numpy
    annotation_r1 = annotation_df['R1'].to_numpy()
    annotation_r2 = annotation_df['R2'].to_numpy()
    annotation_r4 = annotation_df['R4'].to_numpy()
    annotation_r5 = annotation_df['R5'].to_numpy()
    annotation_times = annotation_df[vb.settings.time_key].to_numpy()
    
    # Combine both annotators
    annotation_mixed = np.vstack([annotation_r1, annotation_r2, annotation_r4, annotation_r5])
    annotation_mean = np.mean(annotation_mixed, axis=0)
   
    # Window annotations
    annotation_fx = vb.preprocessing.window_timed(
        x=annotation_mean,
        times=annotation_times,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=lambda x: vb.preprocessing.binarize(np.mean(x), threshold=ANNOTATION_THRESHOLD)
    )
    annotation_fx = np.array(annotation_fx, dtype='int') 
    
    # Shift annotations back in time
    assert WIN_LEN % WIN_STRIDE < 0.1 # Assert that they're at least somewhat divisible
    shift_len = -int(WIN_LEN//WIN_STRIDE)
    
    return vb.temporal.shift(annotation_fx, shift_len)[:shift_len] # Shift back in time and truncate

Grab raw data from VerBIO dataset for training and testing sessions

In [5]:
train_dict = {}
test_dict = {}

for p in SUBJECT_LABELS:
    valid = True
    participant_train = []
    participant_test = []

    for s in TRAIN_SESSIONS:
        session_data = get_data(p, s)
        if session_data is None:
            valid = False
            break
        else:
            participant_train.append(session_data)
    
    for s in TEST_SESSIONS:
        session_data = get_data(p, s)
        if session_data is None:
            valid = False
            break
        else:
            participant_test.append(session_data)
            
    if valid:
        print(f'Valid participant {p}')
        train_dict[p] = participant_train
        test_dict[p] = participant_test

Valid participant P004
Valid participant P005
Valid participant P008
Valid participant P016
Valid participant P020
Valid participant P021
Valid participant P023
Valid participant P032
Valid participant P035
Valid participant P037
Valid participant P039
Valid participant P041
Valid participant P042
Valid participant P044
Valid participant P047
Valid participant P050
Valid participant P051
Valid participant P053
Valid participant P060
Valid participant P061
Valid participant P062
Valid participant P065
Valid participant P071
Valid participant P073


Run experiment loop

In [6]:
noseed_f1 = 0.0
seed_f1 = 0.0
onlyseed_f1 = 0.0
n_valid = 0
seed_ratio = 0.0
onlyseed_ratio = 0.0

for target_p in train_dict.keys():
    
    aux_participants = set(train_dict.keys())
    aux_participants.remove(target_p)
    
    x_pretrain = []
    y_pretrain = []
    
    for p in aux_participants:
        p_data = train_dict[p]
        p_x = [z[0] for z in p_data]
        p_y = [z[1] for z in p_data]
        x_pretrain.append(np.concatenate(p_x, axis=0))
        y_pretrain.append(np.concatenate(p_y, axis=0))
    x_pretrain = np.concatenate(x_pretrain).astype(np.float32)
    y_pretrain = np.concatenate(y_pretrain).astype(int)
    
    p_data = train_dict[target_p]
    x_train = np.concatenate([z[0] for z in p_data], axis=0).astype(np.float32)
    y_train = np.concatenate([z[1] for z in p_data], axis=0).astype(int)
    
    p_data = test_dict[target_p]
    x_dev = np.concatenate([z[0] for z in p_data[:3]], axis=0).astype(np.float32)
    y_dev = np.concatenate([z[1] for z in p_data[:3]], axis=0).astype(int)
    x_test = np.concatenate([z[0] for z in p_data[3:]], axis=0).astype(np.float32)
    y_test = np.concatenate([z[1] for z in p_data[3:]], axis=0).astype(int)
    
    pretrain_c0 = sum(y_pretrain == 0)
    pretrain_c1 = sum(y_pretrain == 1)
    train_c0 = sum(y_train == 0)
    train_c1 = sum(y_train == 1)
    dev_c0 = sum(y_dev == 0)
    dev_c1 = sum(y_dev == 1)
    test_c0 = sum(y_test == 0)
    test_c1 = sum(y_test == 1)
    
    MAX_DEPTH = 5
    N_ESTIMATORS = 400
    N_ESTIMATORS_PRE = 200
    
    try:
        # Train and test model
        print(f'\n=============On participant {target_p}=============')

        clf_2 = GradientBoostingClassifier(n_estimators=N_ESTIMATORS, warm_start=False, max_depth=MAX_DEPTH)
        clf_2.fit(x_pretrain, y_pretrain)
        y_pred_2 = clf_2.predict(x_dev)

        tp = sum((y_pred_2 == 1) & (y_dev == 1))
        tn = sum((y_pred_2 == 0) & (y_dev == 0))
        fp = sum((y_pred_2 == 1) & (y_dev == 0))
        fn = sum((y_pred_2 == 0) & (y_dev == 1))
        print(f'Noseed   -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')

        clf = GradientBoostingClassifier(n_estimators=N_ESTIMATORS_PRE, warm_start=True, max_depth=MAX_DEPTH)
        clf.fit(x_pretrain, y_pretrain)
        clf.set_params(n_estimators=N_ESTIMATORS)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_dev)

        tp = sum((y_pred == 1) & (y_dev == 1))
        tn = sum((y_pred == 0) & (y_dev == 0))
        fp = sum((y_pred == 1) & (y_dev == 0))
        fn = sum((y_pred == 0) & (y_dev == 1))
        print(f'Seeded   -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')


        clf_3 = GradientBoostingClassifier(n_estimators=N_ESTIMATORS, warm_start=False, max_depth=MAX_DEPTH)
        clf_3.fit(x_train, y_train)
        y_pred_3 = clf_3.predict(x_dev)

        tp = sum((y_pred_3 == 1) & (y_dev == 1))
        tn = sum((y_pred_3 == 0) & (y_dev == 0))
        fp = sum((y_pred_3 == 1) & (y_dev == 0))
        fn = sum((y_pred_3 == 0) & (y_dev == 1))

        onlyseed_f1 += f1_score(y_pred_3, y_dev)
        noseed_f1 += f1_score(y_pred_2, y_dev)
        seed_f1 += f1_score(y_pred, y_dev)
        seed_ratio += (sum(y_pred==1))/(y_dev.shape[0])
        onlyseed_ratio += (sum(y_pred_3==1))/(y_dev.shape[0])
        n_valid += 1

        print(f'Onlyseed -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')
        print('')
        print(f'Noseed   -> '
              f'F1: {f1_score(y_pred_2, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred_2, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred_2, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred_2, y_dev):.2f}')

        print(f'Seeded   -> '
              f'F1: {f1_score(y_pred, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred, y_dev):.2f}')

        print(f'Onlyseed -> '
              f'F1: {f1_score(y_pred_3, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred_3, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred_3, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred_3, y_dev):.2f}')

        print('')
        print(f'Pretrain -> c0: {pretrain_c0} | c1: {pretrain_c1}')
        print(f'Train    -> c0: {train_c0} | c1: {train_c1}')
        print(f'Dev      -> c0: {dev_c0} | c1: {dev_c1}')
        print(f'Test     -> c0: {test_c0} | c1: {test_c1}')
        print('=============================================\n')
    except:
        continue

        
print('noseed:',noseed_f1/n_valid)
print('seed:',seed_f1/n_valid)
print('onlyseed:',onlyseed_f1/n_valid)
print('estim:',N_ESTIMATORS)
print('pretrain_estim:',N_ESTIMATORS_PRE)
print('depth:',MAX_DEPTH)


Noseed   -> tp: 9 | tn: 15 | fp: 8 | fn: 22
Seeded   -> tp: 3 | tn: 20 | fp: 3 | fn: 28
Onlyseed -> tp: 5 | tn: 20 | fp: 3 | fn: 26

Noseed   -> F1: 0.38 | Acc: 0.44 | Rec: 0.53 | Prec: 0.29
Seeded   -> F1: 0.16 | Acc: 0.43 | Rec: 0.50 | Prec: 0.10
Onlyseed -> F1: 0.26 | Acc: 0.46 | Rec: 0.62 | Prec: 0.16

Pretrain -> c0: 676 | c1: 440
Train    -> c0: 35 | c1: 43
Dev      -> c0: 23 | c1: 31
Test     -> c0: 0 | c1: 18


Noseed   -> tp: 18 | tn: 4 | fp: 4 | fn: 5
Seeded   -> tp: 11 | tn: 8 | fp: 0 | fn: 12
Onlyseed -> tp: 4 | tn: 4 | fp: 4 | fn: 19

Noseed   -> F1: 0.80 | Acc: 0.71 | Rec: 0.82 | Prec: 0.78
Seeded   -> F1: 0.65 | Acc: 0.61 | Rec: 1.00 | Prec: 0.48
Onlyseed -> F1: 0.26 | Acc: 0.26 | Rec: 0.50 | Prec: 0.17

Pretrain -> c0: 682 | c1: 475
Train    -> c0: 29 | c1: 8
Dev      -> c0: 8 | c1: 23
Test     -> c0: 0 | c1: 9


Noseed   -> tp: 9 | tn: 6 | fp: 5 | fn: 2
Seeded   -> tp: 8 | tn: 4 | fp: 7 | fn: 3
Onlyseed -> tp: 8 | tn: 4 | fp: 7 | fn: 3

Noseed   -> F1: 0.72 | Acc: 0.6

  _warn_prf(average, modifier, msg_start, len(result))


Noseed   -> tp: 5 | tn: 0 | fp: 0 | fn: 15

Noseed   -> tp: 0 | tn: 8 | fp: 8 | fn: 2

Noseed   -> tp: 1 | tn: 35 | fp: 5 | fn: 4
Seeded   -> tp: 4 | tn: 9 | fp: 31 | fn: 1
Onlyseed -> tp: 4 | tn: 9 | fp: 31 | fn: 1

Noseed   -> F1: 0.18 | Acc: 0.80 | Rec: 0.17 | Prec: 0.20
Seeded   -> F1: 0.20 | Acc: 0.29 | Rec: 0.11 | Prec: 0.80
Onlyseed -> F1: 0.20 | Acc: 0.29 | Rec: 0.11 | Prec: 0.80

Pretrain -> c0: 683 | c1: 451
Train    -> c0: 28 | c1: 32
Dev      -> c0: 40 | c1: 5
Test     -> c0: 21 | c1: 0


Noseed   -> tp: 6 | tn: 0 | fp: 0 | fn: 40
Seeded   -> tp: 29 | tn: 0 | fp: 0 | fn: 17
Onlyseed -> tp: 33 | tn: 0 | fp: 0 | fn: 13

Noseed   -> F1: 0.23 | Acc: 0.13 | Rec: 1.00 | Prec: 0.13
Seeded   -> F1: 0.77 | Acc: 0.63 | Rec: 1.00 | Prec: 0.63
Onlyseed -> F1: 0.84 | Acc: 0.72 | Rec: 1.00 | Prec: 0.72

Pretrain -> c0: 708 | c1: 430
Train    -> c0: 3 | c1: 53
Dev      -> c0: 0 | c1: 46
Test     -> c0: 0 | c1: 20


Noseed   -> tp: 0 | tn: 49 | fp: 4 | fn: 0

Noseed   -> tp: 0 | tn: 27 | f

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Noseed   -> tp: 7 | tn: 1 | fp: 3 | fn: 11
Seeded   -> tp: 18 | tn: 0 | fp: 4 | fn: 0
Onlyseed -> tp: 13 | tn: 3 | fp: 1 | fn: 5

Noseed   -> F1: 0.50 | Acc: 0.36 | Rec: 0.70 | Prec: 0.39
Seeded   -> F1: 0.90 | Acc: 0.82 | Rec: 0.82 | Prec: 1.00
Onlyseed -> F1: 0.81 | Acc: 0.73 | Rec: 0.93 | Prec: 0.72

Pretrain -> c0: 672 | c1: 468
Train    -> c0: 39 | c1: 15
Dev      -> c0: 4 | c1: 18
Test     -> c0: 0 | c1: 11


Noseed   -> tp: 6 | tn: 9 | fp: 21 | fn: 0
Seeded   -> tp: 3 | tn: 20 | fp: 10 | fn: 3
Onlyseed -> tp: 3 | tn: 20 | fp: 10 | fn: 3

Noseed   -> F1: 0.36 | Acc: 0.42 | Rec: 0.22 | Prec: 1.00
Seeded   -> F1: 0.32 | Acc: 0.64 | Rec: 0.23 | Prec: 0.50
Onlyseed -> F1: 0.32 | Acc: 0.64 | Rec: 0.23 | Prec: 0.50

Pretrain -> c0: 662 | c1: 480
Train    -> c0: 49 | c1: 3
Dev      -> c0: 30 | c1: 6
Test     -> c0: 13 | c1: 3


Noseed   -> tp: 1 | tn: 14 | fp: 0 | fn: 9
Seeded   -> tp: 0 | tn: 11 | fp: 3 | fn: 10
Onlyseed -> tp: 0 | tn: 14 | fp: 0 | fn: 10

Noseed   -> F1: 0.18 | Acc: 0

  _warn_prf(average, modifier, msg_start, len(result))


Noseed   -> tp: 13 | tn: 0 | fp: 0 | fn: 35
Seeded   -> tp: 43 | tn: 0 | fp: 0 | fn: 5
Onlyseed -> tp: 43 | tn: 0 | fp: 0 | fn: 5

Noseed   -> F1: 0.43 | Acc: 0.27 | Rec: 1.00 | Prec: 0.27
Seeded   -> F1: 0.95 | Acc: 0.90 | Rec: 1.00 | Prec: 0.90
Onlyseed -> F1: 0.95 | Acc: 0.90 | Rec: 1.00 | Prec: 0.90

Pretrain -> c0: 700 | c1: 438
Train    -> c0: 11 | c1: 45
Dev      -> c0: 0 | c1: 48
Test     -> c0: 0 | c1: 12


Noseed   -> tp: 1 | tn: 18 | fp: 10 | fn: 2

Noseed   -> tp: 22 | tn: 0 | fp: 2 | fn: 21
Seeded   -> tp: 41 | tn: 0 | fp: 2 | fn: 2
Onlyseed -> tp: 42 | tn: 0 | fp: 2 | fn: 1

Noseed   -> F1: 0.66 | Acc: 0.49 | Rec: 0.92 | Prec: 0.51
Seeded   -> F1: 0.95 | Acc: 0.91 | Rec: 0.95 | Prec: 0.95
Onlyseed -> F1: 0.97 | Acc: 0.93 | Rec: 0.95 | Prec: 0.98

Pretrain -> c0: 697 | c1: 437
Train    -> c0: 14 | c1: 46
Dev      -> c0: 2 | c1: 43
Test     -> c0: 0 | c1: 10


Noseed   -> tp: 7 | tn: 1 | fp: 0 | fn: 20
Seeded   -> tp: 24 | tn: 0 | fp: 1 | fn: 3
Onlyseed -> tp: 23 | tn: 0 | 

In [7]:
print(seed_ratio/n_valid)
print(onlyseed_ratio/n_valid)

0.5254647626011955
0.49021522986217747
