In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Libraries

In [44]:
import os
import pandas as pd
import numpy as np
from verbio import readers, preprocessing, temporal, features, settings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from collections import defaultdict

Constants

In [38]:
SUBJECT_LABELS = [f'P{p:03d}' for p in range(1, 74, 1)] # Participants 001-073
TRAIN_SESSIONS = ['TEST01','TEST02','TEST03','TEST04']
TEST_SESSIONS = ['TEST05','TEST06','TEST07','TEST08']
DATA_DIR = '/home/jason/hubbs/project_verbio/data/raw/'
EDA_FILENAME = 'E4_EDA_PPT.xlsx'
HR_FILENAME = 'E4_HR_PPT.xlsx'
ANNOTATION_FILENAME = 'MANUAL_ANNOTATION_PPT.xlsx'
WIN_LEN = 20
WIN_STRIDE = 5

Helper functions

In [39]:
def get_data(participant, session):
    eda_filepath = os.path.join(DATA_DIR, participant, session, EDA_FILENAME)
    hr_filepath = os.path.join(DATA_DIR, participant, session, HR_FILENAME)
    annotation_filepath = os.path.join(DATA_DIR, participant, session, ANNOTATION_FILENAME)
    
    if any(not os.path.exists(x) for x in (eda_filepath, hr_filepath, annotation_filepath)): return None
    
    eda_df = readers.read_excel(eda_filepath)
    hr_df = readers.read_excel(hr_filepath)
    annotation_df = readers.read_excel(annotation_filepath)
    
    eda_fx = get_eda_fx(eda_df)
    hr_fx = get_hr_fx(hr_df)
    hr_grad_fx = get_hr_grad_fx(hr_df)
    annotation_fx = get_annotation_fx(annotation_df)

    min_len = min(len(annotation_fx), len(eda_fx), len(hr_fx), len(hr_grad_fx))
    y = annotation_fx[:min_len]
    eda_fx = eda_fx.iloc[:min_len]
    hr_fx = hr_fx[:min_len]
    hr_grad_fx = hr_grad_fx[:min_len]
    
    x_df = eda_fx
    x_df['HR'] = hr_fx
    x_df['HR_grad'] = hr_grad_fx
    
    x = x_df.to_numpy()
    return x, y
    
def get_eda_fx(eda_df):
    # Convert EDA signals to numpy
    eda_signal = eda_df['EDA'].to_numpy()
    eda_times = eda_df[settings.time_key].to_numpy()
    # Get EDA features
    eda_fx = features.eda_features(
        signal=eda_signal, 
        times=eda_times, 
        sr=settings.e4_eda_sr, 
        win_len=WIN_LEN, 
        win_stride=WIN_STRIDE
    )[['SCR_Peaks', 'SCR_Amplitude', 'SCL']]
    return eda_fx

def get_hr_fx(hr_df):
    # Convert HR signals to numpy
    hr_signal = hr_df['HR'].to_numpy()
    hr_times = hr_df[settings.time_key].to_numpy()
    # Window HR
    hr_fx = preprocessing.window_timed(
        x=hr_signal,
        times=hr_times,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=lambda x: np.mean(x)
    )
    return np.array(hr_fx)

def get_hr_grad_fx(hr_df):
    # Convert HR signals to numpy
    hr_signal = np.gradient(hr_df['HR'].to_numpy())
    hr_times = hr_df[settings.time_key].to_numpy()
    # Window HR
    hr_fx = preprocessing.window_timed(
        x=hr_signal,
        times=hr_times,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=lambda x: np.mean(x)
    )
    return np.array(hr_fx)

def get_annotation_fx(annotation_df):
    # Convert annotation signals to numpy
    annotation_r1 = annotation_df['R1'].to_numpy()
    annotation_r2 = annotation_df['R2'].to_numpy()
    annotation_r4 = annotation_df['R4'].to_numpy()
    annotation_r5 = annotation_df['R5'].to_numpy()
    annotation_times = annotation_df[settings.time_key].to_numpy()
    # Combine both annotators
    annotation_mixed = np.vstack([annotation_r1, annotation_r2, annotation_r4, annotation_r5])
    annotation_mean = np.mean(annotation_mixed, axis=0)
    # Window annotations
    annotation_fx = preprocessing.window_timed(
        x=annotation_mean,
        times=annotation_times,
        win_len=WIN_LEN,
        win_stride=WIN_STRIDE,
        win_fn=lambda x: preprocessing.binarize(np.mean(x), threshold=2.5)
    )
    annotation_fx = np.array(annotation_fx, dtype='int')    
    # Shift annotations back in time
    assert WIN_LEN % WIN_STRIDE < 0.1 # Assert that they're at least somewhat divisible
    shift_len = -int(WIN_LEN//WIN_STRIDE)
    return temporal.shift(annotation_fx, shift_len)[:shift_len] # Shift back in time and truncate

Grab raw data from VerBIO dataset for training and testing sessions

In [40]:
train_dict = {}
test_dict = {}

for p in SUBJECT_LABELS:
    valid = True
    participant_train = []
    participant_test = []

    for s in TRAIN_SESSIONS:
        session_data = get_data(p, s)
        if session_data is None:
            valid = False
            break
        else:
            participant_train.append(session_data)
    
    for s in TEST_SESSIONS:
        session_data = get_data(p, s)
        if session_data is None:
            valid = False
            break
        else:
            participant_test.append(session_data)
            
    if valid:
        print(f'Valid participant {p}')
        train_dict[p] = participant_train
        test_dict[p] = participant_test

Valid participant P004
Valid participant P005
Valid participant P008
Valid participant P016
Valid participant P020
Valid participant P021
Valid participant P023
Valid participant P032
Valid participant P035
Valid participant P037
Valid participant P039
Valid participant P041
Valid participant P042
Valid participant P044
Valid participant P047
Valid participant P050
Valid participant P051
Valid participant P053
Valid participant P060
Valid participant P061
Valid participant P062
Valid participant P065
Valid participant P071
Valid participant P073


Run experiment loop

In [54]:
noseed_f1 = 0.0
seed_f1 = 0.0
onlyseed_f1 = 0.0
n_valid = 0
for target_p in train_dict.keys():
    
    aux_participants = set(train_dict.keys())
    aux_participants.remove(target_p)
    
    x_pretrain = []
    y_pretrain = []
    
    for p in aux_participants:
        p_data = train_dict[p]
        p_x = [z[0] for z in p_data]
        p_y = [z[1] for z in p_data]
        x_pretrain.append(np.concatenate(p_x, axis=0))
        y_pretrain.append(np.concatenate(p_y, axis=0))
    x_pretrain = np.concatenate(x_pretrain).astype(np.float32)
    y_pretrain = np.concatenate(y_pretrain).astype(int)
    
    p_data = train_dict[target_p]
    x_train = np.concatenate([z[0] for z in p_data], axis=0).astype(np.float32)
    y_train = np.concatenate([z[1] for z in p_data], axis=0).astype(int)
    
    p_data = test_dict[target_p]
    x_dev = np.concatenate([z[0] for z in p_data[:3]], axis=0).astype(np.float32)
    y_dev = np.concatenate([z[1] for z in p_data[:3]], axis=0).astype(int)
    x_test = np.concatenate([z[0] for z in p_data[3:]], axis=0).astype(np.float32)
    y_test = np.concatenate([z[1] for z in p_data[3:]], axis=0).astype(int)
    
    pretrain_c0 = sum(y_pretrain == 0)
    pretrain_c1 = sum(y_pretrain == 1)
    train_c0 = sum(y_train == 0)
    train_c1 = sum(y_train == 1)
    dev_c0 = sum(y_dev == 0)
    dev_c1 = sum(y_dev == 1)
    test_c0 = sum(y_test == 0)
    test_c1 = sum(y_test == 1)
        
    # Train and test model
    try:
        print(f'\n=============On participant {target_p}=============')
        
        clf_2 = GradientBoostingClassifier(n_estimators=200, warm_start=False, max_depth=4)
        clf_2.fit(x_pretrain, y_pretrain)
        y_pred_2 = clf_2.predict(x_dev)
        
        tp = sum((y_pred_2 == 1) & (y_dev == 1))
        tn = sum((y_pred_2 == 0) & (y_dev == 0))
        fp = sum((y_pred_2 == 1) & (y_dev == 0))
        fn = sum((y_pred_2 == 0) & (y_dev == 1))
        print(f'Noseed   -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')
        
        clf = GradientBoostingClassifier(n_estimators=180, warm_start=True, max_depth=4)
        clf.fit(x_pretrain, y_pretrain)
        clf.set_params(n_estimators=200)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_dev)
        
        tp = sum((y_pred == 1) & (y_dev == 1))
        tn = sum((y_pred == 0) & (y_dev == 0))
        fp = sum((y_pred == 1) & (y_dev == 0))
        fn = sum((y_pred == 0) & (y_dev == 1))
        print(f'Seeded   -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')
        

        clf_3 = GradientBoostingClassifier(n_estimators=200, warm_start=False, max_depth=4)
        clf_3.fit(x_train, y_train)
        y_pred_3 = clf_3.predict(x_dev)
        
        tp = sum((y_pred_3 == 1) & (y_dev == 1))
        tn = sum((y_pred_3 == 0) & (y_dev == 0))
        fp = sum((y_pred_3 == 1) & (y_dev == 0))
        fn = sum((y_pred_3 == 0) & (y_dev == 1))
        
        onlyseed_f1 += f1_score(y_pred_3, y_dev)
        noseed_f1 += f1_score(y_pred_2, y_dev)
        seed_f1 += f1_score(y_pred, y_dev)
        n_valid += 1
        
        print(f'Onlyseed -> tp: {tp} | tn: {tn} | fp: {fp} | fn: {fn}')
        print('')
        print(f'Noseed   -> '
              f'F1: {f1_score(y_pred_2, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred_2, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred_2, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred_2, y_dev):.2f}')

        print(f'Seeded   -> '
              f'F1: {f1_score(y_pred, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred, y_dev):.2f}')
        
        print(f'Onlyseed -> '
              f'F1: {f1_score(y_pred_3, y_dev):.2f} | '
              f'Acc: {accuracy_score(y_pred_3, y_dev):.2f} | '
              f'Rec: {recall_score(y_pred_3, y_dev):.2f} | '
              f'Prec: {precision_score(y_pred_3, y_dev):.2f}')
        
        print('')
        print(f'Pretrain -> c0: {pretrain_c0} | c1: {pretrain_c1}')
        print(f'Train    -> c0: {train_c0} | c1: {train_c1}')
        print(f'Dev      -> c0: {dev_c0} | c1: {dev_c1}')
        print(f'Test     -> c0: {test_c0} | c1: {test_c1}')
        print('=============================================\n')
    except Exception as e:
        print(e)
        continue

        
print(noseed_f1/n_valid)
print(seed_f1/n_valid)
print(onlyseed_f1/n_valid)


Noseed   -> tp: 26 | tn: 63 | fp: 13 | fn: 70
Seeded   -> tp: 55 | tn: 74 | fp: 2 | fn: 41
Onlyseed -> tp: 61 | tn: 40 | fp: 36 | fn: 35

Noseed   -> F1: 0.39 | Acc: 0.52 | Rec: 0.67 | Prec: 0.27
Seeded   -> F1: 0.72 | Acc: 0.75 | Rec: 0.96 | Prec: 0.57
Onlyseed -> F1: 0.63 | Acc: 0.59 | Rec: 0.63 | Prec: 0.64

Pretrain -> c0: 2174 | c1: 1461
Train    -> c0: 109 | c1: 136
Dev      -> c0: 76 | c1: 96
Test     -> c0: 3 | c1: 53


Noseed   -> tp: 50 | tn: 12 | fp: 10 | fn: 30
Seeded   -> tp: 39 | tn: 18 | fp: 4 | fn: 41
Onlyseed -> tp: 36 | tn: 18 | fp: 4 | fn: 44

Noseed   -> F1: 0.71 | Acc: 0.61 | Rec: 0.83 | Prec: 0.62
Seeded   -> F1: 0.63 | Acc: 0.56 | Rec: 0.91 | Prec: 0.49
Onlyseed -> F1: 0.60 | Acc: 0.53 | Rec: 0.90 | Prec: 0.45

Pretrain -> c0: 2194 | c1: 1559
Train    -> c0: 89 | c1: 38
Dev      -> c0: 22 | c1: 80
Test     -> c0: 0 | c1: 30


Noseed   -> tp: 36 | tn: 2 | fp: 35 | fn: 3
Seeded   -> tp: 19 | tn: 20 | fp: 17 | fn: 20
Onlyseed -> tp: 19 | tn: 24 | fp: 13 | fn: 20

N

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Noseed   -> tp: 0 | tn: 111 | fp: 13 | fn: 20
Seeded   -> tp: 15 | tn: 18 | fp: 106 | fn: 5
Onlyseed -> tp: 20 | tn: 0 | fp: 124 | fn: 0

Noseed   -> F1: 0.00 | Acc: 0.77 | Rec: 0.00 | Prec: 0.00
Seeded   -> F1: 0.21 | Acc: 0.23 | Rec: 0.12 | Prec: 0.75
Onlyseed -> F1: 0.24 | Acc: 0.14 | Rec: 0.14 | Prec: 1.00

Pretrain -> c0: 2198 | c1: 1488
Train    -> c0: 85 | c1: 109
Dev      -> c0: 124 | c1: 20
Test     -> c0: 56 | c1: 9


Noseed   -> tp: 12 | tn: 1 | fp: 2 | fn: 132
Seeded   -> tp: 72 | tn: 3 | fp: 0 | fn: 72
Onlyseed -> tp: 137 | tn: 2 | fp: 1 | fn: 7

Noseed   -> F1: 0.15 | Acc: 0.09 | Rec: 0.86 | Prec: 0.08
Seeded   -> F1: 0.67 | Acc: 0.51 | Rec: 1.00 | Prec: 0.50
Onlyseed -> F1: 0.97 | Acc: 0.95 | Rec: 0.99 | Prec: 0.95

Pretrain -> c0: 2266 | c1: 1435
Train    -> c0: 17 | c1: 162
Dev      -> c0: 3 | c1: 144
Test     -> c0: 0 | c1: 63


Noseed   -> tp: 0 | tn: 122 | fp: 45 | fn: 0
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 

  _warn_prf(average, modifier, msg_start, len(result))


Noseed   -> tp: 57 | tn: 0 | fp: 0 | fn: 94
Seeded   -> tp: 151 | tn: 0 | fp: 0 | fn: 0
Onlyseed -> tp: 150 | tn: 0 | fp: 0 | fn: 1

Noseed   -> F1: 0.55 | Acc: 0.38 | Rec: 1.00 | Prec: 0.38
Seeded   -> F1: 1.00 | Acc: 1.00 | Rec: 1.00 | Prec: 1.00
Onlyseed -> F1: 1.00 | Acc: 0.99 | Rec: 1.00 | Prec: 0.99

Pretrain -> c0: 2244 | c1: 1455
Train    -> c0: 39 | c1: 142
Dev      -> c0: 0 | c1: 151
Test     -> c0: 0 | c1: 39


Noseed   -> tp: 9 | tn: 50 | fp: 42 | fn: 3
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.

Noseed   -> tp: 59 | tn: 3 | fp: 4 | fn: 77
Seeded   -> tp: 114 | tn: 2 | fp: 5 | fn: 22
Onlyseed -> tp: 132 | tn: 0 | fp: 7 | fn: 4

Noseed   -> F1: 0.59 | Acc: 0.43 | Rec: 0.94 | Prec: 0.43
Seeded   -> F1: 0.89 | Acc: 0.81 | Rec: 0.96 | Prec: 0.84
Onlyseed -> F1: 0.96 | Acc: 0.92 | Rec: 0.95 | Prec: 0.97

Pretrain -> c0: 2235 | c1: 1454
Train    -> c0: 48 | c1: 143
Dev      -> c0: 7 | c1: 136
Test     -> c0

  _warn_prf(average, modifier, msg_start, len(result))
