In [55]:
# IMPORTS
# import sys
# sys.path.insert(0, '../Analysis')
import helpers as h
import empatica_helpers as eh
import inquisit_helpers as ih
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from importlib import reload
import pickle

# ML IMPORTS
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

reload(h), reload(eh), reload(ih)

# GLOBAL SETTINGS
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:.2f}'.format
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('seaborn-v0_8-notebook') # plt.style.use('ggplot'); print(plt.style.available)
pd.set_option('display.max_columns', None)

sr = 32
wl = 24 # Window length in seconds

# FULL PIPELINE
# e_raw, _ = eh.load_empatica(data_folder='input/empatica/', useIBI=False, save=True, plotTrimmings=False, desired_sampling_rate=sr)
# i_raw = ih.load_inquisit(data_folder='input/inquisit/', save=True)
# ei_raw = h.combine_empatica_and_inquisit(e_raw, i_raw, save=True, sr=sr)
# ei_prep = h.clean_and_filter(save=True, normalise=None, sr=sr, window_length=wl)
# X, y, p = h.prepare_for_vae(sr=sr, wl=wl, filepath="output/ei_prep_original.csv", save=True, normalise=None) # Normalisation now happens later in the process. Normalise = False applies the standard scaler to the data.
X_train, X_val, X_test, y_train, y_val, y_test, p_train, p_val, p_test = h.prepare_train_val_test_sets(filenames=['output/dl_X_wl24_sr32_original.pkl', 'output/dl_y_wl24_sr32_original.pkl', 'output/dl_p_wl24_sr32_original.pkl'])
X_train, X_val, X_test = h.handle_outliers_and_impute(X_train, X_val, X_test, num_mad=4, verbose=True)
X_train, X_val, X_test = h.scale_features(X_train, X_val, X_test, p_train, p_val, p_test, normalise=True)

Train size:  80.23809523809524
Val size:  8.333333333333332
Test size:  11.428571428571429
Size: : (1011, 768, 6)
Initial imputation complete.
Missing values before outlier detection:
   Train  Validation  Test
0   0.00        0.00  0.00




Final imputation complete.
  Feature  Train  Validation  Test
0       0   0.59        0.00  0.00
1       1  15.58       14.62  6.23
2       2  10.81        0.00  0.00
3       3  11.32       15.51 13.71
4       4  13.67        0.00  0.00
5       5  23.94        9.54 13.29


## Handle outliers and impute

### Robust Mahalanobis distance (computationally way too expensive)

In [None]:
from sklearn.covariance import MinCovDet
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np

def handle_outliers_and_impute(X_train, X_val, X_test, random_state=0):
    # Number of features
    num_features = X_train.shape[2]

    for feature in range(num_features):
        print(f"Processing feature {feature+1} out of {num_features}")
        
        # Select the feature from each dataset
        X_train_feature = X_train[:, :, feature]
        X_val_feature = X_val[:, :, feature]
        X_test_feature = X_test[:, :, feature]

        # Fit the IterativeImputer on the training set and transform training, validation, and test sets
        imputer = IterativeImputer(random_state=random_state)
        X_train_feature = imputer.fit_transform(X_train_feature)
        X_val_feature = imputer.transform(X_val_feature)
        X_test_feature = imputer.transform(X_test_feature)
        print(f"Feature {feature}: pre-outlier detection imputation complete.")
        
        # Robust Mahalanobis Distance
        robust_cov = MinCovDet().fit(X_train_feature)
        maha_dist = robust_cov.mahalanobis(X_train_feature)
        threshold = np.percentile(maha_dist, 99.7)  # 3 standard deviations
        outliers_train = maha_dist > threshold
        print(f"Feature {feature}: Percentage of outliers in training set: {np.mean(outliers_train) * 100:.2f}%")

        # Identify outliers in validation and test sets
        maha_dist_val = robust_cov.mahalanobis(X_val_feature)
        outliers_val = maha_dist_val > threshold
        print(f"Feature {feature}: Percentage of outliers in validation set: {np.mean(outliers_val) * 100:.2f}%")

        maha_dist_test = robust_cov.mahalanobis(X_test_feature)
        outliers_test = maha_dist_test > threshold
        print(f"Feature {feature}: Percentage of outliers in test set: {np.mean(outliers_test) * 100:.2f}%")

        # Create a mask for inliers
        inliers_train = ~outliers_train

        # Fit the IterativeImputer on the inliers in the training set
        imputer = IterativeImputer(random_state=random_state)
        imputer.fit(X_train_feature[inliers_train])

        # Use the fitted imputer to predict the values for the outliers in the training, validation, and test sets
        for dataset, outliers in zip([X_train_feature, X_val_feature, X_test_feature], [outliers_train, outliers_val, outliers_test]):
            dataset[outliers] = imputer.transform(dataset[outliers.reshape(-1,1)])

        # Assign the processed feature back to the original datasets
        X_train[:, :, feature] = X_train_feature
        X_val[:, :, feature] = X_val_feature
        X_test[:, :, feature] = X_test_feature

    return X_train, X_val, X_test

X_train, X_val, X_test = handle_outliers_and_impute(X_train, X_val, X_test)

### MAD

In [48]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from scipy.stats import median_abs_deviation
import pandas as pd

def handle_outliers_and_impute(X_train, X_val, X_test, random_state=42, num_mad=3, verbose=False):
    # Number of features
    num_features = X_train.shape[2]

    # Impute missing values before outlier detection
    imputer = IterativeImputer(random_state=random_state)

    # Reshape the data to 2D, impute, then reshape back to 3D
    X_train_shape = X_train.shape
    X_val_shape = X_val.shape
    X_test_shape = X_test.shape
    
    X_train = imputer.fit_transform(X_train.reshape(-1, X_train_shape[-1])).reshape(X_train_shape)
    X_val = imputer.transform(X_val.reshape(-1, X_val_shape[-1])).reshape(X_val_shape)
    X_test = imputer.transform(X_test.reshape(-1, X_test_shape[-1])).reshape(X_test_shape)

    print("Initial imputation complete.")

    # Print missing values
    if verbose:
        print("Missing values before outlier detection:")
        print(pd.DataFrame({
            'Train': [np.mean(np.isnan(X_train))],
            'Validation': [np.mean(np.isnan(X_val))],
            'Test': [np.mean(np.isnan(X_test))]
        }))

    # Initialize arrays to store outliers
    outliers_train = np.zeros_like(X_train, dtype=bool)
    outliers_val = np.zeros_like(X_val, dtype=bool)
    outliers_test = np.zeros_like(X_test, dtype=bool)

    # Initialize DataFrame to store percentage of outliers
    outliers_df = pd.DataFrame(columns=['Feature', 'Train', 'Validation', 'Test'])

    for feature in range(num_features):
        # Select the feature from each dataset
        X_train_feature = X_train[:, :, feature]
        X_val_feature = X_val[:, :, feature]
        X_test_feature = X_test[:, :, feature]

        # Median Absolute Deviation
        mad = median_abs_deviation(X_train_feature)
        threshold = num_mad * mad  # 3x median absolute deviation as threshold

        outliers_train[:, :, feature] = np.abs(X_train_feature - np.median(X_train_feature)) > threshold
        outliers_val[:, :, feature] = np.abs(X_val_feature - np.median(X_val_feature)) > threshold
        outliers_test[:, :, feature] = np.abs(X_test_feature - np.median(X_test_feature)) > threshold

        # Add percentage of outliers to DataFrame
        outliers_df = pd.concat([outliers_df, pd.DataFrame({
            'Feature': feature,
            'Train': np.mean(outliers_train[:, :, feature]) * 100,
            'Validation': np.mean(outliers_val[:, :, feature]) * 100,
            'Test': np.mean(outliers_test[:, :, feature]) * 100
        }, index=[0])], ignore_index=True)

    # Replace outliers with np.nan in the original datasets
    X_train = np.where(outliers_train, np.nan, X_train)
    X_val = np.where(outliers_val, np.nan, X_val)
    X_test = np.where(outliers_test, np.nan, X_test)

    # Impute missing values after outlier detection
    X_train = imputer.fit_transform(X_train.reshape(-1, X_train_shape[-1])).reshape(X_train_shape)
    X_val = imputer.transform(X_val.reshape(-1, X_val_shape[-1])).reshape(X_val_shape)
    X_test = imputer.transform(X_test.reshape(-1, X_test_shape[-1])).reshape(X_test_shape)

    print("Final imputation complete.")

    # Print DataFrame of outliers
    if verbose:
        print(outliers_df)

    return X_train, X_val, X_test

X_train_imp_3, X_val_imp_3, X_test_imp_3 = handle_outliers_and_impute(X_train, X_val, X_test, verbose=True)
X_train_imp_4, X_val_imp_4, X_test_imp_4 = handle_outliers_and_impute(X_train, X_val, X_test, num_mad=4)

Initial imputation complete.
Missing values before outlier detection:
   Train  Validation  Test
0   0.00        0.00  0.00


  outliers_df = pd.concat([outliers_df, pd.DataFrame({


Final imputation complete.
  Feature  Train  Validation  Test
0       0   2.28        3.60  0.00
1       1  18.39       31.15 15.65
2       2   6.89        2.01  0.38
3       3  10.19        9.69 16.01
4       4  13.12        0.02 14.30
5       5  27.09       22.64 28.72
Initial imputation complete.


  outliers_df = pd.concat([outliers_df, pd.DataFrame({


Final imputation complete.


## Per-participant outlier detection and imputation

In [33]:
def handle_outliers_and_impute(X_train, X_val, X_test, p_train, p_val, p_test, random_state=0, num_mad=3):
    # Number of features
    num_features = X_train.shape[2]

    # Initialize arrays to store outliers
    outliers_train = np.zeros_like(X_train, dtype=bool)
    outliers_val = np.zeros_like(X_val, dtype=bool)
    outliers_test = np.zeros_like(X_test, dtype=bool)

    # Initialize DataFrame to store percentage of outliers
    outliers_df = pd.DataFrame(columns=['Feature', 'Train', 'Validation', 'Test'])

    # Unique participants
    unique_participants = np.unique(np.concatenate([p_train, p_val, p_test]))

    for participant in unique_participants:
        # Get indices for this participant
        train_indices = np.where(p_train == participant)[0]
        val_indices = np.where(p_val == participant)[0]
        test_indices = np.where(p_test == participant)[0]

        for feature in range(num_features):
            # Select the feature from each dataset for this participant
            X_train_feature = X_train[train_indices, :, feature]
            X_val_feature = X_val[val_indices, :, feature]
            X_test_feature = X_test[test_indices, :, feature]

            # Median Absolute Deviation
            mad_train = median_abs_deviation(X_train_feature)
            mad_val = median_abs_deviation(X_val_feature)
            mad_test = median_abs_deviation(X_test_feature)

            threshold_train = num_mad * mad_train
            threshold_val = num_mad * mad_val
            threshold_test = num_mad * mad_test

            outliers_train[train_indices, :, feature] = np.abs(X_train_feature - np.median(X_train_feature)) > threshold_train
            outliers_val[val_indices, :, feature] = np.abs(X_val_feature - np.median(X_val_feature)) > threshold_val
            outliers_test[test_indices, :, feature] = np.abs(X_test_feature - np.median(X_test_feature)) > threshold_test

            # Add percentage of outliers to DataFrame
            outliers_df = pd.concat([outliers_df, pd.DataFrame({
                'Feature': feature,
                'Train': np.mean(outliers_train[train_indices, :, feature]) * 100,
                'Validation': np.mean(outliers_val[val_indices, :, feature]) * 100,
                'Test': np.mean(outliers_test[test_indices, :, feature]) * 100
            }, index=[0])], ignore_index=True)

    # Replace outliers with np.nan in the original datasets
    X_train = np.where(outliers_train, np.nan, X_train)
    X_val = np.where(outliers_val, np.nan, X_val)
    X_test = np.where(outliers_test, np.nan, X_test)

    # Impute missing values
    imputer = IterativeImputer(random_state=random_state) # estimatorestimator object, default=BayesianRidge()

    # Reshape the data to 2D, impute, then reshape back to 3D
    X_train_shape = X_train.shape
    X_val_shape = X_val.shape
    X_test_shape = X_test.shape
    
    X_train = imputer.fit_transform(X_train.reshape(-1, X_train_shape[-1])).reshape(X_train_shape)
    X_val = imputer.transform(X_val.reshape(-1, X_val_shape[-1])).reshape(X_val_shape)
    X_test = imputer.transform(X_test.reshape(-1, X_test_shape[-1])).reshape(X_test_shape)

    print("Imputation complete.")

    # Print DataFrame of outliers
    print(outliers_df)

    return X_train, X_val, X_test

X_train_imp_3_pp, X_val_imp_3_pp, X_test_imp_3_pp = handle_outliers_and_impute(X_train, X_val, X_test, p_train, p_val, p_test)
X_train_imp_4_pp, X_val_imp_4_pp, X_test_imp_4_pp = handle_outliers_and_impute(X_train, X_val, X_test, p_train, p_val, p_test, num_mad=4)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  outliers_df = pd.concat([outliers_df, pd.DataFrame({


Imputation complete.
   Feature  Train  Validation  Test
0        0    NaN         NaN 28.48
1        1    NaN         NaN 10.56
2        2    NaN         NaN  8.81
3        3    NaN         NaN 13.63
4        4    NaN         NaN  2.30
5        5    NaN         NaN 23.90
6        0   3.65         NaN   NaN
7        1  22.73         NaN   NaN
8        2  10.09         NaN   NaN
9        3  19.14         NaN   NaN
10       4  17.63         NaN   NaN
11       5  26.02         NaN   NaN
12       0   3.68         NaN   NaN
13       1  14.37         NaN   NaN
14       2  12.07         NaN   NaN
15       3  20.10         NaN   NaN
16       4   4.58         NaN   NaN
17       5  29.00         NaN   NaN
18       0  15.42         NaN   NaN
19       1  29.09         NaN   NaN
20       2  11.74         NaN   NaN
21       3  11.87         NaN   NaN
22       4  22.22         NaN   NaN
23       5  27.48         NaN   NaN
24       0  36.83         NaN   NaN
25       1  15.74         NaN   NaN
26     

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  outliers_df = pd.concat([outliers_df, pd.DataFrame({


Imputation complete.
   Feature  Train  Validation  Test
0        0    NaN         NaN 28.47
1        1    NaN         NaN  6.00
2        2    NaN         NaN  3.31
3        3    NaN         NaN  6.32
4        4    NaN         NaN  1.82
5        5    NaN         NaN 19.24
6        0   0.00         NaN   NaN
7        1  17.82         NaN   NaN
8        2   6.86         NaN   NaN
9        3  14.17         NaN   NaN
10       4  15.29         NaN   NaN
11       5  21.63         NaN   NaN
12       0   0.00         NaN   NaN
13       1   9.80         NaN   NaN
14       2   3.71         NaN   NaN
15       3  13.66         NaN   NaN
16       4   0.89         NaN   NaN
17       5  24.65         NaN   NaN
18       0  15.24         NaN   NaN
19       1  25.91         NaN   NaN
20       2  11.11         NaN   NaN
21       3   9.02         NaN   NaN
22       4  18.95         NaN   NaN
23       5  23.35         NaN   NaN
24       0  36.02         NaN   NaN
25       1  10.45         NaN   NaN
26     

## Evaluation

In [41]:
# %pip install openpyxl
num_features = X_train.shape[2]
data = []

# Assuming 'temp', 'bvp', 'hr', 'body_acc', 'eda_tonic', 'eda_phasic' are feature names
feature_names = ['temp', 'bvp', 'hr', 'body_acc', 'eda_tonic', 'eda_phasic']

for i in range(num_features):
    # Select the feature from each dataset
    X_feature = X_train[:, :, i]
    X_feature_imp_3 = X_train_imp_3[:, :, i]
    X_feature_imp_4 = X_train_imp_4[:, :, i]

    data.append({
        'Feature': feature_names[i],
        'Original Mean': np.nanmean(X_feature),
        'Imputed Mean (3MAD)': np.mean(X_feature_imp_3),
        'Imputed Mean (4MAD)': np.mean(X_feature_imp_4),
        'Original Std': np.nanstd(X_feature),
        'Imputed Std (3MAD)': np.std(X_feature_imp_3),
        'Imputed Std (4MAD)': np.std(X_feature_imp_4),
        'Original Min': np.min(X_feature),
        'Imputed Min (3MAD)': np.min(X_feature_imp_3),
        'Imputed Min (4MAD)': np.min(X_feature_imp_4),
        'Original Max': np.max(X_feature),
        'Imputed Max (3MAD)': np.max(X_feature_imp_3),
        'Imputed Max (4MAD)': np.max(X_feature_imp_4)
    })
    
pd.set_option('display.max_columns', None)

df = pd.DataFrame(data)
# Make the feature a column instead of an index

df = df.transpose()
print(df)
df= df.to_excel('output/imputation_stats.xlsx', index=True)


                        0        1      2         3          4           5
Feature              temp      bvp     hr  body_acc  eda_tonic  eda_phasic
Original Mean       29.43     0.01  79.35     64.49       0.56        0.00
Imputed Mean (3MAD) 29.51     0.56  77.01     64.41       0.49        0.00
Imputed Mean (4MAD) 29.52     0.17  77.52     64.41       0.50        0.00
Original Std         2.49    78.74  17.15      2.81       0.48        0.04
Imputed Std (3MAD)   2.04    27.91  14.64      0.48       0.30        0.01
Imputed Std (4MAD)   2.08    33.80  15.21      0.56       0.32        0.01
Original Min        21.91 -1578.11  55.30      6.16       0.02       -1.00
Imputed Min (3MAD)  25.10  -384.55  49.93     53.44      -0.27       -0.07
Imputed Min (4MAD)  25.10  -522.05  55.30     50.05       0.03       -0.09
Original Max        34.80  2222.22 149.34    185.16       3.34        0.73
Imputed Max (3MAD)  33.25   328.27 149.34     76.48       1.78        0.07
Imputed Max (4MAD)  33.66

In [34]:
# %pip install openpyxl
num_features = X_train.shape[2]
data = []

# Assuming 'temp', 'bvp', 'hr', 'body_acc', 'eda_tonic', 'eda_phasic' are feature names
feature_names = ['temp', 'bvp', 'hr', 'body_acc', 'eda_tonic', 'eda_phasic']

# Adapt loop for new datasets
df_names = {
    'original' : X_train,
    'imputed_3' : X_train_imp_3,
    'imputed_3_pp' : X_train_imp_3_pp,
    'imputed_4' : X_train_imp_4,
    'imputed_4_pp' : X_train_imp_4_pp
}

for i in range(num_features):
    for name, dataset in df_names.items():
        # Select the feature from each dataset
        X_feature = dataset[:, :, i]

        data.append({
            'Feature': feature_names[i],
            'Dataset': name,
            'Mean': np.nanmean(X_feature),
            'Std': np.nanstd(X_feature),
            'Min': np.nanmin(X_feature),
            'Max': np.nanmax(X_feature)
        })
    
pd.set_option('display.max_columns', None)

df = pd.DataFrame(data)
# df.to_excel('output/imputation_stats.xlsx', index=False)
print(df)

       Feature       Dataset  Mean   Std      Min     Max
0         temp      original 29.43  2.49    21.91   34.80
1         temp     imputed_3 29.51  2.04    25.10   33.25
2         temp  imputed_3_pp 29.51  2.04    25.10   33.25
3         temp     imputed_4 29.52  2.08    25.10   33.66
4         temp  imputed_4_pp 29.52  2.08    25.10   33.66
5          bvp      original  0.01 78.74 -1578.11 2222.22
6          bvp     imputed_3  0.56 27.91  -384.55  328.27
7          bvp  imputed_3_pp  0.56 27.91  -384.55  328.27
8          bvp     imputed_4  0.17 33.80  -522.05  508.66
9          bvp  imputed_4_pp  0.17 33.80  -522.05  508.66
10          hr      original 79.35 17.15    55.30  149.34
11          hr     imputed_3 77.01 14.64    49.93  149.34
12          hr  imputed_3_pp 77.01 14.64    49.93  149.34
13          hr     imputed_4 77.52 15.21    55.30  149.34
14          hr  imputed_4_pp 77.52 15.21    55.30  149.34
15    body_acc      original 64.49  2.81     6.16  185.16
16    body_acc

## Comparison of normalised data pre and post imputation

In [59]:
X_train, X_val, X_test, y_train, y_val, y_test, p_train, p_val, p_test = h.prepare_train_val_test_sets(filenames=['output/dl_X_wl24_sr32_original.pkl', 'output/dl_y_wl24_sr32_original.pkl', 'output/dl_p_wl24_sr32_original.pkl'])

X_train_pre, X_val_pre, X_test_pre = h.scale_features(X_train, X_val, X_test, p_train, p_val, p_test, normalise=True)
X_train, X_val, X_test = h.handle_outliers_and_impute(X_train, X_val, X_test, num_mad=4, verbose=True)
X_train_post, X_val_post, X_test_post = h.scale_features(X_train, X_val, X_test, p_train, p_val, p_test, normalise=True)

import pandas as pd

def create_stats_df(X_pre, X_post):
    stats_df = pd.DataFrame({
        'mean_pre': np.mean(X_pre, axis=(0, 1)),
        'mean_post': np.mean(X_post, axis=(0, 1)),
        'std_pre': np.std(X_pre, axis=(0, 1)),
        'std_post': np.std(X_post, axis=(0, 1)),
        'min_pre': np.min(X_pre, axis=(0, 1)),
        'min_post': np.min(X_post, axis=(0, 1)),
        'max_pre': np.max(X_pre, axis=(0, 1)),
        'max_post': np.max(X_post, axis=(0, 1))
    })

    return stats_df

train_stats_df = create_stats_df(X_train_pre, X_train_post)
val_stats_df = create_stats_df(X_val_pre, X_val_post)
test_stats_df = create_stats_df(X_test_pre, X_test_post)

print("TRAIN STATS")
print(train_stats_df)
print("VAL STATS")
print(val_stats_df)
print("TEST STATS")
print(test_stats_df)

train_stats_df.to_excel('output/train_imputation_stats_prepost_norm.xlsx', index=True)
val_stats_df.to_excel('output/val_imputation_stats_prepost_norm.xlsx', index=True)
test_stats_df.to_excel('output/test_imputation_stats_prepost_norm.xlsx', index=True)

Train size:  80.23809523809524
Val size:  8.333333333333332
Test size:  11.428571428571429
Size: : (1011, 768, 6)
[2.94260982e+01 1.18182592e-02 7.93496987e+01 6.44916532e+01
 5.63933428e-01 2.32704741e-05]
Initial imputation complete.
Missing values before outlier detection:
   Train  Validation  Test
0   0.00        0.00  0.00




Final imputation complete.
  Feature  Train  Validation  Test
0       0   0.59        0.00  0.00
1       1  15.58       14.62  6.23
2       2  10.81        0.00  0.00
3       3  11.32       15.51 13.71
4       4  13.67        0.00  0.00
5       5  23.94        9.54 13.29
TRAIN STATS
   mean_pre  mean_post  std_pre  std_post  min_pre  min_post  max_pre  \
0      0.00       0.00     1.00      1.00    -3.56     -5.13     2.96   
1     -0.00       0.00     1.00      1.00   -36.92     -4.59    53.08   
2     -0.00      -0.00     1.00      1.00    -2.97     -2.97     5.64   
3      0.00      -0.00     1.00      1.00   -19.71     -4.17    66.65   
4     -0.00       0.00     1.00      1.00    -6.12     -6.12     8.20   
5      0.00      -0.00     1.00      1.00   -23.81     -9.04    25.21   

   max_post  
0      2.96  
1      4.99  
2      4.17  
3      4.32  
4      8.20  
5      9.35  
VAL STATS
   mean_pre  mean_post  std_pre  std_post  min_pre  min_post  max_pre  \
0     -0.00      -0.00 

In [57]:

X_train.shape

(1011, 768, 6)