In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# import pickle
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
# from sklearn.feature_selection import SelectFromModel
# from collections import defaultdict
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore", category = UserWarning)

# Preprocessing

In [5]:
# Preprocess dataset
# Outputs: ids, ECG_df, EDA_df, EEG_df, EYE_df, fNIRS_df, RSP_df, sensor_dfs,
#          ids_train, ids_test, outcomes_train, outcomes_test

# Import dataset; +/-inf are converted to NaN so they are imputed together with the missing values
df = pd.read_csv("./../../Datasets/kieranFeatures_1-31_21-Jan-2025_avgof3_rawSA.csv").replace([np.inf, -np.inf], np.nan)

# Create Low vs High columns (True = low score). The comparison already yields booleans.
# NOTE(review): a missing SA score compares as False and is therefore labelled "not low" -- confirm this is intended.
df["Lv_1_Lo"] = df["SA1"] < 5
df["Lv_2_Lo"] = df["SA2"] < 5
df["Lv_3_Lo"] = df["SA3"] < 5
df["Tot_Lo"] = df["SAtotal"] < 15

# Train-Test split (80/20), stratified by ID.
# The split is made BEFORE imputation so that the imputation statistics can be restricted to the training rows.
# NOTE(review): assumes the ID column itself has no missing values -- confirm.
ids = df["ID"]
ids_train, ids_test = train_test_split(ids, test_size = 0.2, random_state = 42, stratify = ids)

# Impute missing values with the column mean of the TRAINING rows only.
# Using the mean of the whole column would leak information from the held-out test rows into the training features.
for col in df.columns:
    df[col] = df[col].fillna(value = df.loc[ids_train.index, col].mean())

# Outcome labels for each partition (selected by index label)
outcomes_df = df[["Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]]
outcomes_train = outcomes_df.loc[ids_train.index]
outcomes_test = outcomes_df.loc[ids_test.index]

# Divide up dataframe: one feature frame per sensor, selected by column-name prefix
ECG_df = df.loc[:, [col for col in df if col.startswith("ECG")]]
EDA_df = df.loc[:, [col for col in df if col.startswith("EDA")]]
EEG_df = df.loc[:, [col for col in df if col.startswith("EEG")]]
EYE_df = df.loc[:, [col for col in df if col.startswith("EYE")]]
fNIRS_df = df.loc[:, [col for col in df if col.startswith("fNIRS")]]
RSP_df = df.loc[:, [col for col in df if col.startswith("RSP")]]
sensor_dfs = [(ECG_df, "ECG"), (EDA_df, "EDA"), (EEG_df, "EEG"), (EYE_df, "EYE"), (fNIRS_df, "fNIRS"), (RSP_df, "RSP")]

# Free up memory
del col, df, outcomes_df

# Tuning

In [6]:
def hyperparameter_tune_sensor_model(df, level, Cs = None):
    """
    Select the regularisation parameter C of a sensor's logistic-regression model.

    Only the rows of the module-level training split (`ids_train` / `outcomes_train`)
    are used; that 80/20 train-test split is made outside this function.
    Every candidate C is scored with 5-fold cross validation whose folds are
    stratified with respect to the ids. Within each fold the features are
    standardised with statistics of the fold's training part only.
    The candidate with the highest mean F1 score is selected; on a tie the
    candidate that comes first in `Cs` is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns of one sensor. Its index must contain the labels in
        `ids_train.index`.
    level : str
        Name of the column of `outcomes_train` to predict (e.g. "Lv_1_Lo").
    Cs : iterable of numbers, optional
        Candidate values for C. Defaults to the powers of ten from 0.001 to 1000.

    Returns
    -------
    number
        The selected C, or 1.0 if no candidate reaches a mean F1 score above 0.

    Raises
    ------
    KeyError
        If `df` lacks some of the training index labels, or `level` is not a
        column of `outcomes_train`.
    """

    if Cs is None:
        Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Training rows are selected by index LABEL (.loc), matching how outcomes_train was built.
    # Positional .iloc would only pick the right rows for a default 0..n-1 index.
    X_train, y_train = df.loc[ids_train.index], outcomes_train[level]

    # The splitter is seeded, so it yields the same folds on every call;
    # compute them once and reuse them for every candidate C.
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    folds = list(skf.split(X_train, ids_train))

    best_model_C = 1.0
    best_model_score = 0.0

    # Get Best C Hyperparameter
    for c_param in Cs:
        model_scores = []

        # Iterate Through All Folds (fold indices are positional, hence .iloc here)
        for train_index, test_index in folds:
            X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

            # Scaler and classifier are fitted together on the fold's training part;
            # predict() applies the fitted scaler to the validation part before classifying.
            model = Pipeline([
                ("scaler", StandardScaler()),
                ("classifier", LogisticRegression(
                    C = c_param,
                    solver = "saga",
                    max_iter = 20000,
                    class_weight = "balanced",
                    n_jobs = -1,
                    random_state = 42
                )),
            ])

            model.fit(X_train_fold, y_train_fold)
            model_scores.append(f1_score(y_test_fold, model.predict(X_test_fold)))

        # Update Best Model if CV Score is Better (strict ">" keeps the earlier C on ties)
        mean_score = np.mean(model_scores)
        if mean_score > best_model_score:
            best_model_C = c_param
            best_model_score = mean_score

    return best_model_C

In [8]:
# Report the tuned C of every sensor model, grouped by outcome level
for level in ("Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"):
    print(level)

    for sensor_df, sensor_name in sensor_dfs:
        # The label is written before tuning starts, so it shows which sensor is currently running
        print(f"{sensor_name}: ", end = "")
        print(hyperparameter_tune_sensor_model(sensor_df, level))

    # Blank line between levels
    print()

Lv_1_Lo
ECG: 0.01
EDA: 100
EEG: 10
EYE: 0.001
fNIRS: 0.01
RSP: 0.01

Lv_2_Lo
ECG: 10
EDA: 0.1
EEG: 0.1
EYE: 0.01
fNIRS: 0.001
RSP: 10

Lv_3_Lo
ECG: 10
EDA: 0.1
EEG: 0.01
EYE: 0.1
fNIRS: 0.001
RSP: 1000

Tot_Lo
ECG: 100
EDA: 10
EEG: 0.1
EYE: 0.01
fNIRS: 0.001
RSP: 1

