# Initial Imports

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# import pickle
# from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, train_test_split
# from sklearn.feature_selection import SelectFromModel
# from collections import defaultdict
# import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore", category = UserWarning)

# Preprocessing

In [2]:
# Preprocess dataset (Outputs: ids, outcomes_df, sensor_dfs and the per-sensor *_df frames)
# Import dataset; +/-inf is treated as missing so that it is imputed below
df = pd.read_csv("./../../Datasets/kieranFeatures_1-31_21-Jan-2025_avgof3_rawSA.csv").replace([np.inf, -np.inf], np.nan)

# Create Low vs High Columns (True = score below the cut-off)
# NOTE(review): a missing SA score compares as False here, i.e. it is silently
# labelled "not low" -- confirm the SA columns contain no missing values.
df["Lv_1_Lo"] = (df["SA1"] < 5).astype(np.bool_)
df["Lv_2_Lo"] = (df["SA2"] < 5).astype(np.bool_)
df["Lv_3_Lo"] = (df["SA3"] < 5).astype(np.bool_)
df["Tot_Lo"] = (df["SAtotal"] < 15).astype(np.bool_)

# Impute missing values with mean of column (vectorised; same result as a per-column loop)
df = df.fillna(df.mean())

# Split up dataset: ID column (used for stratification) and binary outcome labels
ids = df["ID"]
outcomes_df = df[["Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]]

# Divide up dataframe into one feature frame per sensor, selected by column-name prefix
ECG_df = df.loc[:, df.columns.str.startswith("ECG")]
EDA_df = df.loc[:, df.columns.str.startswith("EDA")]
EEG_df = df.loc[:, df.columns.str.startswith("EEG")]
EYE_df = df.loc[:, df.columns.str.startswith("EYE")]
fNIRS_df = df.loc[:, df.columns.str.startswith("fNIRS")]
RSP_df = df.loc[:, df.columns.str.startswith("RSP")]
sensor_dfs = [(ECG_df, "ECG"), (EDA_df, "EDA"), (EEG_df, "EEG"), (EYE_df, "EYE"), (fNIRS_df, "fNIRS"), (RSP_df, "RSP")]

# Free up memory
del df

# Hyperparameter Tuning

In [None]:
def get_sensor_model(df, level):
    """Tune the regularisation strength of a logistic regression for one sensor.

    The rows are split roughly 70/15/15 into train/validation/test, stratified
    on the module-level ``ids``; only the training portion is used here. The
    training features are standardised and a ``LogisticRegressionCV`` (default
    L2 penalty) is fitted over a grid of ``C`` values with 5-fold
    cross-validation, scored by F1.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns of a single sensor, with rows aligned to the
        module-level ``outcomes_df`` and ``ids``.
    level : str
        Name of the ``outcomes_df`` column to predict (e.g. ``"Lv_1_Lo"``).

    Returns
    -------
    LogisticRegressionCV
        The fitted model; ``C_`` holds the selected regularisation strength.
        The scaler fitted on the training data is not returned.
    """
    # Train-Test Split: hold out 15% as a test set (unused here)
    X_cv, _, y_cv, _ = train_test_split(df, outcomes_df, test_size = 0.15, random_state = 42, stratify = ids)
    # X_cv.index holds index *labels*, so look the IDs up with .loc; positional
    # .iloc only gives the right rows when the index is a default RangeIndex.
    ids_cv = ids.loc[X_cv.index]
    X_train, _, y_train, _ = train_test_split(X_cv, y_cv, test_size = 0.17647, random_state = 42, stratify = ids_cv) # 15% of original dataframe for validation set
    ids_train = ids.loc[X_train.index]

    # Scale the Training Data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Folds are stratified on the IDs (not on the outcome); materialise the
    # one-shot generator so the same folds are explicit and reusable.
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    cv_folds = list(skf.split(X_train_scaled, ids_train))

    # Setup CV model: L2-penalised (ridge-style) logistic regression
    sensor_model = LogisticRegressionCV(
        Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        fit_intercept = False,
        cv = cv_folds,
        scoring = "f1",
        solver = "saga",
        max_iter = 20000,
        class_weight = "balanced",
        n_jobs = -1,
        random_state = 42
    )

    sensor_model.fit(X_train_scaled, y_train[level])

    return sensor_model

## SA Level 1

In [4]:
for sensor_df, sensor_name in sensor_dfs:
    # Emit the sensor label first so progress is visible while the model fits
    print(f"{sensor_name}: ", end = "")
    # Selected regularisation strength for the Level 1 outcome
    print(get_sensor_model(df = sensor_df, level = "Lv_1_Lo").C_)

ECG: [1.]
EDA: [1.]
EEG: [0.01]
EYE: [0.01]
fNIRS: [0.1]
RSP: [0.1]
