# Initial Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from collections import defaultdict
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore", category = UserWarning)

# Preprocessing

In [2]:
# Preprocess dataset (Outputs: ids, outcomes_df, outcomes_df_shuffled, and one predictor DataFrame per sensor)
# Import dataset, treating infinite values as missing
df = pd.read_csv("./../../Datasets/kieranFeatures_1-31_21-Jan-2025_avgof3_rawSA.csv").replace([np.inf, -np.inf], np.nan)

# Create Low vs High Columns
# NOTE: a missing (NaN) score compares as False, so such rows are labelled "not low"
df["Lv_1_Lo"] = (df["SA1"] < 5).astype(np.bool_)
df["Lv_2_Lo"] = (df["SA2"] < 5).astype(np.bool_)
df["Lv_3_Lo"] = (df["SA3"] < 5).astype(np.bool_)
df["Tot_Lo"] = (df["SAtotal"] < 15).astype(np.bool_)

# Impute missing values with mean of column
# NOTE(review): the means are computed over the full dataset, before any train/test split
for col in df.columns:
    df[col] = df[col].fillna(value = df[col].mean())

# Participant IDs (passed to evaluate_models to stratify the cross-validation folds)
ids = df["ID"]

# Outcome labels, plus a copy whose labels get permuted independently per column
outcomes_df = df[["Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]]
outcomes_df_shuffled = outcomes_df.copy()

# Shuffle labels for shuffled data; the seed is fixed so the shuffled baseline
# is identical on every Restart & Run All
shuffle_rng = np.random.default_rng(42)
for label_col in outcomes_df_shuffled.columns:
    outcomes_df_shuffled[label_col] = shuffle_rng.permutation(outcomes_df_shuffled[label_col].to_numpy())

# Divide up dataframe into one predictor frame per sensor (selected by column-name prefix)
ECG_df = df.loc[:, [col for col in df if col.startswith("ECG")]]
EDA_df = df.loc[:, [col for col in df if col.startswith("EDA")]]
EEG_df = df.loc[:, [col for col in df if col.startswith("EEG")]]
EYE_df = df.loc[:, [col for col in df if col.startswith("EYE")]]
fNIRS_df = df.loc[:, [col for col in df if col.startswith("fNIRS")]]
RSP_df = df.loc[:, [col for col in df if col.startswith("RSP")]]

# Free up memory
del col, df, label_col, shuffle_rng

In [3]:
# Build the empty result containers: f1_scores, accuracy_scores, CV_models, and modified_Ridge_features.
# Each one is laid out as container[dataset type][sensor][outcome level] -> empty list.
sensor_names = ["ECG", "EDA", "EEG", "EYE", "fNIRS", "RSP"]
dataset_types = ["Actual", "Shuffled"]
levels = ["Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]

# The generator yields four independent nested dictionaries (nothing is shared between them)
f1_scores, accuracy_scores, CV_models, modified_Ridge_features = (
    {
        dataset: {
            sensor: {level: [] for level in levels}
            for sensor in sensor_names
        }
        for dataset in dataset_types
    }
    for _ in range(4)
)

# Only sensor_names and the four containers are kept for later cells
del dataset_types, levels

# Training

In [4]:
def evaluate_models(pred_df, out_df, ids):
    """Fit and score four logistic-regression variants with 10-fold cross-validation.

    For every fold the predictors are standardized with statistics taken from the
    training split only, then four models are fit: an unpenalized model, a Ridge
    (L2) model, a LASSO (L1) model, and a "Modified Ridge" model, which is an L2
    model refit on the features that SelectFromModel keeps from the fitted Ridge
    model. Each model is scored on the held-out split with F1 and accuracy.

    Parameters
    ----------
    pred_df : DataFrame
        Predictors; rows are selected positionally with ``.iloc``.
    out_df : Series
        Outcome labels aligned row-for-row with ``pred_df``. They are scored with
        ``f1_score`` using its default settings.
    ids : array-like
        Labels handed to ``StratifiedKFold.split`` as the stratification target
        (the participant IDs in this notebook).

    Returns
    -------
    f1_scores, accuracy_scores, models : defaultdict(list)
        Keyed by model name ("No Penalty", "Ridge", "LASSO",
        "Modified Ridge (L2)"); each list holds one entry per fold.
    modified_Ridge_selected_features : list
        One boolean feature mask per fold, as returned by
        ``SelectFromModel.get_support()``.
    """
    f1_scores = defaultdict(list)
    accuracy_scores = defaultdict(list)
    models = defaultdict(list)
    modified_Ridge_selected_features = []

    # Settings shared by every model. random_state is included for all of them
    # (the Modified Ridge model previously had none) because the saga solver is
    # stochastic and results would otherwise change between runs.
    shared_params = dict(
        fit_intercept = False,
        solver = "saga",
        n_jobs = -1,
        max_iter = 20000,
        class_weight = "balanced",
        random_state = 42
    )

    # Obtain 10 test folds stratifying by participant ID
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    for CV_idx, test_idx in skf.split(pred_df, ids):
        # Train-Test Split for the Fold
        out_CV = out_df.iloc[CV_idx].values
        out_test = out_df.iloc[test_idx].values

        # Standardize Data (scaler is fit on the training split only)
        scaler = StandardScaler()
        pred_CV_normalized = scaler.fit_transform(pred_df.iloc[CV_idx, :].values)
        pred_test_normalized = scaler.transform(pred_df.iloc[test_idx, :].values)

        # Initialize Models to Train
        no_penalty_model = LogisticRegression(penalty = None, **shared_params)
        Ridge_model = LogisticRegression(penalty = "l2", C = 0.001, **shared_params)
        LASSO_model = LogisticRegression(penalty = "l1", C = 1, **shared_params)
        modified_Ridge_model_l2 = LogisticRegression(penalty = "l2", C = 0.001, **shared_params)

        # Fit Models
        no_penalty_model.fit(pred_CV_normalized, out_CV)
        Ridge_model.fit(pred_CV_normalized, out_CV)
        LASSO_model.fit(pred_CV_normalized, out_CV)

        # Obtain and fit "Modified Ridge" Model on the features kept by the Ridge selector
        Ridge_selector = SelectFromModel(Ridge_model, prefit = True)
        Ridge_selected_features = Ridge_selector.get_support()
        pred_CV_selected_Ridge = pred_CV_normalized[:, Ridge_selected_features]
        pred_test_selected_Ridge = pred_test_normalized[:, Ridge_selected_features]
        modified_Ridge_model_l2.fit(pred_CV_selected_Ridge, out_CV)

        # Pair each fitted model with the test matrix it must be scored on
        fold_results = [
            ("No Penalty", no_penalty_model, pred_test_normalized),
            ("Ridge", Ridge_model, pred_test_normalized),
            ("LASSO", LASSO_model, pred_test_normalized),
            ("Modified Ridge (L2)", modified_Ridge_model_l2, pred_test_selected_Ridge),
        ]

        # Predict once per model, then record both scores and the model itself
        for model_name, fitted_model, test_matrix in fold_results:
            predictions = fitted_model.predict(test_matrix)
            f1_scores[model_name].append(f1_score(out_test, predictions))
            accuracy_scores[model_name].append(accuracy_score(out_test, predictions))
            models[model_name].append(fitted_model)

        # Store the features selected for the Modified Ridge model
        modified_Ridge_selected_features.append(Ridge_selected_features)

    return f1_scores, accuracy_scores, models, modified_Ridge_selected_features

In [7]:
# Train the per-sensor models for the Level 2, Level 3, and Total outcomes,
# once against the real labels and once against the shuffled labels.
for outcome_var in ["Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]:
    for outcome_type in ["Actual", "Shuffled"]:
        # Pick the label table that matches the dataset type
        label_table = outcomes_df if outcome_type == "Actual" else outcomes_df_shuffled

        for sensor, df in [("ECG", ECG_df), ("EDA", EDA_df), ("EEG", EEG_df), ("EYE", EYE_df), ("fNIRS", fNIRS_df), ("RSP", RSP_df)]:
            fold_f1, fold_accuracy, fold_models, fold_features = evaluate_models(df, label_table[outcome_var], ids)

            # File the four results under [dataset type][sensor][outcome]
            f1_scores[outcome_type][sensor][outcome_var] = fold_f1
            accuracy_scores[outcome_type][sensor][outcome_var] = fold_accuracy
            CV_models[outcome_type][sensor][outcome_var] = fold_models
            modified_Ridge_features[outcome_type][sensor][outcome_var] = fold_features

            print("Completed Training for", outcome_type, outcome_var, sensor)

Completed Training for Actual Lv_2_Lo ECG
Completed Training for Actual Lv_2_Lo EDA
Completed Training for Actual Lv_2_Lo EEG
Completed Training for Actual Lv_2_Lo EYE
Completed Training for Actual Lv_2_Lo fNIRS
Completed Training for Actual Lv_2_Lo RSP
Completed Training for Shuffled Lv_2_Lo ECG
Completed Training for Shuffled Lv_2_Lo EDA
Completed Training for Shuffled Lv_2_Lo EEG
Completed Training for Shuffled Lv_2_Lo EYE
Completed Training for Shuffled Lv_2_Lo fNIRS
Completed Training for Shuffled Lv_2_Lo RSP
Completed Training for Actual Lv_3_Lo ECG
Completed Training for Actual Lv_3_Lo EDA
Completed Training for Actual Lv_3_Lo EEG
Completed Training for Actual Lv_3_Lo EYE
Completed Training for Actual Lv_3_Lo fNIRS
Completed Training for Actual Lv_3_Lo RSP
Completed Training for Shuffled Lv_3_Lo ECG
Completed Training for Shuffled Lv_3_Lo EDA
Completed Training for Shuffled Lv_3_Lo EEG
Completed Training for Shuffled Lv_3_Lo EYE
Completed Training for Shuffled Lv_3_Lo fNIRS
Comp

In [6]:
# Train the per-sensor models for the Level 1 outcome,
# once against the real labels and once against the shuffled labels.
outcome_var = "Lv_1_Lo"
for outcome_type in ["Actual", "Shuffled"]:
    # Pick the label table that matches the dataset type
    label_table = outcomes_df if outcome_type == "Actual" else outcomes_df_shuffled

    for sensor, df in [("ECG", ECG_df), ("EDA", EDA_df), ("EEG", EEG_df), ("EYE", EYE_df), ("fNIRS", fNIRS_df), ("RSP", RSP_df)]:
        fold_f1, fold_accuracy, fold_models, fold_features = evaluate_models(df, label_table[outcome_var], ids)

        # File the four results under [dataset type][sensor][outcome]
        f1_scores[outcome_type][sensor][outcome_var] = fold_f1
        accuracy_scores[outcome_type][sensor][outcome_var] = fold_accuracy
        CV_models[outcome_type][sensor][outcome_var] = fold_models
        modified_Ridge_features[outcome_type][sensor][outcome_var] = fold_features

        print("Completed Training for", outcome_type, outcome_var, sensor)

Completed Training for Actual Lv_1_Lo ECG
Completed Training for Actual Lv_1_Lo EDA
Completed Training for Actual Lv_1_Lo EEG
Completed Training for Actual Lv_1_Lo EYE
Completed Training for Actual Lv_1_Lo fNIRS
Completed Training for Actual Lv_1_Lo RSP
Completed Training for Shuffled Lv_1_Lo ECG
Completed Training for Shuffled Lv_1_Lo EDA
Completed Training for Shuffled Lv_1_Lo EEG
Completed Training for Shuffled Lv_1_Lo EYE
Completed Training for Shuffled Lv_1_Lo fNIRS
Completed Training for Shuffled Lv_1_Lo RSP


In [8]:
# # Save data
# with open("./../../Pickle Files/3_Moving_Average_Model_by_Sensor.pkl", "wb") as f:
#     pickle.dump((f1_scores, accuracy_scores, CV_models, modified_Ridge_features), f)

In [11]:
# Restore the saved results from disk; this replaces whatever f1_scores,
# accuracy_scores, CV_models, and modified_Ridge_features currently hold in memory.
with open("./../../Pickle Files/3_Moving_Average_Model_by_Sensor.pkl", "rb") as f:
    saved_results = pickle.load(f)

# The pickle holds the four result containers as one tuple
f1_scores, accuracy_scores, CV_models, modified_Ridge_features = saved_results
del saved_results

In [10]:
# Display the fitted models, nested as [dataset type][sensor][outcome][model name] -> list of estimators
CV_models

{'Actual': {'ECG': {'Lv_1_Lo': defaultdict(list,
               {'No Penalty': [LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=20000,
                                    n_jobs=-1, penalty=None, random_state=42, solver='saga'),
                 LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=20000,
                                    n_jobs=-1, penalty=None, random_state=42, solver='saga'),
                 LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=20000,
                                    n_jobs=-1, penalty=None, random_state=42, solver='saga'),
                 LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=20000,
                                    n_jobs=-1, penalty=None, random_state=42, solver='saga'),
                 LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=20000,
                                    n_jobs=-1, penalty=None, random_sta