# Initial Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import SelectFromModel, RFE, RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from collections import defaultdict

# Read one feature table per sensor, in the order ECG, EDA, EEG, EYE, fNIRS, RSP.
# The "ID" column is dropped from every table right after loading.
ECG_df, EDA_df, EEG_df, EYE_df, fNIRS_df, RSP_df = (
    pd.read_csv(sensor_csv).drop(columns = "ID")
    for sensor_csv in (
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_ECG-only.csv",
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_EDA-only.csv",
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_EEG-only.csv",
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_EYE-only.csv",
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_fNIRS-only.csv",
        "./Data by Sensor/kieranFeatures_1-30_24-Oct-2024_RSP-only.csv",
    )
)

# Pre-Processing

In [2]:
# Isolate the outcome variables: the last N_OUTCOME_COLS columns of the ECG table.
N_OUTCOME_COLS = 4

# Maps each adjusted SA score column to the name of its binary high/low label.
SCORE_TO_LABEL = {
    "adjSA1": "Lv_1_Hi",
    "adjSA2": "Lv_2_Hi",
    "adjSA3": "Lv_3_Hi",
    "adjSAtotal": "Tot_Hi",
}

outcome_scores = ECG_df.iloc[:, ECG_df.shape[1] - N_OUTCOME_COLS:]

# Create binary variables for high and low.
# A label is 1 (high) if the adjusted SA score is equal to or above that
# score's median, and 0 (low) otherwise.
# .assign() returns a new DataFrame, so the label columns are never written
# into a slice of ECG_df (which is what raises pandas' SettingWithCopyWarning).
# NOTE(review): np.median yields NaN if a score column contains missing values,
# which would make every label 0 -- confirm the score columns have no NaNs.
outcomes_df = outcome_scores.assign(**{
    label_col: (outcome_scores[score_col] >= np.median(outcome_scores[score_col])).astype(int)
    for score_col, label_col in SCORE_TO_LABEL.items()
})

# Remove the outcome columns from each sensor dataframe so only predictors remain.
# NOTE(review): assumes every sensor CSV ends with the same four outcome
# columns as the ECG one -- confirm against the files.
# NOTE: these lines rebind the *_df names, so this cell must be run exactly
# once after the loading cell; re-run the loading cell before running it again.
ECG_df = ECG_df.iloc[:, :ECG_df.shape[1] - N_OUTCOME_COLS]
EDA_df = EDA_df.iloc[:, :EDA_df.shape[1] - N_OUTCOME_COLS]
EEG_df = EEG_df.iloc[:, :EEG_df.shape[1] - N_OUTCOME_COLS]
EYE_df = EYE_df.iloc[:, :EYE_df.shape[1] - N_OUTCOME_COLS]
fNIRS_df = fNIRS_df.iloc[:, :fNIRS_df.shape[1] - N_OUTCOME_COLS]
RSP_df = RSP_df.iloc[:, :RSP_df.shape[1] - N_OUTCOME_COLS]

# Training Models

In [None]:
# Name of the binary outcome column in outcomes_df that the models predict.
outcome_var = "Lv_1_Hi"

# One fixed seed for the fold shuffling and the stochastic "saga" solver, so
# the scores below are identical on every Restart & Run All.
SEED = 42

# Maps model name -> list of held-out accuracies, one entry per outer fold.
test_fold_scores = defaultdict(list)

# Nested cross-validation on the ECG features.
# Outer loop: each of the 5 stratified folds is held out once as the test set.
# Inner loop (ECG_cv): 10 stratified folds on the remaining rows, used by each
# model to pick its regularisation strength / feature subset.
# NOTE(review): the features are fitted unscaled although StandardScaler is
# imported; "saga" converges fastest on similarly scaled features -- confirm
# whether scaling was intentionally left out.
ECG_skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = SEED)
for CV_idx, test_idx in ECG_skf.split(ECG_df, outcomes_df[outcome_var]):
    # Train-Test Split for the Fold
    ECG_pred_CV = ECG_df.iloc[CV_idx, :]
    ECG_pred_test = ECG_df.iloc[test_idx, :]
    ECG_out_CV = outcomes_df.iloc[CV_idx, :]
    ECG_out_test = outcomes_df.iloc[test_idx, :]

    # With an integer random_state the splitter yields the same inner folds on
    # every .split() call, so all four models are tuned on identical folds.
    ECG_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = SEED)

    # Initialize Models
    # Ridge: LogisticRegressionCV with its default L2 penalty.
    Ridge_model = LogisticRegressionCV(
        Cs = [0.001, 0.01, 0.1, 1, 10, 100],
        cv = ECG_cv,
        fit_intercept = False,
        solver = "saga",
        n_jobs = -1,
        max_iter = 10000,
        scoring = "accuracy",
        refit = True,
        random_state = SEED)

    # LASSO: same search but with an L1 penalty and a wider grid of C values.
    LASSO_model = LogisticRegressionCV(
        Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        cv = ECG_cv,
        fit_intercept = False,
        penalty = "l1",
        solver = "saga",
        n_jobs = -1,
        max_iter = 10000,
        scoring = "accuracy",
        refit = True,
        random_state = SEED)

    # Recursive feature elimination around a default (L2-penalised) logistic regression.
    RFE_model = RFECV(
        estimator = LogisticRegression(
            max_iter = 10000,
            solver = "saga",
            fit_intercept = False,
            random_state = SEED),
        cv = ECG_cv,
        scoring = "accuracy",
        n_jobs = -1)

    # Recursive feature elimination around an unpenalised logistic regression.
    RFE_No_Penalty_model = RFECV(
        estimator = LogisticRegression(
            penalty = None,
            max_iter = 10000,
            solver = "saga",
            fit_intercept = False,
            random_state = SEED),
        cv = ECG_cv,
        scoring = "accuracy",
        n_jobs = -1)

    # Keys are the names under which scores are stored in test_fold_scores.
    models = {
        "Ridge": Ridge_model,
        "LASSO": LASSO_model,
        "RFE": RFE_model,
        "RFE (No Penalty)": RFE_No_Penalty_model,
    }

    for model_name, model in models.items():
        # Fit with the inner CV splits on the non-held-out rows only ...
        model.fit(ECG_pred_CV, ECG_out_CV[outcome_var])
        # ... then append the refitted best model's score on the held-out fold.
        test_fold_scores[model_name].append(
            model.score(ECG_pred_test, ECG_out_test[outcome_var]))

NameError: name 'ECG_out_train' is not defined

In [10]:
# Report each model's mean score across the outer test folds.
for model_name, fold_scores in test_fold_scores.items():
    print(model_name, np.mean(fold_scores))

Ridge 0.5766338797814208
LASSO 0.5855081967213115
