In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek

In [2]:
# Import dataset; +/-inf entries are converted to NaN so they are imputed below
df = pd.read_csv("./../../Datasets/kieranFeatures_1-31_21-Jan-2025_avgof3_rawSA.csv").replace([np.inf, -np.inf], np.nan)

# Create Low vs High Columns (True = score below the threshold)
# NOTE(review): these comparisons run before imputation, so a missing SA value
# compares as False and is labelled "not low" -- confirm the SA columns have no NaNs.
df["Lv_1_Lo"] = (df["SA1"] < 5).astype(np.bool_)
df["Lv_2_Lo"] = (df["SA2"] < 5).astype(np.bool_)
df["Lv_3_Lo"] = (df["SA3"] < 5).astype(np.bool_)
df["Tot_Lo"] = (df["SAtotal"] < 15).astype(np.bool_)

# Impute missing values with mean of column
# NOTE(review): the means are computed on the full dataset, i.e. before any
# cross-validation split, so held-out rows contribute to the imputed values.
# Consider moving imputation into the modelling pipeline.
for col in df.columns:
    df[col] = df[col].fillna(value = df[col].mean())

# Split up dataset
# NOTE(review): uint8 only represents 0-255 -- confirm every ID fits in that range.
ids = df["ID"].astype(np.uint8)
# trial_nums = df["trialNum"].astype(np.uint8)
predictors_df = df.drop(columns = ["ID", "trialNum", "SA1", "SA2", "SA3", "SAtotal", "Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]).astype(np.float64)
outcomes_df = df[["Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo"]]
outcomes_df_shuffled = outcomes_df.copy()

# Shuffle labels for shuffled data.
# A seeded generator is used so the shuffled control is identical on every run;
# each outcome column still receives its own independent permutation.
shuffle_rng = np.random.default_rng(42)
for outcome_col in outcomes_df_shuffled.columns:
    outcomes_df_shuffled[outcome_col] = shuffle_rng.permutation(outcomes_df_shuffled[outcome_col].to_numpy())

# Free up memory (and drop the loop temporaries from the notebook namespace)
del col, outcome_col, shuffle_rng, df

In [3]:
# Repeated stratified 10-fold cross-validation (3 repeats) with a fixed seed
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)

# Settings shared by every logistic-regression variant below.
# random_state is included for all of them because the "saga" solver is
# stochastic; previously modified_Ridge_model_l2 was the only model left unseeded.
shared_logreg_kwargs = dict(
    fit_intercept = False,
    solver = "saga",
    n_jobs = -1,
    max_iter = 20000,
    class_weight = "balanced",
    random_state = 42,
)

# Unpenalized logistic regression
no_penalty_model = LogisticRegression(penalty = None, **shared_logreg_kwargs)

# L2-penalized logistic regression (C = 0.001)
Ridge_model = LogisticRegression(penalty = "l2", C = 0.001, **shared_logreg_kwargs)

# L1-penalized logistic regression (C = 1)
LASSO_model = LogisticRegression(penalty = "l1", C = 1, **shared_logreg_kwargs)

# Second L2 model with the same hyper-parameters as Ridge_model
modified_Ridge_model_l2 = LogisticRegression(penalty = "l2", C = 0.001, **shared_logreg_kwargs)

# No Penalty

In [6]:
# No Penalty Control: scaling + unpenalized model, no resampling step
pipeline = Pipeline([
    ("transformer", StandardScaler()),
    ("estimator", no_penalty_model),
])
# Folds are generated by stratifying on `ids`, not on the outcome column
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
scores = cross_val_score(
    pipeline,
    predictors_df,
    outcomes_df["Lv_1_Lo"],
    scoring = "f1",
    cv = cv.split(predictors_df, ids),
    n_jobs = -1,
)
print("*** Mean F1: %.3f" % np.mean(scores))
print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.376
*** Median F1: 0.364


In [4]:
# No Penalty (Just Over): one CV run per over-sampling method, in the order listed
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("estimator", no_penalty_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.377
*** Median F1: 0.369




*** Mean F1: 0.374
*** Median F1: 0.358




*** Mean F1: 0.373
*** Median F1: 0.369




*** Mean F1: 0.384
*** Median F1: 0.375


In [5]:
# No Penalty (Over and Under): each over-sampler followed by random under-sampling
# NOTE(review): the first two results printed by this cell equal those of the
# "Just Over" cell, which suggests the under-sampling step may have no effect
# when the over-sampler already balances the classes -- confirm.
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("under", RandomUnderSampler(random_state = 42)),
        ("estimator", no_penalty_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.377
*** Median F1: 0.369




*** Mean F1: 0.374
*** Median F1: 0.358




*** Mean F1: 0.389
*** Median F1: 0.400




*** Mean F1: 0.386
*** Median F1: 0.375


In [4]:
# No Penalty (Over and Under from library): combined resamplers from imblearn.combine
for combination_method in (
    SMOTEENN(random_state = 42),
    SMOTETomek(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("combine", combination_method),
        ("estimator", no_penalty_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.390
*** Median F1: 0.386




*** Mean F1: 0.376
*** Median F1: 0.388


In [6]:
# No Penalty (Varying Under): SVMSMOTE over-sampling paired with each under-sampler
for undersampling_method in (
    RandomUnderSampler(random_state = 42),
    EditedNearestNeighbours(),
    TomekLinks(),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", SVMSMOTE(random_state = 42)),
        ("under", undersampling_method),
        ("estimator", no_penalty_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.389
*** Median F1: 0.400




*** Mean F1: 0.373
*** Median F1: 0.375




*** Mean F1: 0.366
*** Median F1: 0.375


# Ridge

In [8]:
# Ridge Control: scaling + L2-penalized model, no resampling step
pipeline = Pipeline([
    ("transformer", StandardScaler()),
    ("estimator", Ridge_model),
])
# Folds are generated by stratifying on `ids`, not on the outcome column
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
scores = cross_val_score(
    pipeline,
    predictors_df,
    outcomes_df["Lv_1_Lo"],
    scoring = "f1",
    cv = cv.split(predictors_df, ids),
    n_jobs = -1,
)
print("*** Mean F1: %.3f" % np.mean(scores))
print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.443
*** Median F1: 0.449


In [9]:
# Ridge (Just Over): one CV run per over-sampling method, in the order listed
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("estimator", Ridge_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.433
*** Median F1: 0.440




*** Mean F1: 0.439
*** Median F1: 0.445




*** Mean F1: 0.437
*** Median F1: 0.449




*** Mean F1: 0.442
*** Median F1: 0.435


In [10]:
# Ridge (Over and Under): each over-sampler followed by random under-sampling
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("under", RandomUnderSampler(random_state = 42)),
        ("estimator", Ridge_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.433
*** Median F1: 0.440




*** Mean F1: 0.440
*** Median F1: 0.445




*** Mean F1: 0.451
*** Median F1: 0.455




*** Mean F1: 0.431
*** Median F1: 0.435


In [5]:
# Ridge (Over and Under from library): combined resamplers from imblearn.combine
for combination_method in (
    SMOTEENN(random_state = 42),
    SMOTETomek(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("combine", combination_method),
        ("estimator", Ridge_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.394
*** Median F1: 0.392




*** Mean F1: 0.426
*** Median F1: 0.440


In [7]:
# Ridge (Varying Under): SVMSMOTE over-sampling paired with each under-sampler
for undersampling_method in (
    RandomUnderSampler(random_state = 42),
    EditedNearestNeighbours(),
    TomekLinks(),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", SVMSMOTE(random_state = 42)),
        ("under", undersampling_method),
        ("estimator", Ridge_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.451
*** Median F1: 0.455




*** Mean F1: 0.410
*** Median F1: 0.421




*** Mean F1: 0.434
*** Median F1: 0.432


# LASSO

In [7]:
# LASSO Control: scaling + L1-penalized model, no resampling step
pipeline = Pipeline([
    ("transformer", StandardScaler()),
    ("estimator", LASSO_model),
])
# Folds are generated by stratifying on `ids`, not on the outcome column
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
scores = cross_val_score(
    pipeline,
    predictors_df,
    outcomes_df["Lv_1_Lo"],
    scoring = "f1",
    cv = cv.split(predictors_df, ids),
    n_jobs = -1,
)
print("*** Mean F1: %.3f" % np.mean(scores))
print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.363
*** Median F1: 0.408


In [8]:
# LASSO (Just Over): one CV run per over-sampling method, in the order listed
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("estimator", LASSO_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.360
*** Median F1: 0.390




*** Mean F1: 0.361
*** Median F1: 0.378




*** Mean F1: 0.354
*** Median F1: 0.364




*** Mean F1: 0.348
*** Median F1: 0.378


In [9]:
# LASSO (Over and Under): each over-sampler followed by random under-sampling
# NOTE(review): the first two results printed by this cell equal those of the
# "Just Over" cell, which suggests the under-sampling step may have no effect
# when the over-sampler already balances the classes -- confirm.
for oversampling_method in (
    SMOTE(random_state = 42),
    BorderlineSMOTE(random_state = 42),
    SVMSMOTE(random_state = 42),
    ADASYN(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", oversampling_method),
        ("under", RandomUnderSampler(random_state = 42)),
        ("estimator", LASSO_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.360
*** Median F1: 0.390




*** Mean F1: 0.361
*** Median F1: 0.378




*** Mean F1: 0.368
*** Median F1: 0.369




*** Mean F1: 0.349
*** Median F1: 0.388


In [4]:
# LASSO (Over and Under from library): combined resamplers from imblearn.combine
for combination_method in (
    SMOTEENN(random_state = 42),
    SMOTETomek(random_state = 42),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("combine", combination_method),
        ("estimator", LASSO_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.401
*** Median F1: 0.417




*** Mean F1: 0.355
*** Median F1: 0.375


In [8]:
# LASSO (Varying Under): SVMSMOTE over-sampling paired with each under-sampler
for undersampling_method in (
    RandomUnderSampler(random_state = 42),
    EditedNearestNeighbours(),
    TomekLinks(),
):
    pipeline = Pipeline([
        ("transformer", StandardScaler()),
        ("over", SVMSMOTE(random_state = 42)),
        ("under", undersampling_method),
        ("estimator", LASSO_model),
    ])
    # Fresh splitter per method so every method sees the same folds (stratified on `ids`)
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
    scores = cross_val_score(
        pipeline,
        predictors_df,
        outcomes_df["Lv_1_Lo"],
        scoring = "f1",
        cv = cv.split(predictors_df, ids),
        n_jobs = -1,
    )
    print("*** Mean F1: %.3f" % np.mean(scores))
    print("*** Median F1: %.3f" % np.median(scores))



*** Mean F1: 0.368
*** Median F1: 0.369




*** Mean F1: 0.365
*** Median F1: 0.369




*** Mean F1: 0.340
*** Median F1: 0.350


In [4]:
def evaluate_models(pred_df, out_df, ids):
    """Print the L1 regularization strength selected on each of 10 outer folds.

    For every outer fold, the training portion is standardized and a
    ``LogisticRegressionCV`` (L1 penalty, saga solver, F1 scoring) picks ``C``
    from the grid ``[0.001, 0.01, 0.1, 1]`` using 5-fold, 2-repeat inner
    cross-validation. Both the outer and the inner splits are stratified on
    ``ids`` rather than on the outcome. The held-out portion of each outer
    fold is not scored by this function.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predictor matrix, one row per sample.
    out_df : pandas.Series
        Outcome labels aligned row-for-row with ``pred_df``.
    ids : pandas.Series
        ID of each row, used as the stratification target.

    Returns
    -------
    None
        The selected ``C_`` and a completion message are printed per fold.
    """
    # Obtain 10 test folds stratifying by participant ID
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    for i, (CV_idx, _) in enumerate(skf.split(pred_df, ids)):
        # Training portion of this outer fold
        ids_CV = ids.iloc[CV_idx].values
        out_CV = out_df.iloc[CV_idx].values

        # Standardize using statistics of the training portion only
        pred_CV = StandardScaler().fit_transform(pred_df.iloc[CV_idx, :].values)

        # Inner CV splits, stratified on ID. They are materialized into a list
        # because a bare generator is single-use: once consumed by the first
        # fit, the estimator would be left holding an exhausted iterator.
        rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 42)
        inner_folds = list(rskf.split(pred_CV, ids_CV))

        # L1-penalized logistic regression with C chosen by inner CV on F1
        lasso_cv = LogisticRegressionCV(
            Cs = [0.001, 0.01, 0.1, 1],
            cv = inner_folds,
            fit_intercept = False,
            class_weight = "balanced",
            penalty = "l1",
            solver = "saga",
            n_jobs = -1,
            max_iter = 20000,
            scoring = "f1",
            refit = True,
            random_state = 42
        )
        lasso_cv.fit(pred_CV, out_CV)

        # Display C value
        print(lasso_cv.C_)

        # Completion Message
        print(f"Test Fold {i + 1} Completed")

# Run the per-fold C selection for the level-1 outcome
evaluate_models(pred_df = predictors_df, out_df = outcomes_df["Lv_1_Lo"], ids = ids)



[1.]
Test Fold 1 Completed
[1.]
Test Fold 2 Completed
[1.]
Test Fold 3 Completed
[1.]
Test Fold 4 Completed
[1.]
Test Fold 5 Completed
[1.]
Test Fold 6 Completed
[1.]
Test Fold 7 Completed
[1.]
Test Fold 8 Completed
[1.]
Test Fold 9 Completed
[1.]
Test Fold 10 Completed
