# Initial Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectFromModel
from collections import defaultdict

# Pre-Processing

In [None]:
# Load the raw feature table. Infinite values are first mapped to NaN so that
# interpolate() (linear by default) fills all non-valid numbers in one pass.
df = (
    pd.read_csv("./kieranFeatures_1-30_14-Nov-2024_rawSA.csv")
    .replace([np.inf, -np.inf], np.nan)
    .interpolate()
)

# Binary "low" labels: 1 when the score is below the cut-off, 0 otherwise.
df["Lv_1_Lo"] = (df["SA1"] < 5).astype(int)
df["Lv_2_Lo"] = (df["SA2"] < 5).astype(int)
df["Lv_3_Lo"] = (df["SA3"] < 5).astype(int)
df["Tot_Lo"] = (df["SAtotal"] < 15).astype(int)

# Positional column layout used below: column 0 -> ids, column 1 -> trial
# numbers, the last 8 columns -> outcomes, everything in between -> predictors.
# NOTE(review): the 8 outcome columns are presumably 4 raw SA columns from the
# CSV plus the 4 labels added above -- confirm against the CSV header.
ids = df.iloc[:, 0]
trial_nums = df.iloc[:, 1]
predictors_df = df.iloc[:, 2:df.shape[1] - 8]
# Take an explicit copy: the label columns are overwritten below, and writing
# into a slice of df triggers pandas' SettingWithCopyWarning.
outcomes_df = df.iloc[:, df.shape[1] - 8:].copy()

# Shuffle Labels
# Each label column is permuted independently of the others.
# NOTE(review): no seed is set here, so the shuffle differs between runs.
outcomes_df["Lv_1_Lo"] = np.random.permutation(outcomes_df["Lv_1_Lo"])
outcomes_df["Lv_2_Lo"] = np.random.permutation(outcomes_df["Lv_2_Lo"])
outcomes_df["Lv_3_Lo"] = np.random.permutation(outcomes_df["Lv_3_Lo"])
outcomes_df["Tot_Lo"] = np.random.permutation(outcomes_df["Tot_Lo"])

# Free Memory
del df

# Cross Validation

In [None]:
# Result containers. Each maps an outcome label to its own fresh empty list;
# a comprehension is used so that no list object is shared between keys or
# between dictionaries.
f1_scores = {label: [] for label in ("Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo")}

accuracy_scores = {label: [] for label in ("Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo")}

CV_models = {label: [] for label in ("Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo")}

relaxed_LASSO_features = {label: [] for label in ("Lv_1_Lo", "Lv_2_Lo", "Lv_3_Lo", "Tot_Lo")}

# Sum of F1 and accuracy score
def scorer(estimator, predictors, outcomes):
    """Return macro-averaged F1 plus balanced accuracy for a fitted estimator.

    The signature follows the ``(estimator, X, y)`` callable form, which is how
    it is passed as ``scoring`` to the cross-validated models in this file.

    Args:
        estimator: Fitted model exposing ``predict``.
        predictors: Feature matrix handed to ``estimator.predict``.
        outcomes: True labels the predictions are compared against.

    Returns:
        The sum of the macro F1 score and the balanced accuracy score.
    """
    # Predict once and reuse the result for both metrics.
    predicted = estimator.predict(predictors)
    return f1_score(outcomes, predicted, average = "macro") + balanced_accuracy_score(outcomes, predicted)

In [None]:
def evaluate_models(pred_df, out_df, ids, trial_nums):
    """Fit and test four logistic-regression variants over 5 outer folds.

    For each outer fold the predictors are standardized (scaler fitted on the
    training part only) and four models are fitted on the training part:
    an unpenalized model, a cross-validated default-penalty model ("Ridge"),
    a cross-validated L1 model ("LASSO"), and an unpenalized model refitted on
    the features selected from the LASSO model ("Relaxed LASSO"). Each model is
    then scored on the held-out part of the fold.

    Args:
        pred_df: Predictor table; rows are selected positionally with ``.iloc``.
        out_df: Outcome labels aligned row-for-row with ``pred_df``.
        ids: Participant IDs aligned with ``pred_df``; used as the
            stratification variable for both the outer and the inner splits.
        trial_nums: Currently unused; kept so existing callers keep working.

    Returns:
        A 4-tuple ``(f1_scores, accuracy_scores, models,
        relaxed_LASSO_selected_features)``. The first three are
        ``defaultdict(list)`` objects keyed by "No Penalty", "Ridge", "LASSO"
        and "Relaxed LASSO", holding one entry per outer fold (macro F1,
        balanced accuracy, and the fitted estimator respectively). The last is
        a list with one boolean feature mask per outer fold.
    """
    f1_scores = defaultdict(list)
    accuracy_scores = defaultdict(list)
    models = defaultdict(list)
    relaxed_LASSO_selected_features = []

    # Settings shared by all four estimators.
    shared_kwargs = dict(
        fit_intercept = False,
        class_weight = "balanced",
        solver = "saga",
        n_jobs = -1,
        max_iter = 20000,
    )
    # Inverse regularization strengths searched by the cross-validated models.
    C_grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Obtain 5 test folds stratifying by participant ID
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    for CV_idx, test_idx in skf.split(pred_df, ids):
        # Train-Test Split for the Fold
        ids_CV = ids.iloc[CV_idx].values
        pred_CV = pred_df.iloc[CV_idx, :].values
        pred_test = pred_df.iloc[test_idx, :].values
        out_CV = out_df.iloc[CV_idx].values
        out_test = out_df.iloc[test_idx].values

        # Standardize data (statistics come from the training part only)
        scaler = StandardScaler()
        pred_CV = scaler.fit_transform(pred_CV)
        pred_test = scaler.transform(pred_test)

        # Setup Cross Validation Object
        rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 42)

        # Initialize Models to Train
        no_penalty_model = LogisticRegression(penalty = None, **shared_kwargs)

        # Each CV model gets its own split generator, again stratified by
        # participant ID rather than by outcome.
        Ridge_model = LogisticRegressionCV(
            Cs = C_grid,
            cv = rskf.split(pred_CV, ids_CV),
            scoring = scorer,
            refit = True,
            **shared_kwargs
        )

        LASSO_model = LogisticRegressionCV(
            Cs = C_grid,
            cv = rskf.split(pred_CV, ids_CV),
            penalty = "l1",
            scoring = scorer,
            refit = True,
            **shared_kwargs
        )

        relaxed_LASSO_model = LogisticRegression(penalty = None, **shared_kwargs)

        # Fit Models
        no_penalty_model.fit(pred_CV, out_CV)
        Ridge_model.fit(pred_CV, out_CV)
        LASSO_model.fit(pred_CV, out_CV)

        # Obtain and fit "Relaxed LASSO" Model: keep the features selected from
        # the fitted LASSO model, then refit without a penalty on those only.
        selector = SelectFromModel(LASSO_model, threshold = "mean", prefit = True)
        selected_features = selector.get_support()
        pred_CV_selected = pred_CV[:, selected_features]
        pred_test_selected = pred_test[:, selected_features]
        relaxed_LASSO_model.fit(pred_CV_selected, out_CV)

        # Predict once per model and reuse the predictions for both metrics.
        test_predictions = {
            "No Penalty": no_penalty_model.predict(pred_test),
            "Ridge": Ridge_model.predict(pred_test),
            "LASSO": LASSO_model.predict(pred_test),
            "Relaxed LASSO": relaxed_LASSO_model.predict(pred_test_selected),
        }

        # Append F1 and accuracy Scores
        for model_name, predicted in test_predictions.items():
            f1_scores[model_name].append(f1_score(out_test, predicted, average = "macro"))
            accuracy_scores[model_name].append(balanced_accuracy_score(out_test, predicted))

        # Remove generator from models to allow pickling
        Ridge_model.set_params(cv = None)
        LASSO_model.set_params(cv = None)

        # Add Models to Dictionary
        models["No Penalty"].append(no_penalty_model)
        models["Ridge"].append(Ridge_model)
        models["LASSO"].append(LASSO_model)
        models["Relaxed LASSO"].append(relaxed_LASSO_model)

        # Store selected features for relaxed LASSO
        relaxed_LASSO_selected_features.append(selected_features)

    return f1_scores, accuracy_scores, models, relaxed_LASSO_selected_features

In [None]:
# Saving Models and Test Scores
# The four result dictionaries are bundled into one tuple and pickled together.
results_bundle = (f1_scores, accuracy_scores, CV_models, relaxed_LASSO_features)
with open("Pickle Files/raw_SA_investigation_shuffled.pkl", "wb") as f:
    pickle.dump(results_bundle, f)

In [None]:
# Load Test Scores and CV Models from File
# The path must match the file written by the save cell
# ("..._shuffled.pkl"); the previous "....pkl_shuffled" name never exists.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this notebook.
with open("Pickle Files/raw_SA_investigation_shuffled.pkl", "rb") as f:
    f1_scores, accuracy_scores, CV_models, relaxed_LASSO_features = pickle.load(f)

## SA 1

In [11]:
# evaluate_models returns four values (F1 scores, accuracy scores, models,
# selected features); store each under the "Lv_1_Lo" label defined above.
f1_scores["Lv_1_Lo"], accuracy_scores["Lv_1_Lo"], CV_models["Lv_1_Lo"], relaxed_LASSO_features["Lv_1_Lo"] = evaluate_models(predictors_df, outcomes_df["Lv_1_Lo"], ids, trial_nums)

## SA 2

In [5]:
# evaluate_models returns four values (F1 scores, accuracy scores, models,
# selected features); store each under the "Lv_2_Lo" label defined above.
f1_scores["Lv_2_Lo"], accuracy_scores["Lv_2_Lo"], CV_models["Lv_2_Lo"], relaxed_LASSO_features["Lv_2_Lo"] = evaluate_models(predictors_df, outcomes_df["Lv_2_Lo"], ids, trial_nums)

## SA 3

In [6]:
# evaluate_models returns four values (F1 scores, accuracy scores, models,
# selected features); store each under the "Lv_3_Lo" label defined above.
f1_scores["Lv_3_Lo"], accuracy_scores["Lv_3_Lo"], CV_models["Lv_3_Lo"], relaxed_LASSO_features["Lv_3_Lo"] = evaluate_models(predictors_df, outcomes_df["Lv_3_Lo"], ids, trial_nums)

## Total SA

In [7]:
# evaluate_models returns four values (F1 scores, accuracy scores, models,
# selected features); store each under the "Tot_Lo" label defined above.
f1_scores["Tot_Lo"], accuracy_scores["Tot_Lo"], CV_models["Tot_Lo"], relaxed_LASSO_features["Tot_Lo"] = evaluate_models(predictors_df, outcomes_df["Tot_Lo"], ids, trial_nums)