# Initial Imports

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import pickle
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectFromModel
from collections import defaultdict, Counter

# Pre-Processing

In [8]:
# Seed for the label shuffle below, so the permuted-label baseline is identical after every kernel restart
SHUFFLE_SEED = 42

# Load the feature table, turn +/-inf into NaN, then linearly interpolate down each column.
# NOTE(review): interpolate() fills forward by default, so NaNs at the very top of a column
# would survive this step; confirm the data has none.
df = (
    pd.read_csv("./kieranFeatures_1-30_14-Nov-2024_rawSA.csv")
    .replace([np.inf, -np.inf], np.nan)
    .interpolate()
)

# Binary "high" labels: each SA level is high at >= 5, the total at >= 15
df["Lv_1_Hi"] = (df["SA1"] >= 5).astype(int)
df["Lv_2_Hi"] = (df["SA2"] >= 5).astype(int)
df["Lv_3_Hi"] = (df["SA3"] >= 5).astype(int)
df["Tot_Hi"] = (df["SAtotal"] >= 15).astype(int)

# Column layout: 0 = ID, 1 = trial number, last 8 = outcomes (the 4 labels just added plus
# the 4 columns before them, presumably the raw SA scores; verify against the CSV),
# everything in between = predictors.
ids = df.iloc[:, 0]
trial_nums = df.iloc[:, 1]
predictors_df = df.iloc[:, 2:-8]
# Explicit copy: the label columns are overwritten below, and assigning into a slice of df
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
outcomes_df = df.iloc[:, -8:].copy()

# Shuffle Labels
# Each label column is permuted independently with a seeded generator, so the shuffle is reproducible.
label_rng = np.random.default_rng(SHUFFLE_SEED)
for label in ("Lv_1_Hi", "Lv_2_Hi", "Lv_3_Hi", "Tot_Hi"):
    outcomes_df[label] = label_rng.permutation(outcomes_df[label].to_numpy())

# Free Memory
del df

# Cross Validation

In [9]:
# Result containers, one per kind of result, each keyed by outcome label.
# Every dict (and every list inside it) is a distinct object, filled in by the evaluation cells.
test_scores, CV_models, relaxed_LASSO_features = (
    {label: [] for label in ("Lv_1_Hi", "Lv_2_Hi", "Lv_3_Hi", "Tot_Hi")}
    for _ in range(3)
)

In [10]:
def _make_unpenalized_model(random_state):
    """Return an unfitted, unpenalized logistic regression (saga, no intercept, balanced class weights)."""
    return LogisticRegression(
        fit_intercept = False,
        solver = "saga",
        n_jobs = -1,
        max_iter = 20000,
        class_weight = "balanced",
        penalty = None,
        random_state = random_state
    )


def evaluate_models(pred_df, out_df, ids, trial_nums, random_state = 42):
    """Fit and score three logistic-regression variants over 5 outer test folds.

    For every outer fold, three models are fitted on the training portion and
    scored on the held-out portion with binary F1:

    * "No Penalty"    - unpenalized logistic regression on all predictors.
    * "LASSO"         - L1-penalized logistic regression whose C is chosen by an
                        inner repeated 5-fold CV (2 repeats) scored with macro F1.
    * "Relaxed LASSO" - unpenalized logistic regression refitted on the features
                        whose LASSO importance is at least the mean importance.

    Both the outer and the inner splits are stratified by participant ID
    (``ids``), not by the outcome.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predictors, one row per observation.
    out_df : pandas.Series
        Outcome labels, aligned row-for-row with ``pred_df``.
    ids : pandas.Series
        Participant ID of each row; used as the stratification target.
    trial_nums : pandas.Series
        Not used by the function; kept so existing callers keep working.
    random_state : int or None, default 42
        Seed for both splitters and for the saga solver of every model. The
        default reproduces the splits of the earlier hard-coded seed and also
        makes the fitted coefficients repeatable.

    Returns
    -------
    f1_scores : defaultdict(list)
        Model name -> list of test F1 scores, one per outer fold.
    models : defaultdict(list)
        Model name -> list of fitted estimators, one per outer fold.
    relaxed_LASSO_selected_features : list
        One boolean feature mask per outer fold (columns of ``pred_df`` kept
        for the relaxed LASSO model).
    """
    f1_scores = defaultdict(list)
    models = defaultdict(list)
    relaxed_LASSO_selected_features = []

    # Obtain 5 test folds stratifying by participant ID
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = random_state)
    for CV_idx, test_idx in skf.split(pred_df, ids):
        # Train-Test Split for the Fold
        ids_CV = ids.iloc[CV_idx].values
        pred_CV = pred_df.iloc[CV_idx, :].values
        pred_test = pred_df.iloc[test_idx, :].values
        out_CV = out_df.iloc[CV_idx].values
        out_test = out_df.iloc[test_idx].values

        # Setup Cross Validation Object
        rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = random_state)

        # Initialize Models to Train
        no_penalty_model = _make_unpenalized_model(random_state)

        LASSO_model = LogisticRegressionCV(
            Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            # Inner folds are handed over as pre-computed index pairs so they can be
            # stratified by participant ID instead of by the outcome.
            cv = rskf.split(pred_CV, ids_CV),
            fit_intercept = False,
            class_weight = "balanced",
            penalty = "l1",
            solver = "saga",
            n_jobs = -1,
            max_iter = 20000,
            scoring = "f1_macro",
            refit = True,
            random_state = random_state
        )

        relaxed_LASSO_model = _make_unpenalized_model(random_state)

        # Fit Models
        no_penalty_model.fit(pred_CV, out_CV)
        LASSO_model.fit(pred_CV, out_CV)

        # Obtain and fit "Relaxed LASSO" Model: keep the features whose LASSO
        # importance reaches the mean importance, then refit without a penalty.
        selector = SelectFromModel(LASSO_model, threshold = "mean", prefit = True)
        selected_features = selector.get_support()
        pred_CV_selected = pred_CV[:, selected_features]
        pred_test_selected = pred_test[:, selected_features]
        relaxed_LASSO_model.fit(pred_CV_selected, out_CV)

        # Append F1 Scores
        f1_scores["No Penalty"].append(f1_score(out_test, no_penalty_model.predict(pred_test)))
        f1_scores["LASSO"].append(f1_score(out_test, LASSO_model.predict(pred_test)))
        f1_scores["Relaxed LASSO"].append(f1_score(out_test, relaxed_LASSO_model.predict(pred_test_selected)))

        # Remove generator from LASSO_model to allow pickling
        # (only the cv parameter is reset; the fitted coefficients stay in place)
        LASSO_model.set_params(cv = None)

        # Add Models and Scores to Dictionaries
        models["No Penalty"].append(no_penalty_model)
        models["LASSO"].append(LASSO_model)
        models["Relaxed LASSO"].append(relaxed_LASSO_model)

        # Store selected features for relaxed LASSO
        relaxed_LASSO_selected_features.append(selected_features)

    return f1_scores, models, relaxed_LASSO_selected_features

In [12]:
# Saving Models and Test Scores
# This cell sits above the evaluation cells, so on a top-to-bottom run the containers are
# still empty when it executes. Writing then would replace an existing results file with
# empty data, so the write is skipped until at least one outcome has been evaluated.
if any(any(results.values()) for results in (test_scores, CV_models, relaxed_LASSO_features)):
    with open("raw_SA_investigation_shuffled.pkl", "wb") as f:
        pickle.dump((test_scores, CV_models, relaxed_LASSO_features), f)
else:
    print("Nothing to save yet: run the evaluation cells first, then re-run this cell.")

In [None]:
# Load Test Scores and CV Models from File
# Rebinds all three result containers to the contents of the file written by the save cell,
# replacing whatever is currently held under those names.
with open("raw_SA_investigation_shuffled.pkl", "rb") as saved_results:
    (
        test_scores,
        CV_models,
        relaxed_LASSO_features,
    ) = pickle.load(saved_results)

## SA 1

In [11]:
# Evaluate all models on the level-1 labels and file each result under "Lv_1_Hi"
(
    test_scores["Lv_1_Hi"],
    CV_models["Lv_1_Hi"],
    relaxed_LASSO_features["Lv_1_Hi"],
) = evaluate_models(
    predictors_df,
    outcomes_df["Lv_1_Hi"],
    ids,
    trial_nums,
)