# Initial Imports

In [14]:
import pprint
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Importing the Dataset

In [2]:
# Dataset option: raw SA, single-observation data.
# This cell binds `df`. Every dataset cell in this section assigns the same
# name, so the cells below work on whichever dataset cell was run last.
df = pd.read_csv(filepath_or_buffer = "./../Datasets/kieranFeatures_1-30_14-Nov-2024_rawSA.csv")

In [None]:
# Dataset option: adjusted SA, single-observation data.
# This cell binds `df`. Every dataset cell in this section assigns the same
# name, so the cells below work on whichever dataset cell was run last.
df = pd.read_csv(filepath_or_buffer = "./../Datasets/kieranFeatures_1-31_24-Jan-2025.csv")

In [None]:
# Dataset option: raw SA, average of 3.
# This cell binds `df`. Every dataset cell in this section assigns the same
# name, so the cells below work on whichever dataset cell was run last.
df = pd.read_csv(filepath_or_buffer = "./../Datasets/kieranFeatures_1-31_24-Jan-2025_avgof3_rawSA.csv")

In [None]:
# Dataset option: adjusted SA, average of 3.
# This cell binds `df`. Every dataset cell in this section assigns the same
# name, so the cells below work on whichever dataset cell was run last.
df = pd.read_csv(filepath_or_buffer = "./../Datasets/kieranFeatures_1-31_24-Jan-2025_avgof3.csv")

# Preprocessing

In [3]:
# Impute missing values: treat +/-inf as missing, then fill the gaps in each
# numeric column with that column's mean. `numeric_only` keeps the mean
# well-defined when the frame also holds non-numeric columns (pandas >= 2.0
# raises on those otherwise); such columns are left untouched.
# NOTE(review): the means here and the medians below are computed on the full
# dataset, i.e. before any cross-validation split.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.mean(numeric_only = True))

# Establish binary columns: each label is True where the corresponding SA
# score lies strictly below that score's median.
sa_column_by_level = {
    "Lv_1_Lo": "SA1",
    "Lv_2_Lo": "SA2",
    "Lv_3_Lo": "SA3",
    "Tot_Lo": "SAtotal",
}
levels = list(sa_column_by_level)
df = df.assign(**{
    level: df[sa_column] < np.median(df[sa_column])
    for level, sa_column in sa_column_by_level.items()
})

# Decompose dataset into identifiers, features (X) and binary targets (y).
# X is everything except the identifiers, the SA scores and the labels.
ids = df["ID"]
trial_nums = df["trialNum"]
X = df.drop(columns = ["ID", "trialNum", *sa_column_by_level.values(), *levels])
y = df[levels]

# Row-permuted copy of the targets. Whole rows are shuffled, so the four
# labels of a row stay together, and the index is reset so that row i of
# y_shuffled lines up positionally with row i of X.
# Presumably used as a chance-level control - confirm against the analysis.
y_shuffled = y.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Names used to key the results: dataset variants (the level names are
# defined above as `levels`).
dataset_types = ["Actual", "Shuffled"]

# Training

In [6]:
def get_model_cv_scores(X, y, level):
    """
        Cross-validate an unpenalized logistic regression for one SA level.

        The features are standardized inside the pipeline, so the scaler is
        fitted together with the classifier on each training split.

        Parameters:
            X: Feature table; read through its `.values` attribute.
            y: Table of binary targets; only the column named `level` is used.
            level: Name of the target column in `y` to model.

        Returns:
            The dictionary returned by `cross_validate` for a shuffled 10-fold
            split: per-fold F1, accuracy and ROC-AUC test scores plus the
            fitted pipeline of every fold.
    """

    features = X.values
    target = y[level].values

    # Fixed seeds keep the fold assignment and the fit reproducible.
    folds = KFold(n_splits = 10, shuffle = True, random_state = 42)
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(penalty = None, max_iter = 10000, random_state = 42)),
    ])

    return cross_validate(
        pipeline,
        features,
        target,
        cv = folds,
        n_jobs = -1,
        scoring = ["f1", "accuracy", "roc_auc"],
        return_estimator = True,
    )

In [None]:
# Cross-validate every SA level on the actual labels and on the shuffled
# labels. Results are stored as data[dataset_type][level].
data = {}

# Iterate over `dataset_types` and `levels` from the preprocessing cell rather
# than repeating the literals here, so the two places cannot drift apart.
for dataset_type in dataset_types:
    data[dataset_type] = {}

    for level in levels:
        # "Actual" uses the real labels; any other dataset type uses the shuffled ones.
        targets = y if dataset_type == "Actual" else y_shuffled
        data[dataset_type][level] = get_model_cv_scores(X, targets, level)

## Save and Load Data

In [15]:
# Persist the nested results dictionary (`data`) so the cross-validation does
# not have to be re-run to inspect the results.
with open("./../Pickle Files/Baseline_Results.pkl", "wb") as results_file:
    pickle.dump(data, results_file)

In [None]:
# Restore `data` from the results file written by the save cell.
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this notebook, never a pickle from an untrusted source.
with open("./../Pickle Files/Baseline_Results.pkl", "rb") as results_file:
    data = pickle.load(results_file)