# Initial Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from collections import defaultdict
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
import scipy.stats as stats

# Preprocessing

In [2]:
# Calculate moving average data, impute missing values, and divide data into separate dataframes (ids, ECG_df, EDA_df, EEG_df, EYE_df, fNIRS_df, RSP_df, outcomes_df)
# Import dataset
df = pd.read_csv("./kieranFeatures_1-30_26-Sep-2024.csv")



#
# Compute 3-Moving Average for each ID
#
# For every trial whose previous AND next row belong to the same ID, each
# adjSA score is averaged over the (previous, current, next) rows.
# Rows are addressed by label through df.loc, which relies on the default
# 0..n-1 index created by read_csv; rows of one ID are expected to be
# stored contiguously.
n_rows = df.shape[0]

df["avg_adjSA1"] = np.nan
df["avg_adjSA2"] = np.nan
df["avg_adjSA3"] = np.nan
df["avg_adjSAtotal"] = np.nan

# Source columns and their moving-average targets, kept in matching order so
# that every target is computed from its own source column.
sa_cols = ["adjSA1", "adjSA2", "adjSA3", "adjSAtotal"]
avg_cols = ["avg_adjSA1", "avg_adjSA2", "avg_adjSA3", "avg_adjSAtotal"]

cur_row = 1
while cur_row + 1 < n_rows:
    cur_ID = df.loc[cur_row, "ID"]
    next_ID = df.loc[cur_row + 1, "ID"]

    # Only average when the whole 3-row window belongs to one ID. Checking the
    # previous row explicitly keeps the window from straddling two IDs even
    # when an ID has a single row.
    if df.loc[cur_row - 1, "ID"] == cur_ID == next_ID:
        # Label slices in .loc include both ends, so this selects 3 rows.
        # NumPy's mean propagates NaN, like np.mean over the three raw values.
        window = df.loc[cur_row - 1:cur_row + 1, sa_cols].to_numpy()
        df.loc[cur_row, avg_cols] = window.mean(axis=0)

    cur_row += 1

del sa_cols, avg_cols

# Remove first and last trials of each participant (their averages are still NaN)
# NOTE(review): dropna() without `subset` also discards any row holding a NaN
# in any other column, not only the edge trials -- confirm this is intended.
df.dropna(inplace = True)



#
# Impute Data
#

# Every 0 is treated as a missing value and turned into NaN; each column's
# NaNs are then filled with the mean of that column (mean() skips NaN).
# NOTE(review): the replacement is applied to every column of df, including
# "ID" and the outcome columns -- confirm zeros there really mean "missing".
df = df.replace(0, np.nan)

# `col` is deliberately left defined at module level: the clean-up `del` at
# the end of this cell expects it to exist.
for col in df.columns:
    df[col] = df[col].fillna(df[col].mean())



#
# Divide Dataframe
#
ids = df["ID"]

# One dataframe per column-name prefix: each keeps exactly the columns whose
# name starts with that prefix, in their original order.
ECG_df, EDA_df, EEG_df, EYE_df, fNIRS_df, RSP_df = (
    df.loc[:, df.columns.str.startswith(prefix)]
    for prefix in ("ECG", "EDA", "EEG", "EYE", "fNIRS", "RSP")
)

# The trailing eight columns; the last four of them are the avg_adjSA*
# columns appended during the moving-average step.
outcomes_df = df.iloc[:, -8:]



#
# Free Memory
#
# Drop the working dataframe and the loop temporaries now that the split
# dataframes above hold everything that is needed.
del col, cur_ID, cur_row, df, n_rows, next_ID