# Initial Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from collections import defaultdict
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
import scipy.stats as stats

# Preprocessing

In [11]:
# Load the feature table from CSV and remember how many rows it holds
df = pd.read_csv("./kieranFeatures_1-30_26-Sep-2024.csv")
n_rows = len(df)


# Compute a centred 3-point moving average of each adjSA score, per participant.
#
# Rows are grouped into contiguous runs that share the same "ID"; the average
# for a row is the mean of the previous, current and next row *within that
# run*. The first and last row of every run have no complete window, so their
# averages stay NaN (runs shorter than 3 rows get no averages at all).
sa_columns = ["adjSA1", "adjSA2", "adjSA3", "adjSAtotal"]

# A new run starts whenever the ID differs from the row above it; the running
# count of such starts labels every row with its run number.
run_id = (df["ID"] != df["ID"].shift()).cumsum()

for sa_col in sa_columns:
    # Shifting inside each run keeps the window from reaching into a
    # neighbouring participant's rows. Every term of the window comes from
    # the same score column.
    scores_by_run = df[sa_col].groupby(run_id)
    previous_score = scores_by_run.shift(1)
    next_score = scores_by_run.shift(-1)
    df["avg_" + sa_col] = (previous_score + df[sa_col] + next_score) / 3

# Drop every row that still contains a NaN in any column. This removes the
# run-edge rows that have no moving average (and any row that was already
# missing a value elsewhere).
df.dropna(inplace = True)

# Zeros encode missing measurements: turn them into NaN so they can be imputed.
# NOTE(review): the replacement is applied to every column of df, so any
# identifier or label column containing 0 is affected too -- confirm intended.
df = df.replace(to_replace = 0, value = np.nan)

# Replace each remaining NaN with the mean of its own column
for col in df.columns:
    column_values = df[col]
    df[col] = column_values.fillna(value = column_values.mean())