# Preprocessing.ipynb
Original code by Zach Sletton and Allen Chezick - preprocessing and feature engineering.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

#path definition
# Absolute locations of the input CSV and of the pickle written at the end
# of the script.
csv_data = "/voc/work/GitHub Submission Milestone II/mds_ed.csv"
output_path = "/voc/work/GitHub Submission Milestone II/Outputs/final_pkl/df_best.pkl"

#Zach
#load csv and trim data
# Parse the CSV once. `df_full` keeps every column; `df` keeps only the
# columns whose name contains none of the substrings in `remove_vals_`
# (substring match, so e.g. 'subject_id' also removes 'general_subject_id').
df_full = pd.read_csv(csv_data, low_memory=False)
remove_vals_ = ['diagnoses_', 'general_file_name', 'Unnamed: 0', 'subject_id', 'stay_id', 'row_id']
reduced_cols = [col for col in df_full.columns if not any(val in col for val in remove_vals_)]
# Select from the frame already in memory instead of re-reading the file
# with usecols=reduced_cols: same columns, same order, but the CSV is parsed
# only once. `.copy()` keeps `df` independent of `df_full`.
df = df_full[reduced_cols].copy()

#Allen
#round specific numeric columns
# Round the listed measurements to two decimal places; names that are not
# columns of df are skipped.
cols_to_round = [
    'vitals_temperature_first', 'vitals_temperature_last',
    'biometrics_weight', 'general_mortality_hours', 'general_mortality_days'
]
present_round_cols = [name for name in cols_to_round if name in df.columns]
for c in present_round_cols:
    df[c] = df[c].round(2)

#remove extraneous lab value
# Of the labvalues_* columns, only those ending in one of these suffixes
# are kept; every other labvalues_* column is dropped.
kept_lab_suffixes = ("_change", "_first")
drop_cols = [
    c for c in df.columns
    if c.startswith("labvalues_") and not c.endswith(kept_lab_suffixes)
]
df = df.drop(columns=drop_cols)

#missing data calculation
# -999 is treated as a missing-value marker and converted to NaN first so
# that it counts as missing in the thresholds below.
df = df.replace(-999, np.nan)

# These two columns are read unconditionally later in the script to derive
# the admission and mortality targets from their null / non-null pattern,
# so they must survive the column filter even when they are mostly empty.
protected_cols = ["general_ed_hadm_id", "general_mortality_days"]

# Same rule as dropna(axis=1, thresh=len(df)*0.1): a column is dropped when
# its non-null count is below 10% of the rows -- unless it is protected.
non_null_counts = df.count()
min_non_null = len(df) * 0.1
is_sparse = (non_null_counts < min_non_null).to_numpy()
is_protected = non_null_counts.index.isin(protected_cols)
sparse_cols = non_null_counts.index[is_sparse & ~is_protected]
df = df.drop(columns=sparse_cols)               # keep cols with ≥10% data
df = df.dropna(axis=0, thresh=df.shape[1]*0.40) # keep rows with ≥40% data

#Zach
#flagging for admission
# admit = 1 where general_ed_hadm_id is non-null, 0 otherwise. The id column
# is required here, so check for it before it is accessed and fail with a
# message that names the problem (still a KeyError, as before).
if "general_ed_hadm_id" not in df.columns:
    raise KeyError(
        "general_ed_hadm_id is required to derive 'admit' but is not a column of df"
    )
# pop() returns the column and removes it from df in one step, so the raw id
# does not remain in the frame once the flag has been derived.
df["admit"] = df.pop("general_ed_hadm_id").notna().astype(int)

#mortality and deterioration target creation
# Force the days-to-death column to numeric; values that cannot be parsed
# become NaN (errors="coerce").
df["general_mortality_days"] = pd.to_numeric(df["general_mortality_days"], errors="coerce")
days_to_death = df["general_mortality_days"]
has_death_record = days_to_death.notna()

# Binary targets: a row is positive only when a days value is present and
# falls in the stated window. The 28d and 365d flags are cumulative.
df["mortality_any"] = has_death_record.astype(int)
df["mortality_28d"] = (has_death_record & days_to_death.le(28)).astype(int)
df["mortality_365d"] = (has_death_record & days_to_death.le(365)).astype(int)
df["mortality_gt365d"] = (has_death_record & days_to_death.gt(365)).astype(int)

# Categorical target: right-closed bins (-inf, 28], (28, 365], (365, inf);
# rows without a days value fall in no bin and are labelled "Alive".
bins = [-np.inf, 28, 365, np.inf]
labels = ["Short-term", "Medium-term", "Long-term"]
mortality_bucket = pd.cut(days_to_death, bins=bins, labels=labels)
df["mortality_category"] = mortality_bucket.cat.add_categories(["Alive"]).fillna("Alive")

# Source columns: any column whose lower-cased name mentions "deterioration"
# but not "mortality".
deterioration_cols = [
    name
    for name, lowered in zip(df.columns, map(str.lower, df.columns))
    if "deterioration" in lowered and "mortality" not in lowered
]
# Row-wise OR over the source columns, stored as 0/1.
any_deterioration = df[deterioration_cols].any(axis=1)
df["clinical_deterioration_any"] = any_deterioration.astype(int)
# Composite outcome: positive when either the death flag or the
# deterioration flag is set.
composite_inputs = df[["mortality_any", "clinical_deterioration_any"]]
df["death_or_deterioration_any"] = composite_inputs.any(axis=1).astype(int)

#time series features
# When general_90min is present, parse it as a datetime (unparseable values
# become NaT) and derive the calendar month and the hour of day from it.
# NOTE(review): tod_time is cast to nullable "Int64" while month_time keeps
# whatever dtype .dt.month returns -- confirm the difference is intended.
if "general_90min" in df.columns:
    s = pd.to_datetime(df["general_90min"], errors="coerce")
    dt_fields = s.dt
    df["month_time"] = dt_fields.month
    hour_of_day = dt_fields.hour
    df["tod_time"] = hour_of_day.astype("Int64")

#removing final columns
# Identifier, timestamp and bookkeeping columns to strip before saving.
# Names that are not columns of df are ignored without any message.
# NOTE(review): because missing names are ignored, a misspelled entry is
# never reported -- 'AAbificant_Digits' looks garbled; confirm the intended
# column name.
# NOTE(review): general_mortality_days / general_mortality_hours are not in
# this list, so if present they stay in the saved frame next to the
# mortality targets derived from them -- confirm downstream code excludes
# them from model features.
to_remove = [
    'general_dod','general_ecg_no_within_stay','general_strat_fold',
    'general_intime','general_outtime','general_ecg_time','AAbificant_Digits',
    'general_icu_time_hours','target','Unnamed: 0','general_study_id',
    'general_subject_id','general_ed_stay_id','general_ed_hadm_id','general_data'
]

df = df.drop(columns=to_remove, errors="ignore")

# to_pickle does not create missing directories; make sure the destination
# folder exists so the run cannot fail on its very last step.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(output_path)
print(f"done, check {output_path}")