In [None]:
import pandas as pd
import numpy as np

# Read the raw stroke dataset from disk
file_path = "../data/stroke_data.csv"
df = pd.read_csv(file_path)

# Normalise the header: trim surrounding whitespace, then upper-case each
# name so the column lookups below do not depend on the file's casing.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.upper()

# Columns kept for the analysis (outcome, the two treatment allocations,
# and the covariates), in the order they appear in the output.
selected_columns = [
    # Outcome
    "TD",       # Time to death or discharge (continuous)
    # Treatment allocations
    "RXASP",    # Trial aspirin allocation (Y/N)
    "RXHEP",    # Trial heparin allocation (M/L/N)
    # Covariates
    "AGE",      # Age in years
    "SEX",      # Gender
    "RDELAY",   # Delay to randomization
    "RCONSC",   # Conscious state
    "RSBP",     # Systolic blood pressure
    "RATRIAL",  # Atrial fibrillation (Y/N)
    "RDEF4",    # Dysphasia (Y/N/C)
    "STYPE",    # Stroke subtype
]

# Take an independent copy of just those columns and keep only complete
# rows (any missing value in a selected column removes the row).
df_selected = df.loc[:, selected_columns].copy().dropna(axis=0, how="any")

# Cast every non-numeric variable to the pandas category dtype in one pass.
categorical_columns = ["SEX", "RCONSC", "RATRIAL", "RDEF4", "STYPE", "RXASP", "RXHEP"]
df_selected = df_selected.astype({name: "category" for name in categorical_columns})

# Collapse the two allocation columns into a single four-level treatment code
def categorize_treatment(row):
    """Map one row's aspirin/heparin allocation to an integer treatment code.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing "RXASP"
            and "RXHEP".

    Returns:
        1 if RXASP is "Y" (aspirin), otherwise 2 if RXHEP is "M"
        (heparin medium), 3 if RXHEP is "L" (heparin low), and 0 for
        everything else (no treatment).

    The aspirin check comes first, so a row with RXASP == "Y" yields 1
    whatever its RXHEP value is; RXHEP is not even read in that case.

    NOTE(review): the original inline comments gave allocation shares of
    25% (aspirin), 12.5% (heparin medium), 12.5% (heparin low) and 50%
    (no treatment) -- not verifiable from this file; confirm against the
    trial design. Any RXHEP code other than "M" or "L" falls into group 0;
    confirm the data contains no other heparin codes.
    """
    if row["RXASP"] == "Y":
        return 1  # Aspirin

    heparin_arm = row["RXHEP"]
    if heparin_arm == "M":
        return 2  # Heparin Medium
    # Heparin Low, or no treatment for any remaining code
    return 3 if heparin_arm == "L" else 0

# Compute the treatment code row by row and store it as a new column
treatment_codes = df_selected.apply(categorize_treatment, axis=1)
df_selected["TREATMENT"] = treatment_codes

# Expand TREATMENT into one indicator column per observed code (no level is
# dropped), then discard the raw allocation columns it was derived from.
df_selected = pd.get_dummies(df_selected, columns=["TREATMENT"]).drop(
    columns=["RXASP", "RXHEP"]
)

# Write the preprocessed table to a new CSV file, without the index column
preprocessed_file_path = "../data/stroke_data_preprocessed.csv"
df_selected.to_csv(preprocessed_file_path, index=False)

print(f"Preprocessed dataset saved at: {preprocessed_file_path}")