# Predictive Algorithms

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Make the ``src`` folder under the current working directory importable.
# The membership check keeps re-runs of this cell from appending duplicates.
src_path = os.path.abspath(os.path.join(os.getcwd(), "src"))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)


## Load and Preprocess Data

This step loads the data, handles missing values, and encodes categorical variables so that the dataset is ready for modeling.

In [None]:
from src.data.data_processing import DataSplitter, DataPreprocessor
from src.data.lookups import LookupManager

# Settings handed to the preprocessor: a client config file path and the
# database name (presumably MySQL, given the ``.my.cnf`` file -- confirm).
db_config_path = "/home/jupyter/.my.cnf"
db_name = "actin_personalization"

# Pull every column and row of the knownPalliativeTreatments table.
query = "SELECT * FROM knownPalliativeTreatments"

preprocessor = DataPreprocessor(db_config_path, db_name)

# The feature list is taken from the project's lookup manager.
lookup_manager = LookupManager()
features = lookup_manager.features

In [None]:
# Preprocess for the progression-free-survival (PFS) endpoint: duration is
# observedPfsDays, event indicator is hadProgressionEvent.
pfs_df, pfs_features, pfs_encoded_columns = preprocessor.preprocess_data(
    query,
    duration_col="observedPfsDays",
    event_col="hadProgressionEvent",
    features=features,
)

In [None]:
# Preprocess for the overall-survival (OS) endpoint: duration is
# observedOsFromTreatmentStartDays, event indicator is hadSurvivalEvent.
os_df, os_features, os_encoded_columns = preprocessor.preprocess_data(
    query,
    duration_col="observedOsFromTreatmentStartDays",
    event_col="hadSurvivalEvent",
    features=features,
)

## Models

In [None]:
from src.models.survival_models import (
    BaseSurvivalModel,
    CoxPHModel,
    AalenAdditiveModel,
    RandomSurvivalForestModel,
    GradientBoostingSurvivalModel,
    DeepSurv,
    LogisticHazardModel,
    DeepHitModel,
    PCHazardModel,
    MTLRModel
)

from src.models.model_trainer import *

from sksurv.util import Surv

# Splitter used for the train/test partition (test_size=0.1 with a fixed
# random_state; presumably a reproducible 10% hold-out -- confirm in
# DataSplitter).
splitter = DataSplitter(test_size=0.1, random_state=42)

# --- PFS endpoint (active) -------------------------------------------------
event_col = 'hadProgressionEvent'
duration_col = 'observedPfsDays'

# Build the structured survival target from the PFS frame, then split the
# feature matrix and target together.
y_pfs = Surv.from_dataframe(event=event_col, time=duration_col, data=pfs_df)
X_train, X_test, y_train, y_test = splitter.split(
    pfs_df[pfs_features],
    y_pfs,
    'systemicTreatmentPlan',
    pfs_encoded_columns,
)

# --- OS endpoint (inactive) ------------------------------------------------
# Uncomment the lines below (and comment out the PFS section above) to model
# overall survival instead.
# y_os = Surv.from_dataframe(event='hadSurvivalEvent', time='observedOsFromTreatmentStartDays', data=os_df)
# X_train, X_test, y_train, y_test = splitter.split(os_df[os_features], y_os, 'systemicTreatmentPlan', os_encoded_columns)
# event_col = 'hadSurvivalEvent'
# duration_col = 'observedOsFromTreatmentStartDays'

# Width of the training feature matrix; the first five models below are
# constructed with it as ``input_size``.
n_features = X_train.shape[1]

# Registry of candidate survival models, keyed by the name used in the
# results report.
models = {
    'DeepSurv': DeepSurv(input_size=n_features),
    'LogisticHazardModel': LogisticHazardModel(input_size=n_features),
    'DeepHitModel': DeepHitModel(input_size=n_features),
    'PCHazardModel': PCHazardModel(input_size=n_features),
    'MTLRModel': MTLRModel(input_size=n_features),
    'AalenAdditive': AalenAdditiveModel(),
    'CoxPH': CoxPHModel(),
    'RandomSurvivalForest': RandomSurvivalForestModel(),
    'GradientBoosting': GradientBoostingSurvivalModel(),
}

# Train every registered model and collect its evaluation metrics.
# n_splits=5 and random_state=42 are forwarded to ModelTrainer as-is
# (presumably 5-fold cross-validation with a fixed seed -- confirm there).
trainer = ModelTrainer(models=models, n_splits=5, random_state=42)

results = trainer.train_and_evaluate(
    X_train,
    y_train,
    X_test,
    y_test,
    treatment_col='systemicTreatmentPlan',
    # Must be the encoded-column list of the endpoint that produced X_train.
    # The active split is built from the PFS frame, so the PFS list is passed
    # (this previously passed os_encoded_columns by mistake). Switch to
    # os_encoded_columns only when the OS variant of the split is enabled.
    encoded_columns=pfs_encoded_columns,
    event_col=event_col,
    duration_col=duration_col,
)

# Print a per-model summary: one header line per model, then each of its
# metrics on an indented line formatted to four decimal places.
# NOTE(review): the ``:.4f`` format assumes every metric value is numeric;
# a None or string metric would raise here -- confirm against ModelTrainer.
print("\nFinal Results:") 
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")