# Predictive Algorithms for Survival Analysis: Predicting Survival Days

This notebook demonstrates a full pipeline for training and evaluating predictive models for survival analysis. We handle both overall survival (OS) and progression‑free survival (PFS) by reusing the same functions. The pipeline loads and preprocesses data, visualizes the target distributions (before and after log transformation), displays correlation heatmaps and residual plots, evaluates a set of regression models, and performs hyperparameter optimization (using random search) for selected models.


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Switch the working directory to the prediction scripts folder.
# NOTE(review): presumably required so the `src.*` imports in the next cell
# resolve — confirm against the repository layout.
os.chdir('/data/repos/actin-personalization/scripts/personalization/prediction')

In [None]:
from src.data.data_processing import DataSplitter, DataPreprocessor
from src.data.lookups import LookupManager

In [None]:
# Database settings and the SQL query handed to the preprocessor.
db_config_path = '/home/jupyter/.my.cnf'
db_name = 'actin_personalization'
query = "SELECT * FROM knownPalliativeTreatments"

# Module-level preprocessor; get_data() reads this global directly.
preprocessor = DataPreprocessor(db_config_path, db_name)

# Feature list supplied by the lookup manager; passed to get_data() as `features`.
lookup_manager = LookupManager()
features = lookup_manager.features

## Data Preprocessing

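We define a function to load and prepare the data. It loads and preprocesses the data, keeps only the rows in which the event of interest was observed, and splits the features and the survival-days target into training and test sets. The log transformation of the target is not applied here; it is applied later, when the models are fitted and evaluated (`np.log1p`).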

TODO: explain why censored observations are dropped.


In [None]:
def get_data(query, event_col, duration_col, features, group_treatments):
    """Load, filter and split the data for one survival endpoint.

    The module-level ``preprocessor`` produces the prepared frame, the
    feature list and the encoded column names. Only rows whose
    ``event_col`` equals 1 are kept, and those rows are then split with a
    ``DataSplitter(test_size=0.1, random_state=42)``. The duration column
    is returned as-is (no log transform is applied here).

    Parameters
    ----------
    query : str
        SQL query forwarded to ``preprocessor.preprocess_data``.
    event_col : str
        Name of the event indicator column used for filtering.
    duration_col : str
        Name of the duration column used as the target.
    features : list
        Candidate features forwarded to the preprocessor; the list the
        preprocessor returns is the one used for the split.
    group_treatments : bool
        Forwarded unchanged to the preprocessor.

    Returns
    -------
    tuple
        ``(df, X_train, X_test, y_train, y_test, encoded_columns)`` where
        ``df`` is the filtered frame.
    """
    data_splitter = DataSplitter(test_size=0.1, random_state=42)

    prepared, feature_cols, encoded_columns = preprocessor.preprocess_data(
        query, duration_col, event_col, features, group_treatments
    )

    # Keep only rows in which the event was actually observed.
    event_mask = prepared[event_col] == 1
    observed = prepared[event_mask]

    inputs = observed[feature_cols]
    target = observed[duration_col]
    X_train, X_test, y_train, y_test = data_splitter.split(
        inputs, target, encoded_columns=encoded_columns
    )

    return observed, X_train, X_test, y_train, y_test, encoded_columns

In [None]:
# Overall-survival (OS) dataset: event flag `hadSurvivalEvent`, duration in
# `observedOsFromTreatmentStartDays`.
os_df, os_X_train, os_X_test, os_y_train, os_y_test, os_encoded_columns = get_data(
    query, 'hadSurvivalEvent', 'observedOsFromTreatmentStartDays', features, group_treatments=True
)

In [None]:
# Progression-free-survival (PFS) dataset: event flag `hadProgressionEvent`,
# duration in `observedPfsDays`.
pfs_df, pfs_X_train, pfs_X_test, pfs_y_train, pfs_y_test, pfs_encoded_columns = get_data(
    query, 'hadProgressionEvent', 'observedPfsDays', features, group_treatments=True
)

### Visualization Functions

The functions below visualize the distribution of a survival target (OS or PFS) before and after log transformation. This helps us understand the skewness of the data and the effect of the transformation. Further functions generate a correlation heatmap, plot residuals, and plot predicted versus actual values.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def visualize_target_distribution(df, target_col):
    """Plot the target's histogram on the raw and on the log1p scale.

    Two figures are shown one after the other: the distribution of
    ``df[target_col]`` and the distribution of ``log1p(df[target_col])``
    (the latter drawn in green).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col``.
    target_col : str
        Name of the column to plot.

    Returns
    -------
    The ``np.log1p``-transformed target column.
    """
    def _histogram(values, title, xlabel, **hist_kwargs):
        # Both figures share size, bin count, KDE overlay and y-label.
        plt.figure(figsize=(10, 6))
        sns.histplot(values, kde=True, bins=30, **hist_kwargs)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("Frequency")
        plt.show()

    _histogram(
        df[target_col],
        f"Distribution of {target_col} (Days)",
        f"{target_col} (Days)",
    )

    log_target = np.log1p(df[target_col])
    _histogram(
        log_target,
        f"Distribution of Log-Transformed {target_col}",
        f"Log({target_col} + 1)",
        color='green',
    )
    return log_target

def plot_correlation_heatmap(df, target_col):
    """Show a correlation heatmap of the numeric columns and the target.

    Columns of dtype ``float64``/``int64`` are used, excluding the two
    event indicator columns; the target column is placed last in the
    correlation matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and ``target_col``.
    target_col : str
        Name of the target column.
    """
    # The event flags are left out; the target is re-appended at the end.
    excluded = {target_col, 'hadSurvivalEvent', 'hadProgressionEvent'}
    candidate_cols = df.select_dtypes(include=['float64', 'int64']).columns
    predictors = [name for name in candidate_cols if name not in excluded]

    correlations = df[predictors + [target_col]].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlations, annot=False, fmt=".2f", cmap="coolwarm", square=True)
    plt.title(f"Correlation Heatmap with {target_col}")
    plt.show()

def plot_residuals(y_true, y_pred):
    """Show the residual histogram and a residuals-vs-predicted scatter.

    Residuals are computed as ``y_true - y_pred``. The plot titles label
    the values as being on the log scale, so callers are expected to pass
    log-scale inputs.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.
    """
    def _label_and_show(title, xlabel, ylabel):
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.show()

    errors = y_true - y_pred

    # Figure 1: distribution of the residuals.
    plt.figure(figsize=(8, 6))
    sns.histplot(errors, kde=True, bins=30)
    _label_and_show("Residuals Distribution (Log Scale)", "Residuals", "Frequency")

    # Figure 2: residuals against predictions, with a zero reference line.
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_pred, y=errors)
    plt.axhline(0, color='red', linestyle='--')
    _label_and_show("Residuals vs Predicted (Log Scale)", "Predicted Log Value", "Residuals")

def plot_predictions(y_test, y_pred, title):
    """Scatter predicted against actual values with an identity line.

    Both inputs are passed through ``np.expm1`` before plotting, and the
    axes are labelled in days.

    Parameters
    ----------
    y_test : array-like
        Actual values; ``np.expm1`` is applied to them.
    y_pred : array-like
        Predicted values; ``np.expm1`` is applied to them.
    title : str
        Figure title.
    """
    actual_days = np.expm1(y_test)
    predicted_days = np.expm1(y_pred)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=actual_days, y=predicted_days)

    # Red diagonal spanning the range of the actual values.
    diagonal = [actual_days.min(), actual_days.max()]
    plt.plot(diagonal, diagonal, color='red', lw=2)

    plt.xlabel("Actual (Days)")
    plt.ylabel("Predicted (Days)")
    plt.title(title)
    plt.show()


## Model Evaluation and Optimization Functions

The `evaluate_models` function trains a set of models on the log-transformed target and reports performance on both the log scale and the original scale. The `optimize_model_random_search` function performs randomized hyperparameter tuning.


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
import pandas as pd

def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit each model on the log1p target and score it on the test set.

    Every estimator in ``models`` is fitted in place on
    ``np.log1p(y_train)``. Predictions are scored on the log scale and,
    after ``np.expm1``, on the original scale.

    Parameters
    ----------
    models : dict
        Mapping of model name to an estimator exposing ``fit``/``predict``.
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test targets on the original scale.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``MSE (log)``,
        ``R² (log)``, ``MSE (original)`` and ``R² (original)``. The
        columns are present even when ``models`` is empty.
    """
    result_columns = ["Model", "MSE (log)", "R² (log)", "MSE (original)", "R² (original)"]

    # The transformed targets do not depend on the model, so compute them
    # once instead of once per loop iteration.
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)

    results = []
    for model_name, model in models.items():
        model.fit(X_train, y_train_log)
        y_pred_log = model.predict(X_test)
        # Back-transform so the error can also be reported on the original scale.
        y_pred_orig = np.expm1(y_pred_log)
        results.append({
            "Model": model_name,
            "MSE (log)": mean_squared_error(y_test_log, y_pred_log),
            "R² (log)": r2_score(y_test_log, y_pred_log),
            "MSE (original)": mean_squared_error(y_test, y_pred_orig),
            "R² (original)": r2_score(y_test, y_pred_orig),
        })
    return pd.DataFrame(results, columns=result_columns)


In [None]:
def hyperparameter_search(model, param_dist, X_train, y_train, cv_folds=5, n_iter=20):
    """Run a randomized hyperparameter search for one estimator.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    param_dist : dict
        Parameter names mapped to the candidate values to sample from.
    X_train, y_train : array-like
        Data the search is fitted on; ``y_train`` is used exactly as given.
    cv_folds : int, default 5
        Passed as ``cv``.
    n_iter : int, default 20
        Number of sampled parameter settings.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` of the fitted
        search; the score uses ``neg_mean_squared_error``.
    """
    search_settings = {
        'n_iter': n_iter,
        'cv': cv_folds,
        'scoring': 'neg_mean_squared_error',
        'verbose': 1,
        'n_jobs': -1,
        'random_state': 42,
    }
    search = RandomizedSearchCV(model, param_dist, **search_settings)
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    winner_params = search.best_params_
    winner_score = search.best_score_
    return winner, winner_params, winner_score

def optimize_model_random_search(X_train, y_train):
    """Tune RandomForest, GradientBoosting and MLP regressors by random search.

    Each estimator is tuned with ``hyperparameter_search`` on
    ``np.log1p(y_train)``; the best parameters and the best score are
    printed per model.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training target on the original scale; it is log1p-transformed here.

    Returns
    -------
    dict
        Model name mapped to ``(best_estimator, best_params, best_score)``,
        so the tuned estimators can be reused instead of being discarded.
    """
    # `None` means "consider all features". It stands in for the former
    # 'auto' option, which meant the same thing for these regressors but is
    # rejected by newer scikit-learn releases.
    tree_max_features = [None, 'sqrt', 'log2']

    search_spaces = [
        (
            "RandomForest",
            RandomForestRegressor(random_state=42),
            {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': tree_max_features,
            },
        ),
        (
            "GradientBoosting",
            GradientBoostingRegressor(random_state=42),
            {
                'n_estimators': [100, 200, 300, 500],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'max_depth': [3, 4, 5, 6],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': tree_max_features,
            },
        ),
        (
            "MLPRegressor",
            MLPRegressor(max_iter=1000, random_state=42),
            {
                'hidden_layer_sizes': [(64, 32), (128, 64, 32), (32,), (128, 64, 64, 32), (32, 32)],
                'activation': ['relu', 'tanh'],
                'solver': ['adam', 'lbfgs'],
                'alpha': [1e-4, 1e-3, 1e-2],
                'learning_rate_init': [0.001, 0.01, 0.05],
            },
        ),
    ]

    # The transformed target is shared by all three searches.
    y_train_log = np.log1p(y_train)

    tuned = {}
    for name, estimator, param_dist in search_spaces:
        best_estimator, best_params, best_score = hyperparameter_search(
            estimator, param_dist, X_train, y_train_log
        )
        print(f"{name} best parameters:", best_params)
        print(f"{name} best score (neg MSE):", best_score)
        tuned[name] = (best_estimator, best_params, best_score)
    return tuned

## KNN and Best-K Determination

The function `determine_best_k_nn` runs cross‑validation over a range of K values for a K‑Nearest Neighbors regressor (using log‑transformed target values) and plots the cross‑validated negative MSE. The best K (the one with the highest score, i.e. the lowest MSE) is returned together with the mean score for every K.


In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def determine_best_k_nn(X_train, y_train, k_range):
    """Pick the KNN neighbour count with the best cross-validated score.

    For every ``k`` a ``KNeighborsRegressor`` is scored with 5-fold
    cross-validation on ``np.log1p(y_train)`` using negative MSE; the mean
    scores are plotted against ``k``.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training target on the original scale; it is log1p-transformed here.
    k_range : iterable of int
        Candidate neighbour counts.

    Returns
    -------
    tuple
        ``(best_k, scores)`` where ``scores`` maps each ``k`` to its mean
        negative MSE and ``best_k`` is the key with the highest value.

    Raises
    ------
    ValueError
        If ``k_range`` is empty.
    """
    k_values = list(k_range)
    if not k_values:
        # Fail early with a clear message rather than inside max().
        raise ValueError("k_range must contain at least one value of k")

    # The transformed target is the same for every k, so compute it once.
    y_train_log = np.log1p(y_train)

    scores = {}
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k)
        cv_scores = cross_val_score(knn, X_train, y_train_log, cv=5, scoring='neg_mean_squared_error')
        scores[k] = np.mean(cv_scores)

    # Negative MSE: the largest value corresponds to the smallest error.
    best_k = max(scores, key=scores.get)

    plt.figure(figsize=(8, 6))
    plt.plot(list(scores.keys()), list(scores.values()), marker='o')
    plt.title("Cross-Validated Negative MSE for Different k")
    plt.xlabel("k")
    plt.ylabel("CV Negative MSE")
    plt.show()
    return best_k, scores

In [None]:
# Cross-validate k = 1..20 on the OS training split.
best_knn_k, knn_scores = determine_best_k_nn(os_X_train, os_y_train, range(1, 21))

In [None]:
# Cross-validate k = 1..20 on the PFS training split.
# NOTE(review): this rebinds `best_knn_k` / `knn_scores`, replacing whatever an
# earlier cell stored under the same names.
best_knn_k, knn_scores = determine_best_k_nn(pfs_X_train, pfs_y_train, range(1, 21))

## Full Pipeline Function

The `run_pipeline` function runs the entire workflow for a given survival type ("OS" or "PFS"). It loads data, performs visualizations, evaluates models, plots predictions and residuals for the best model, and, when called with `optimize=True`, executes hyperparameter tuning.


In [None]:
def run_pipeline(survival_type, optimize=False):
    """Run the full modelling workflow for one survival endpoint.

    Loads the data, shows the target distribution and correlation heatmap,
    evaluates a fixed set of regressors, then refits the model with the
    highest log-scale R² and plots its predictions and residuals.

    Parameters
    ----------
    survival_type : str
        ``'OS'`` or ``'PFS'`` (case-insensitive).
    optimize : bool, default False
        When true, also run ``optimize_model_random_search`` on the
        training data.

    Raises
    ------
    ValueError
        If ``survival_type`` is neither ``'OS'`` nor ``'PFS'``.
    """
    # Per-endpoint settings: (event column, duration column, plot label, KNN k).
    endpoint_settings = {
        'os': ('hadSurvivalEvent', 'observedOsFromTreatmentStartDays', "Observed OS", 6),
        'pfs': ('hadProgressionEvent', 'observedPfsDays', "Observed PFS", 7),
    }
    endpoint_key = survival_type.lower()
    if endpoint_key not in endpoint_settings:
        raise ValueError("survival_type must be 'OS' or 'PFS'")
    event_col, duration_col, label, best_k = endpoint_settings[endpoint_key]

    df, X_train, X_test, y_train, y_test, _ = get_data(
        query, event_col, duration_col, features, group_treatments=True
    )

    visualize_target_distribution(df, duration_col)
    plot_correlation_heatmap(df, duration_col)

    models_dict = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(random_state=42),
        "Lasso": Lasso(random_state=42),
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42),
        "MLPRegressor": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
        "SVR_RBF": SVR(kernel='rbf', C=1.0),
        "XGBRegressor": XGBRegressor(n_estimators=100, random_state=42),
        "KNN": KNeighborsRegressor(n_neighbors=best_k),
    }

    results_df = evaluate_models(models_dict, X_train, X_test, y_train, y_test)
    print(results_df)

    # The model with the highest log-scale R² is treated as the best one.
    ranked = results_df.sort_values(by="R² (log)", ascending=False)
    best_model_name = ranked.iloc[0]["Model"]
    best_model = models_dict[best_model_name]

    best_model.fit(X_train, np.log1p(y_train))
    y_pred_log = best_model.predict(X_test)

    plot_title = f"{best_model_name}: Predicted vs Actual {label}"
    plot_predictions(y_test, y_pred_log, plot_title)
    plot_residuals(np.log1p(y_test), y_pred_log)

    if optimize:
        optimize_model_random_search(X_train, y_train)

### Running the Pipeline

Call `run_pipeline` with either `"OS"` or `"PFS"` to execute the entire workflow.


In [None]:
# Full workflow for overall survival; hyperparameter tuning stays off (optimize defaults to False).
run_pipeline('OS')

In [None]:
# Full workflow for progression-free survival; hyperparameter tuning stays off (optimize defaults to False).
run_pipeline('PFS')