In [1]:
import numpy as np
import pandas as pd
from survivors.external import ClassifWrapSA, RegrWrapSA, SAWrapSA

In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score
from scipy.stats import spearmanr
# Метрики регрессии
# - RMSE (Root Mean Squared Error)
# - R^2 (Coefficient of Determination)
# - MAPE (Mean Absolute Percentage Error)
# - MEDAPE (Median Absolute Percentage Error)
# - Spearman корреляция
# - RMSLE (Root Mean Squared Logarithmic Error)

rmse_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: root_mean_squared_error(y_tst["time"], pred_time)
r2_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: r2_score(y_tst["time"], pred_time)
mape_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: np.mean(np.abs((y_tst["time"] - pred_time) / np.maximum(y_tst["time"], 1))) * 100
medape_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: np.median(np.abs((y_tst["time"] - pred_time) / np.maximum(y_tst["time"], 1))) * 100
spearman_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: spearmanr(y_tst["time"], pred_time)[0]
rmsle_exp_time = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: np.sqrt(np.mean((np.log1p(np.clip(y_tst["time"], 0, None)) - np.log1p(np.clip(pred_time, 0, None)))**2))

In [None]:
from sklearn.metrics import root_mean_squared_error, roc_auc_score, log_loss
# Метрики классификации
# - AUC вероятности события
# - log-loss вероятности события
# - RMSE исхода

def find_sf_at_truetime(pred_sf, event_time, bins):
    idx_pred = np.clip(np.searchsorted(bins, event_time), 0, len(bins) - 1)
    proba = np.take_along_axis(pred_sf, idx_pred[:, np.newaxis], axis=1).squeeze()
    return proba

## example
# true_times = np.array([1, 19, 21, 31])
# bins = np.array([10,20,30])
# sf = np.array([[0.9, 0.8, 0.7], 
#                [0.7, 0.6, 0.5], 
#                [0.5, 0.4, 0.3],
#                [0.05, 0.04, 0.03]])
# print(np.mean(sf, axis=1))
# print(find_sf_at_truetime(sf, true_times, bins))  # [0.9  0.6  0.3  0.03] 

auc_event = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: roc_auc_score(y_tst["cens"].astype(int), 1 - np.mean(pred_sf, axis=1))
log_loss_event = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: log_loss(y_tst["cens"], 1 - np.mean(pred_sf, axis=1))
rmse_event = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: root_mean_squared_error(y_tst["cens"], 1 - np.mean(pred_sf, axis=1))

# auc_event_T = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: roc_auc_score(y_tst["cens"].astype(int), 1 - find_sf_at_truetime(pred_sf, y_tst["time"], bins))
# log_loss_event_T = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: log_loss(y_tst["cens"], 1 - find_sf_at_truetime(pred_sf, y_tst["time"], bins))
# rmse_event_T = lambda y_tr, y_tst, pred_time, pred_sf, pred_hf, bins: root_mean_squared_error(y_tst["cens"], 1 - find_sf_at_truetime(pred_sf, y_tst["time"], bins))

In [14]:
from survivors.experiments import grid as exp
import survivors.datasets as ds

l_metrics = ["CI", "IBS", "AUPRC", "RMSE_TIME", "R2_TIME", 
             "AUC_EVENT", "LOGLOSS_EVENT", "RMSE_EVENT",
            #  "AUC_EVENT_T", "LOGLOSS_EVENT_T", "RMSE_EVENT_T",
             "MAPE_TIME", "MEDAPE_TIME", "SPEARMAN_TIME", "RMSLE_TIME"]
X, y, features, categ, _ = ds.load_pbc_dataset()
experim = exp.Experiments(folds=5, mode="CV+SAMPLE")
experim.add_new_metric("RMSE_TIME", rmse_exp_time)
experim.add_new_metric("R2_TIME", r2_exp_time)
experim.add_new_metric("MAPE_TIME", mape_exp_time)
experim.add_new_metric("MEDAPE_TIME", medape_exp_time)
experim.add_new_metric("SPEARMAN_TIME", spearman_exp_time)
experim.add_new_metric("RMSLE_TIME", rmsle_exp_time)
experim.add_new_metric("AUC_EVENT", auc_event)
experim.add_new_metric("LOGLOSS_EVENT", log_loss_event)
experim.add_new_metric("RMSE_EVENT", rmse_event)
experim.set_metrics(l_metrics)

In [5]:
# Гиперпараметры и модели классификации

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

CLASS_PARAM_GRIDS = {
    "logistic_regression": dict(),
    "svc": dict(),
    "knn_classifier": dict(),
    "decision_tree_classifier": dict(),
    "random_forest_classifier": dict(),
    "gradient_boosting_classifier": dict()
}

# CLASS_PARAM_GRIDS = {
#     "logistic_regression": {
#         "penalty": ["l2"],
#         "C": [0.01, 0.1, 1, 10],
#         "solver": ["liblinear", "lbfgs"],
#         "class_weight": [None, "balanced"],
#         "max_iter": [1000],
#     },
#     "svc": {
#         "kernel": ["linear", "rbf"],
#         "C": [0.1, 1, 10],
#         "class_weight": [None, "balanced"],
#         "probability": [True],
#     },
#     "knn_classifier": {
#         "n_neighbors": [5, 10, 20],
#         "weights": ["uniform", "distance"],
#     },
#     "decision_tree_classifier": {
#         "max_depth": [5, 10, 20],
#         "min_samples_split": [2, 10],
#         "min_samples_leaf": [1, 5],
#         "criterion": ["gini", "entropy"],
#     },
#     "random_forest_classifier": {
#         "n_estimators": [100, 300],
#         "max_depth": [10, 30],
#         "min_samples_split": [2, 10],
#         "min_samples_leaf": [1, 5],
#     },
#     "gradient_boosting_classifier": {
#         "n_estimators": [100, 300],
#         "learning_rate": [0.05, 0.1],
#         "max_depth": [2, 3],
#         "subsample": [0.7, 1.0],
#     }
# }

experim.add_method(ClassifWrapSA(LogisticRegression()), CLASS_PARAM_GRIDS['logistic_regression'])
experim.add_method(ClassifWrapSA(SVC()), CLASS_PARAM_GRIDS['svc'])
experim.add_method(ClassifWrapSA(KNeighborsClassifier()), CLASS_PARAM_GRIDS['knn_classifier'])
experim.add_method(ClassifWrapSA(DecisionTreeClassifier()), CLASS_PARAM_GRIDS['decision_tree_classifier'])
experim.add_method(ClassifWrapSA(RandomForestClassifier()), CLASS_PARAM_GRIDS['random_forest_classifier'])
experim.add_method(ClassifWrapSA(GradientBoostingClassifier()), CLASS_PARAM_GRIDS['gradient_boosting_classifier'])

In [6]:
# Гиперпараметры и модели регрессии

from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

REGR_PARAM_GRIDS = {
    "elastic_net": dict(),
    "decision_tree_regressor": dict(),
    "random_forest_regressor": dict(),
    "gradient_boosting_regressor": dict(),
    "svr": dict(),
    "knn_regressor": dict()
}

# REGR_PARAM_GRIDS = {
#     "elastic_net": {
#         "alpha": [0.001, 0.01, 0.1],
#         "l1_ratio": [0.2, 0.5, 0.8],
#         "max_iter": [1000, 5000],
#     }, 
#     "decision_tree_regressor": {
#         "max_depth": [5, 10, 20],
#         "min_samples_split": [2, 10],
#         "min_samples_leaf": [1, 5],
#         "criterion": ["squared_error", "friedman_mse"],
#     },
#     "random_forest_regressor": {
#         "n_estimators": [100, 300],
#         "max_depth": [10, 30],
#         "min_samples_split": [2, 10],
#         "min_samples_leaf": [1, 5],
#     },
#     "gradient_boosting_regressor": {
#         "n_estimators": [100, 300],
#         "learning_rate": [0.05, 0.1],
#         "max_depth": [2, 3],
#         "subsample": [0.7, 1.0],
#     },
#     "svr": {
#         "kernel": ["linear", "rbf"],
#         "C": [0.1, 1, 10],
#         "epsilon": [0.1, 0.2],
#     },
#     "knn_regressor": {
#         "n_neighbors": [5, 10, 20],
#         "weights": ["uniform", "distance"],
#     }
# }

experim.add_method(RegrWrapSA(ElasticNet()), REGR_PARAM_GRIDS['elastic_net'])
experim.add_method(RegrWrapSA(DecisionTreeRegressor()), REGR_PARAM_GRIDS['decision_tree_regressor'])
experim.add_method(RegrWrapSA(RandomForestRegressor()), REGR_PARAM_GRIDS['random_forest_regressor'])
experim.add_method(RegrWrapSA(GradientBoostingRegressor()), REGR_PARAM_GRIDS['gradient_boosting_regressor'])
experim.add_method(RegrWrapSA(SVR()), REGR_PARAM_GRIDS['svr'])
experim.add_method(RegrWrapSA(KNeighborsRegressor()), REGR_PARAM_GRIDS['knn_regressor'])

In [7]:
# Гиперпараметры моделей выживаемости (внешние для survivors)

from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.tree import SurvivalTree
from sksurv.ensemble import RandomSurvivalForest
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from lifelines import KaplanMeierFitter

EXTERNAL_SURV_PARAM_GRIDS = {
    "km": dict(),
    "cox_ph": dict(),
    "random_survival_forest": dict(),
    "survival_tree": dict(),
    "gbsa": dict(),
}

# EXTERNAL_SURV_PARAM_GRIDS = {
#     "km": {},
#     "cox_ph": {
#         'alpha': [100, 10, 1, 0.1, 0.01, 0.001],
#         'ties': ["breslow"]
#     },
#     "random_survival_forest": {
#         'n_estimators': [50],
#         'max_depth': [5, 20],
#         'min_samples_leaf': [0.001, 0.01, 0.1, 0.25],
#         "random_state": [123]
#     },
#     "survival_tree": {
#         'max_depth': [None, 20, 30],
#         'min_samples_leaf': [1, 10, 20],
#         'max_features': [None, "sqrt"],
#         "random_state": [123]
#     },
#     "gbsa": {
#         'loss': ["coxph"],
#         'learning_rate': [0.01, 0.05, 0.1, 0.5],
#         'n_estimators': [50],
#         'min_samples_leaf': [1, 10, 50, 100],
#         'max_features': ["sqrt"],
#         "random_state": [123]
#     },
# }

experim.add_method(SAWrapSA(KaplanMeierFitter()), EXTERNAL_SURV_PARAM_GRIDS['km'])
experim.add_method(CoxPHSurvivalAnalysis, EXTERNAL_SURV_PARAM_GRIDS['cox_ph'])
experim.add_method(RandomSurvivalForest, EXTERNAL_SURV_PARAM_GRIDS['random_survival_forest'])
experim.add_method(SurvivalTree, EXTERNAL_SURV_PARAM_GRIDS['survival_tree'])
experim.add_method(GradientBoostingSurvivalAnalysis, EXTERNAL_SURV_PARAM_GRIDS['gbsa'])

In [8]:
# Гиперпараметры моделей выживаемости (внутри survivors)

from survivors.tree import CRAID
from survivors.ensemble import ParallelBootstrapCRAID

INTERNAL_SURV_PARAM_GRIDS = {
    "CRAID": {"depth": [10]},
    "ParallelBootstrapCRAID": {"depth": [10]}
}

# INTERNAL_SURV_PARAM_GRIDS = {
#     "CRAID": {
#         "depth": [10],
#         "criterion": ["wilcoxon", "logrank"],
#         "l_reg": [0, 0.01, 0.1],
#         "min_samples_leaf": [0.05, 0.01, 0.001],
#         "categ": [categ]
#     },
#     "ParallelBootstrapCRAID": {
#         "n_estimators": [50],
#         "depth": [7],
#         "size_sample": [0.3, 0.7],
#         "l_reg": [0, 0.01, 0.1],
#         "criterion": ["tarone-ware", "wilcoxon"],
#         "min_samples_leaf": [0.05, 0.01],
#         "ens_metric_name": ["IBS_REMAIN"],
#         "max_features": ["sqrt"],
#         "categ": [categ]
#     }
# }

experim.add_method(CRAID, INTERNAL_SURV_PARAM_GRIDS["CRAID"])
experim.add_method(ParallelBootstrapCRAID, INTERNAL_SURV_PARAM_GRIDS["ParallelBootstrapCRAID"])

In [9]:
import warnings
warnings.filterwarnings("ignore")

experim.run_effective(X, y, verbose=0, stratify_best=[])
df_results = experim.get_best_by_mode()

<survivors.external.mlwrap.ClassifWrapSA object at 0x149e916d0> {}
<survivors.external.mlwrap.ClassifWrapSA object at 0x14c746650> {}
<survivors.external.mlwrap.ClassifWrapSA object at 0x14be41810> {}
<survivors.external.mlwrap.ClassifWrapSA object at 0x14c67ff90> {}
<survivors.external.mlwrap.ClassifWrapSA object at 0x14c6923d0> {}
<survivors.external.mlwrap.ClassifWrapSA object at 0x14c6b6ad0> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14cc93a10> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14be11d10> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14be10950> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14be12310> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14c67d0d0> {}
<survivors.external.mlwrap.RegrWrapSA object at 0x14a71ca50> {}
<survivors.external.mlwrap.SAWrapSA object at 0x14c692790> {}
<class 'sksurv.linear_model.coxph.CoxPHSurvivalAnalysis'> {}
<class 'sksurv.ensemble.forest.RandomSurvivalForest'> {}
<class 'sksurv.tree.tree.SurvivalT

In [10]:
df_results.columns

Index(['METHOD', 'PARAMS', 'TIME', 'CI', 'IBS', 'AUPRC', 'RMSE_TIME',
       'R2_TIME', 'AUC_EVENT', 'LOGLOSS_EVENT', 'RMSE_EVENT', 'MAPE_TIME',
       'MEDAPE_TIME', 'SPEARMAN_TIME', 'RMSLE_TIME', 'CRIT', 'TIMES', 'MEMS',
       'MEM', 'CI_mean', 'IBS_mean', 'AUPRC_mean', 'RMSE_TIME_mean',
       'R2_TIME_mean', 'AUC_EVENT_mean', 'LOGLOSS_EVENT_mean',
       'RMSE_EVENT_mean', 'MAPE_TIME_mean', 'MEDAPE_TIME_mean',
       'SPEARMAN_TIME_mean', 'RMSLE_TIME_mean', 'CI_CV', 'CI_CV_mean',
       'IBS_CV', 'IBS_CV_mean', 'AUPRC_CV', 'AUPRC_CV_mean', 'RMSE_TIME_CV',
       'RMSE_TIME_CV_mean', 'R2_TIME_CV', 'R2_TIME_CV_mean', 'AUC_EVENT_CV',
       'AUC_EVENT_CV_mean', 'LOGLOSS_EVENT_CV', 'LOGLOSS_EVENT_CV_mean',
       'RMSE_EVENT_CV', 'RMSE_EVENT_CV_mean', 'MAPE_TIME_CV',
       'MAPE_TIME_CV_mean', 'MEDAPE_TIME_CV', 'MEDAPE_TIME_CV_mean',
       'SPEARMAN_TIME_CV', 'SPEARMAN_TIME_CV_mean', 'RMSLE_TIME_CV',
       'RMSLE_TIME_CV_mean'],
      dtype='object')

In [13]:
# for m in ["AUC_EVENT", "LOGLOSS_EVENT", "RMSE_EVENT"]:
#     df_res_sort = df_results[["METHOD", f"{m}_mean", f"{m}_T_mean"]].round(3).sort_values(f"{m}_mean")
#     display(df_res_sort)

for m in ["RMSE_TIME", "R2_TIME", "MAPE_TIME", "SPEARMAN_TIME", "RMSLE_TIME", "MEDAPE_TIME"]:
    df_res_sort = df_results[["METHOD", f"{m}_mean"]].round(3).sort_values(f"{m}_mean")
    display(df_res_sort)

Unnamed: 0,METHOD,RMSE_TIME_mean
8,RegrWrapSA(RandomForestRegressor),878.717
6,RegrWrapSA(ElasticNet),910.183
9,RegrWrapSA(GradientBoostingRegressor),918.39
18,ParallelBootstrapCRAID,964.447
17,CRAID,1061.354
10,RegrWrapSA(SVR),1099.397
11,RegrWrapSA(KNeighborsRegressor),1115.114
7,RegrWrapSA(DecisionTreeRegressor),1213.685
0,ClassifWrapSA(LogisticRegression),1308.114
4,ClassifWrapSA(RandomForestClassifier),1326.049


Unnamed: 0,METHOD,R2_TIME_mean
15,SurvivalTree,-3.223
14,RandomSurvivalForest,-3.218
13,CoxPHSurvivalAnalysis,-3.126
16,GradientBoostingSurvivalAnalysis,-3.124
3,ClassifWrapSA(DecisionTreeClassifier),-2.876
5,ClassifWrapSA(GradientBoostingClassifier),-1.206
2,ClassifWrapSA(KNeighborsClassifier),-1.086
1,ClassifWrapSA(SVC),-0.885
12,SAWrapSA(KaplanMeierFitter),-0.649
4,ClassifWrapSA(RandomForestClassifier),-0.515


Unnamed: 0,METHOD,MAPE_TIME_mean
9,RegrWrapSA(GradientBoostingRegressor),80.612
8,RegrWrapSA(RandomForestRegressor),80.693
18,ParallelBootstrapCRAID,85.608
6,RegrWrapSA(ElasticNet),88.891
17,CRAID,90.765
7,RegrWrapSA(DecisionTreeRegressor),94.35
16,GradientBoostingSurvivalAnalysis,100.153
13,CoxPHSurvivalAnalysis,100.197
14,RandomSurvivalForest,107.765
15,SurvivalTree,108.718


Unnamed: 0,METHOD,SPEARMAN_TIME_mean
1,ClassifWrapSA(SVC),0.122
10,RegrWrapSA(SVR),0.147
2,ClassifWrapSA(KNeighborsClassifier),0.166
11,RegrWrapSA(KNeighborsRegressor),0.2
3,ClassifWrapSA(DecisionTreeClassifier),0.257
15,SurvivalTree,0.351
17,CRAID,0.364
5,ClassifWrapSA(GradientBoostingClassifier),0.368
7,RegrWrapSA(DecisionTreeRegressor),0.389
4,ClassifWrapSA(RandomForestClassifier),0.4


Unnamed: 0,METHOD,RMSLE_TIME_mean
8,RegrWrapSA(RandomForestRegressor),0.657
18,ParallelBootstrapCRAID,0.689
9,RegrWrapSA(GradientBoostingRegressor),0.762
17,CRAID,0.778
4,ClassifWrapSA(RandomForestClassifier),0.809
10,RegrWrapSA(SVR),0.815
11,RegrWrapSA(KNeighborsRegressor),0.843
6,RegrWrapSA(ElasticNet),0.964
12,SAWrapSA(KaplanMeierFitter),1.01
1,ClassifWrapSA(SVC),1.025


Unnamed: 0,METHOD,MEDAPE_TIME_mean
8,RegrWrapSA(RandomForestRegressor),31.183
9,RegrWrapSA(GradientBoostingRegressor),32.595
6,RegrWrapSA(ElasticNet),34.48
18,ParallelBootstrapCRAID,37.146
17,CRAID,40.203
11,RegrWrapSA(KNeighborsRegressor),40.584
10,RegrWrapSA(SVR),40.621
7,RegrWrapSA(DecisionTreeRegressor),43.378
4,ClassifWrapSA(RandomForestClassifier),50.572
0,ClassifWrapSA(LogisticRegression),52.89


In [12]:
pd.Series(y["time"]).describe()

count     418.000000
mean     1917.782297
std      1104.672992
min        41.000000
25%      1092.750000
50%      1730.000000
75%      2613.500000
max      4795.000000
dtype: float64