In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import json
import pandas as pd
import wandb
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline

from src.architecture.models import make_class_models, make_risk_models
from src.preproc.preproc_utils import FCBFSelector, ToCuPy
from src.utils.utils import setup_hyperparameters

# 0) Setup run hyperparameters and W&B run
config_path = "src/configs/config.yaml"
_, run = setup_hyperparameters(config_path)

[34m[1mwandb[0m: Currently logged in as: [33mjuanravm[0m ([33mjuanravm-vall-d-hebron-institute-of-oncology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# 1) Loading data
df = pd.read_csv("data/data_train.tsv", sep="\t")

# Filtering variables with too many missing values
na_frac = df.drop(columns=["risk_status", "dfs_status", "dfs_time"]).isna().mean()
keep = na_frac[na_frac <= 0.10].index.tolist()
df = df.loc[:, keep + ["risk_status", "dfs_status", "dfs_time"]]

# Removing samples with any missing value
keep = df.drop(columns=["risk_status", "dfs_status", "dfs_time"]).dropna().index.tolist()
df = df.loc[keep, :]

print(f"Training with {df.shape[0]} samples")

y_class = df["risk_status"].copy()
y_event = df["dfs_status"].copy()
y_time = df["dfs_time"].copy()
X = df.drop(columns=["risk_status", "dfs_status", "dfs_time", "os_status"])
X_class = X.loc[y_class.notna(), :]
y_class = y_class.loc[y_class.notna()]

# 4) Selector for best features
selector = FCBFSelector(mode="rank", threshold=0.0, n_bins=2)

# 5) Models and hyperparameters search
class_models = make_class_models()
risk_models = make_risk_models()

cv_class = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scorer_class = make_scorer(roc_auc_score)
cv_risk = KFold(n_splits=5, shuffle=True, random_state=0)


def _cindex_scorer(y_true, y_pred):
    """
    Concordance index scorer for survival models.
    Assumes y_true is a structured array with fields ('event', 'time').
    Uses negative scores so that higher hazard -> lower survival time.
    """
    return concordance_index_censored(y_true["event"], y_true["time"], -y_pred)[0]


scorer_risk = make_scorer(_cindex_scorer, greater_is_better=True, needs_proba=False)

Training with 125 samples


In [6]:
from sklearn.base import is_classifier, is_regressor


# 6) Class models
class_results = {}
class_table_rows = []
# y_class = y_class.astype(int).to_numpy().ravel()

for name, (estimator, search_space) in class_models.items():
    if name == "xgb":
        pipe = Pipeline(
            [
                ("selector", selector),
                ("tocupy", ToCuPy()),
                ("model", estimator),
            ]
        )
    else:
        pipe = Pipeline(
            [
                ("selector", selector),
                ("model", estimator),
            ]
        )

    search = RandomizedSearchCV(
        pipe,
        param_distributions=search_space,
        n_iter=50,
        cv=cv_class,
        scoring="roc_auc",
        n_jobs=-1,
        random_state=0,
        refit=True,
    )

    search.fit(X_class, y_class)
    class_results[name] = {
        "best_score": search.best_score_,
        "best_params": search.best_params_,
        "best_estimator": search.best_estimator_,
    }
    print(f"{name}: AUC={search.best_score_:.3f}")
    if run:
        run.log(
            {
                f"class/{name}/best_auc": search.best_score_,
                f"class/{name}/best_params": json.dumps(search.best_params_, default=str),
            }
        )
        class_table_rows.append([name, search.best_score_, json.dumps(search.best_params_, default=str)])

logreg: AUC=0.852
svc: AUC=0.875
rf: AUC=0.897
xgb: AUC=0.896


In [7]:
class_results

{'logreg': {'best_score': 0.8522435897435898,
  'best_params': {'selector__kbest': 10,
   'model__l1_ratio': 0.55,
   'model__C': 26.826957952797244},
  'best_estimator': Pipeline(steps=[('selector', FCBFSelector(kbest=10, mode='rank')),
                  ('model',
                   LogisticRegression(C=26.826957952797244, l1_ratio=0.55,
                                      max_iter=5000, penalty='elasticnet',
                                      solver='saga'))])},
 'svc': {'best_score': 0.8749999999999998,
  'best_params': {'selector__kbest': 20,
   'model__kernel': 'rbf',
   'model__gamma': 0.001,
   'model__C': 0.01},
  'best_estimator': Pipeline(steps=[('selector', FCBFSelector(kbest=20, mode='rank')),
                  ('model', SVC(C=0.01, gamma=0.001, probability=True))])},
 'rf': {'best_score': 0.8972756410256411,
  'best_params': {'selector__kbest': 40,
   'model__n_estimators': 200,
   'model__min_samples_split': 2,
   'model__max_features': 'sqrt',
   'model__max_depth':

In [14]:
risk_models = make_risk_models()

In [15]:
from sksurv.util import Surv
from sklearn.model_selection import cross_val_score


# 7) Risk models
risk_results = {}
risk_table_rows = []
y_surv = Surv.from_arrays(event=y_event.astype(bool), time=y_time)

for name, (estimator, search_space) in risk_models.items():

    # Pipeline
    if name == "xgb":
        # Nota: para FCBF NO te aporta CuPy; si lo usas, asegúrate de que selector no reciba CuPy.
        pipe = Pipeline([
            ("selector", selector),
            ("tocupy", ToCuPy()),
            ("model", estimator),
        ])
    else:
        pipe = Pipeline([
            ("selector", selector),
            ("model", estimator),
        ])

    # Caso 1: sin hiperparámetros -> CV score + fit directo
    if (search_space is None) or (isinstance(search_space, dict) and len(search_space) == 0):
        scores = cross_val_score(pipe, X, y_surv, cv=cv_risk, scoring=scorer_risk, n_jobs=-1)
        pipe.fit(X, y_surv)

        best_score = float(np.mean(scores))
        best_params = {}
        best_estimator = pipe

    # Caso 2: con hiperparámetros -> RandomizedSearchCV
    else:
        # Caso especial: XGB survival:cox no acepta y_surv (structured)
        if name == "xgb":
            search = RandomizedSearchCV(
                pipe,
                param_distributions=search_space,
                n_iter=5,
                cv=cv_risk,
                scoring=scorer_risk,
                n_jobs=-1,
                random_state=0,
                refit=True,
                error_score="raise",
            )
            # XGB survival: y = time, weights = event
            search.fit(X, y_time.astype(float), model__sample_weight=y_event.astype(float))

        else:
            search = RandomizedSearchCV(
                pipe,
                param_distributions=search_space,
                n_iter=50,
                cv=cv_risk,
                scoring=scorer_risk,
                n_jobs=-1,
                random_state=0,
                refit=True,
                error_score="raise",
            )
            search.fit(X, y_surv)

        best_score = float(search.best_score_)
        best_params = search.best_params_
        best_estimator = search.best_estimator_

    # Guardar resultados
    risk_results[name] = {
        "best_score": best_score,
        "best_params": best_params,
        "best_estimator": best_estimator,
    }
    print(f"{name}: C-index={best_score:.3f}")

    if run:
        run.log({
            f"risk/{name}/best_cindex": best_score,
            f"risk/{name}/best_params": json.dumps(best_params, default=str),
        })
        risk_table_rows.append([name, best_score, json.dumps(best_params, default=str)])

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/pipeline.py", line 589, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/sklearn/base.py", line 897, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/juanrafaelvalera@vhio.org/ondemand/CARE/src/preproc/preproc_utils.py", line 34, in fit
    selected_idX = fcbf(X, y, mode=self.mode, delta=self.threshold)  # Returns the list with the features ordered by importance
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/function/information_theoretical_based/FCBF.py", line 45, in fcbf
    t1[i, 1] = su_calculation(f, y)
               ^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/mutual_information.py", line 58, in su_calculation
    t1 = information_gain(f1, f2)
         ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/mutual_information.py", line 18, in information_gain
    ig = ee.entropyd(f1) - conditional_entropy(f1, f2)
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/mutual_information.py", line 37, in conditional_entropy
    ce = ee.entropyd(f1) - ee.midd(f1, f2)
                           ^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/entropy_estimators.py", line 104, in midd
    return -entropyd(list(zip(x, y))) + entropyd(x) + entropyd(y)
            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/entropy_estimators.py", line 96, in entropyd
    return entropyfromprobs(hist(sx), base=base)
                            ^^^^^^^^
  File "/mnt/CCBdata/projects/conda_envs/BC/lib/python3.11/site-packages/skfeature/utility/entropy_estimators.py", line 119, in hist
    d[s] = d.get(s, 0) + 1
           ^^^^^^^^^^^
TypeError: unhashable type: 'writeable void-scalar'
