In [2]:
!pip install xgboost
!pip install xgbse
!pip install scikit-survival

Collecting scikit-survival
  Downloading scikit_survival-0.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ecos (from scikit-survival)
  Downloading ecos-2.0.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.0 kB)
Collecting osqp<1.0.0,>=0.6.3 (from scikit-survival)
  Downloading osqp-0.6.7.post3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting qdldl (from osqp<1.0.0,>=0.6.3->scikit-survival)
  Downloading qdldl-0.1.7.post5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading scikit_survival-0.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from xgbse import XGBSEKaplanNeighbors
from xgbse.converters import convert_to_structured
from sksurv.metrics import concordance_index_censored
import numpy as np
from sklearn.base import BaseEstimator

In [7]:
class XGBSEKaplanWrapper(BaseEstimator):
    def __init__(self, **kwargs):
        self.xgb_params = {}
        self.time_bins = None
        for k, v in kwargs.items():
            if k == "time_bins":
                self.time_bins = v
            elif k.startswith("xgb_params__"):
                self.xgb_params[k.replace("xgb_params__", "")] = v
        self.kn_model = XGBSEKaplanNeighbors(xgb_params=self.xgb_params)

    def fit(self, X, y):
        self.kn_model = XGBSEKaplanNeighbors(xgb_params=self.xgb_params)
        self.kn_model.fit(X, y, time_bins=self.time_bins)
        return self

    def predict(self, X):
        return self.kn_model.predict(X, time_bins=self.time_bins)

    def get_params(self, deep=True):
        return {f"xgb_params__{k}": v for k, v in self.xgb_params.items()}

    def set_params(self, **params):
        for k, v in params.items():
            if k.startswith("xgb_params__"):
                self.xgb_params[k.replace("xgb_params__", "")] = v
        return self

In [8]:
data_file_csv = "/content/drive/MyDrive/Ufes/Survival/dataset_fit.csv"

# Read the CSV file
data = pd.read_csv(data_file_csv)

# dados de saida
y = data["time_years"]

# transforma os dados em numeros
y_encoded = convert_to_structured(pd.Series(data['falha']), data["time_years"])

# dados de entrada filtrados
X = data.drop(columns=["time_years_cat", "time_years", "falha"])
# transforma os dados em numeros
X_encoded = pd.get_dummies(X)

# 3. Divisão treino/teste
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.25, random_state=42)

# bins de 0.25 em 0.25 anos, até o tempo máximo observado
time_max = data['time_years'].max()
time_bins = np.arange(0, time_max + 0.25, 0.25)

# Modelo base
mc = XGBSEKaplanWrapper(time_bins=time_bins, enable_categorical=True)

In [9]:
# scorer de C‑index
def cindex_scorer(estimator, X_, y_):
    # estimator.predict retorna tempo medio de sobrevivencia;
    # virar "risco" com sinal invertido
    pred = estimator.predict(X_)
    # retorna tupla (cindex, concordante, discordante, ...) -> index e [0]
    return concordance_index_censored(y_['event'], y_['time'], (1-pred).mean(axis=1))[0]

In [10]:
best_model = mc.fit(X_train, y_train)

In [14]:
# Previsões
y_score_train = best_model.predict(X_train)
# tempo esperado = soma de S(t) * delta t
delta_t = np.diff(np.insert(time_bins, 0, 0))  # delta t entre pontos
y_train_pred = np.sum(y_score_train * delta_t, axis=1)  # shape: (n_individuos,)

y_score_test = best_model.predict(X_test)
# tempo esperado = soma de S(t) * delta t
delta_t = np.diff(np.insert(time_bins, 0, 0))  # delta t entre pontos
y_pred_test = np.sum(y_score_test * delta_t, axis=1)  # shape: (n_individuos,)

# Métricas - Treino
y_event = np.array([y[0] for y in y_train])
y_time = np.array([y[1] for y in y_train])
c_train = concordance_index_censored(y_event, y_time, -y_train_pred)[0]

# Métricas - Teste
y_event = np.array([y[0] for y in y_test])
y_time = np.array([y[1] for y in y_test])
c_test = concordance_index_censored(y_event,  y_time, -y_pred_test)[0]

print("Hiperparâmetros ótimos:\n")
print(str(best_model.get_params()))
print("\n\nMétricas - Treinamento:\n")
print(f"C‑index: {c_train:.4f}\n")
print("\nMétricas - Teste:\n")
print(f"C‑index:  {c_test:.4f}\n")


  return pd.DataFrame(C_exp).fillna(method="bfill").fillna(method="ffill").values
  return pd.DataFrame(C_exp).fillna(method="bfill").fillna(method="ffill").values


Hiperparâmetros ótimos:

{}


Métricas - Treinamento:

C‑index: 0.8125


Métricas - Teste:

C‑index:  0.7975

