In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
from sksurv.metrics import concordance_index_censored

In [14]:
# Load the three training tables: clinical features, molecular records and survival targets.
df_clinical = pd.read_csv('X_train/clinical_train.csv')
df_molecular = pd.read_csv('X_train/molecular_train.csv')
df_target = pd.read_csv('target_train.csv')

# Number of rows each ID has in the molecular table, used as a mutation count (Nmut).
molecular_counts = df_molecular.groupby('ID').size().reset_index(name='Nmut')

# Attach the targets to the clinical features (default inner join on ID).
# This merge is done exactly once; it used to be computed twice with identical arguments.
df_main = pd.merge(df_clinical, df_target, on='ID')
# Add the mutation count with a left join so patients without any molecular row are kept.
df_main = pd.merge(df_main, molecular_counts, on='ID', how='left')

# IDs absent from the molecular file come out of the left join as NaN:
# treat them as having no known mutation.
df_main['Nmut'] = df_main['Nmut'].fillna(0)

df_main.head()

Unnamed: 0,ID,CENTER,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,CYTOGENETICS,OS_YEARS,OS_STATUS,Nmut
0,P132697,MSK,14.0,2.8,0.2,0.7,7.6,119.0,"46,xy,del(20)(q12)[2]/46,xy[18]",1.115068,1.0,9.0
1,P132698,MSK,1.0,7.4,2.4,0.1,11.6,42.0,"46,xx",4.928767,0.0,3.0
2,P116889,MSK,15.0,3.7,2.1,0.1,14.2,81.0,"46,xy,t(3;3)(q25;q27)[8]/46,xy[12]",2.043836,0.0,3.0
3,P132699,MSK,1.0,3.9,1.9,0.1,8.9,77.0,"46,xy,del(3)(q26q27)[15]/46,xy[5]",2.476712,1.0,11.0
4,P132700,MSK,6.0,128.0,9.7,0.9,11.1,195.0,"46,xx,t(3;9)(p13;q22)[10]/46,xx[10]",3.145205,0.0,1.0


In [15]:
# Clean the survival targets.
# OS_YEARS is coerced to numeric FIRST: values that cannot be parsed become NaN and are
# then removed by the dropna below, instead of slipping through after the NaN filter.
# The chain rebinds df_main rather than mutating it in place, so re-running the cell is safe.
df_main = (
    df_main
    .assign(OS_YEARS=lambda frame: pd.to_numeric(frame['OS_YEARS'], errors='coerce'))
    .dropna(subset=['OS_YEARS', 'OS_STATUS'])
    .astype({'OS_STATUS': bool})
)

# Drop ID, Targets, and CYTOGENETICS (too complex for now)
features_to_drop = ['ID', 'OS_STATUS', 'OS_YEARS', 'CYTOGENETICS']

X = df_main.drop(columns=features_to_drop)
# One-hot encode the recruiting center; the first level is dropped and acts as the baseline.
X = pd.get_dummies(X, columns=['CENTER'], drop_first=True)

# Missing values handling (median imputation).
# NOTE(review): the medians are computed over every row, including rows that the later
# train/test split assigns to the test set; consider fitting the imputer on the training
# rows only to avoid leaking test information.
imputer = SimpleImputer(strategy='median')
X_columns = X.columns
# Rebuilding the frame from the imputed array gives it a fresh 0..n-1 index; row order is
# unchanged, so X stays positionally aligned with df_main.
X = pd.DataFrame(imputer.fit_transform(X), columns=X_columns)

X.head()

Unnamed: 0,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,Nmut,CENTER_CGM,CENTER_DUS,CENTER_DUTH,...,CENTER_MUV,CENTER_PV,CENTER_REL,CENTER_RMCN,CENTER_ROM,CENTER_TUD,CENTER_UMG,CENTER_UOB,CENTER_UOXF,CENTER_VU
0,14.0,2.8,0.2,0.7,7.6,119.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,7.4,2.4,0.1,11.6,42.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.0,3.7,2.1,0.1,14.2,81.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,3.9,1.9,0.1,8.9,77.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,128.0,9.7,0.9,11.1,195.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Survival target built from df_main: OS_STATUS is the event indicator, OS_YEARS the time.
y = Surv.from_dataframe(event='OS_STATUS', time='OS_YEARS', data=df_main)

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [17]:
# Random Survival Forest: configure the ensemble and train it on the training split.
# fit() hands back the estimator itself, so construction and training can be chained.
rsf = RandomSurvivalForest(
    random_state=42,
    n_jobs=-1,               # run on every available CPU core
    max_features="sqrt",
    min_samples_leaf=15,
    min_samples_split=10,
    n_estimators=100,        # forest of 100 trees
).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
rsf

0,1,2
,n_estimators,100
,max_depth,
,min_samples_split,10
,min_samples_leaf,15
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,bootstrap,True
,oob_score,False
,n_jobs,-1


In [18]:
# Concordance index per split (1.0 = perfect ranking, 0.5 = random), train first then test.
train_score, test_score = (
    rsf.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Report both scores, one per line, rounded to four decimals.
print(
    f"C-Index on Train: {train_score:.4f}",
    f"C-Index on Test : {test_score:.4f}",
    sep="\n",
)


C-Index on Train: 0.7588
C-Index on Test : 0.7302
