In [1]:
!pip install -q /kaggle/input/pytabkit/*.whl --no-deps

In [2]:
!pip install -q /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install -q /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install -q /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone


In [3]:
#!pip install -q /kaggle/input/faiss-cpu-173/faiss_cpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [4]:
!pip install /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [5]:
import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index


class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    
    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
            
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    
    metric_list = []
    for race in merged_df_race_dict.keys():
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        c_index_race = concordance_index(
            merged_df_race[interval_label],
            -merged_df_race[prediction_label],
            merged_df_race[event_label]
        )
        metric_list.append(c_index_race)
        
    return float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))

In [6]:
import pandas as pd
import numpy as np

In [7]:
train = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

In [8]:
from lifelines import KaplanMeierFitter
from sklearn.model_selection import KFold

def create_fold_specific_target(data, time_col='efs_time', event_col='efs'):
    cv = KFold(n_splits=10, shuffle=True, random_state=52)
    target = np.zeros(len(data))

    for train_index, valid_index in cv.split(data):
        train_data = data.iloc[train_index]
        valid_data = data.iloc[valid_index]

        kmf = KaplanMeierFitter()
        kmf.fit(durations=train_data[time_col], event_observed=train_data[event_col])
        target[valid_index] = kmf.survival_function_at_times(valid_data[time_col]).values
        
    data['y'] = target
    
    return data

In [9]:
train = create_fold_specific_target(train, time_col='efs_time', event_col='efs')

In [10]:
combined = pd.concat([train, test], axis=0)

RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [c for c in train.columns if not c in RMV]

In [11]:
CATS = []
for c in FEATURES:
    if combined[c].dtype == 'object':
        CATS.append(c)
        train[c] = train[c].fillna('nan')
        test[c] = test[c].fillna('nan')

NUMS = [c for c in FEATURES if not c in CATS]

In [12]:
for c in NUMS:
    combined[c] = combined[c].fillna(combined[c].mean())

train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train[NUMS] = scaler.fit_transform(train[NUMS])
test[NUMS] = scaler.transform(test[NUMS])

In [14]:
from pytabkit.models.sklearn.sklearn_interfaces import (
    RealMLP_TD_Regressor,
    MLP_RTDL_D_Regressor,
    Resnet_RTDL_D_Regressor,
    FTT_D_Regressor,
    MLP_PLR_D_Regressor,
    # TabR_S_D_Regressor,
    # RealTabR_D_Regressor,
    TabM_D_Regressor
)

In [15]:
import os
import joblib

In [16]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_realmlp = np.zeros(len(train))
pred_realmlp = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_realmlp = RealMLP_TD_Regressor(
        n_cv=5,
        use_early_stopping=True,
        early_stopping_additive_patience=20,
        random_state=52, 
        verbosity=2
    )
    model_realmlp.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/realmlp', exist_ok=True)
    joblib.dump(model_realmlp, f'/kaggle/working/realmlp/realmlp_fold_{i}.pkl')

    oof_realmlp[test_index] = model_realmlp.predict(X_valid)
    pred_realmlp += model_realmlp.predict(X_test)

pred_realmlp /= FOLDS

#########################
### Fold 1
#########################
Columns classified as continuous: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
Columns classified as categorical: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_m

In [17]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_realmlp
m_realmlp = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for realmlp KaplanMeier = ", m_realmlp) # 0.6688371595345517


Overall CV for realmlp KaplanMeier =  0.6672550692523211


In [18]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_mlprdtl = np.zeros(len(train))
pred_mlprdtl = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_mlprdtl = MLP_RTDL_D_Regressor(
        n_cv=5,
        lr=0.0001,
        max_epochs=100,
        es_patience=20,
        random_state=52, 
        verbosity=2
    )
    model_mlprdtl.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/mlprdtl', exist_ok=True)
    joblib.dump(model_mlprdtl, f'/kaggle/working/mlprdtl/mlprdtl_fold_{i}.pkl')

    oof_mlprdtl[test_index] = model_mlprdtl.predict(X_valid)
    pred_mlprdtl += model_mlprdtl.predict(X_test)

pred_mlprdtl /= FOLDS

#########################
### Fold 1
#########################
Columns classified as continuous: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
Columns classified as categorical: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_m

In [19]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_mlprdtl
m_mlprdtl = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for mlprdtl KaplanMeier = ", m_mlprdtl) # 0.6642933850923877


Overall CV for mlprdtl KaplanMeier =  0.6643116873294168


In [20]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_resnet = np.zeros(len(train))
pred_resnet = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_resnet = Resnet_RTDL_D_Regressor(
        n_cv=5,
        lr=0.0001,
        max_epochs=200,
        es_patience=20,
        random_state=52, 
        verbosity=2
    )
    model_resnet.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/resnet', exist_ok=True)
    joblib.dump(model_resnet, f'/kaggle/working/resnet/resnet_fold_{i}.pkl')

    oof_resnet[test_index] = model_resnet.predict(X_valid)
    pred_resnet += model_resnet.predict(X_test)

pred_resnet /= FOLDS

#########################
### Fold 1
#########################
Columns classified as continuous: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
Columns classified as categorical: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_m

In [21]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_resnet
m_resnet = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for resnet KaplanMeier = ", m_resnet) # 0.6663569259927158


Overall CV for resnet KaplanMeier =  0.6663968239200998


In [22]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_ftt = np.zeros(len(train))
pred_ftt = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_ftt = FTT_D_Regressor(
        n_cv=5,
        lr=0.0001,
        max_epochs=100,
        es_patience=15,
        random_state=52, 
        verbosity=2
    )
    model_ftt.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/ftt', exist_ok=True)
    joblib.dump(model_ftt, f'/kaggle/working/ftt/ftt_fold_{i}.pkl')

    oof_ftt[test_index] = model_ftt.predict(X_valid)
    pred_ftt += model_ftt.predict(X_test)

pred_ftt /= FOLDS

#########################
### Fold 1
#########################
Columns classified as continuous: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
Columns classified as categorical: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_m

In [23]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_ftt
m_ftt = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for ftt KaplanMeier = ", m_ftt) # 0.6680354797728337


Overall CV for ftt KaplanMeier =  0.6734988161252538


In [24]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_mlpprl = np.zeros(len(train))
pred_mlpprl = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_mlpprl = MLP_PLR_D_Regressor(
        n_cv=5,
        lr=0.0001,
        max_epochs=100,
        es_patience=20,
        random_state=52, 
        verbosity=2
    )
    model_mlpprl.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/mlpprl', exist_ok=True)
    joblib.dump(model_mlpprl, f'/kaggle/working/mlpprl/mlpprl_fold_{i}.pkl')

    oof_mlpprl[test_index] = model_mlpprl.predict(X_valid)
    pred_mlpprl += model_mlpprl.predict(X_test)

pred_mlpprl /= FOLDS

#########################
### Fold 1
#########################
Columns classified as continuous: ['hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'year_hct', 'hla_match_a_high', 'donor_age', 'hla_match_b_low', 'age_at_hct', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10']
Columns classified as categorical: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_m

In [25]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_mlpprl
m_mlpprl = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for mlpprl KaplanMeier = ", m_mlpprl) # 0.6706938280206677


Overall CV for mlpprl KaplanMeier =  0.6707513719221984


In [26]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=52)
    
oof_tabmd = np.zeros(len(train))
pred_tabmd = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print("#" * 25)
    print(f"### Fold {i + 1}")
    print("#" * 25)
    
    X_train = train.loc[train_index, FEATURES].copy()
    y_train = train.loc[train_index, "y"]
    X_valid = train.loc[test_index, FEATURES].copy()
    y_valid = train.loc[test_index, "y"]
    X_test = test[FEATURES].copy()

    model_tabmd = TabM_D_Regressor(
        n_cv=5,
        lr=0.0001,
        n_epochs=200,
        patience=20,
        random_state=52, 
        verbosity=0
    )
    model_tabmd.fit(
        X_train, y_train,
        X_valid, y_valid
    )
    os.makedirs('/kaggle/working/tabmd', exist_ok=True)
    joblib.dump(model_tabmd, f'/kaggle/working/tabmd/tabmd_fold_{i}.pkl')

    oof_tabmd[test_index] = model_tabmd.predict(X_valid)
    pred_tabmd += model_tabmd.predict(X_test)

pred_tabmd /= FOLDS

#########################
### Fold 1
#########################
Setting seed: 1387748341
Setting seed: 1388881564
Setting seed: 112175883
Setting seed: 628153869
Setting seed: 905253015
#########################
### Fold 2
#########################
Setting seed: 1387748341
Setting seed: 1388881564
Setting seed: 112175883
Setting seed: 628153869
Setting seed: 905253015
#########################
### Fold 3
#########################
Setting seed: 1387748341
Setting seed: 1388881564
Setting seed: 112175883
Setting seed: 628153869
Setting seed: 905253015
#########################
### Fold 4
#########################
Setting seed: 1387748341
Setting seed: 1388881564
Setting seed: 112175883
Setting seed: 628153869
Setting seed: 905253015
#########################
### Fold 5
#########################
Setting seed: 1387748341
Setting seed: 1388881564
Setting seed: 112175883
Setting seed: 628153869
Setting seed: 905253015
#########################
### Fold 6
#########################
Setting seed

In [27]:
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_tabmd
m_tabmd = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for tabmd KaplanMeier = ", m_tabmd) # dolgo, potom na gpu test


Overall CV for tabmd KaplanMeier =  0.6697422578152127
