In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lifelines.utils import concordance_index
from metric import score
from sdv.single_table import CTGANSynthesizer, CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata

In [9]:
# Configure and train the CTGAN model
# ctgan = CTGANSynthesizer(
#     metadata=metadata,
#     embedding_dim=256,
#     generator_dim=[512, 512],
#     discriminator_dim=[512, 512],
#     generator_lr=1e-4,
#     discriminator_lr=1e-4,
#     batch_size=1000,
#     epochs=40,
#     verbose=True,
#     cuda=True,
# )

# efs_1_df = train[train["efs"] == 1]
# ctgan.fit(efs_1_df)

In [2]:
train_path = "data/train.csv"
test_path = "data/test.csv"
sample_path = "data/sample_submission.csv"
data_dict_path = "data/data_dictionary.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
data_dict_df = pd.read_csv(data_dict_path)

from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [c for c in train.columns if not c in RMV]
# print(f"Total features: {len(FEATURES)} - {FEATURES}")

CATS = []
NUMS = []
for c in FEATURES:
    if train[c].dtype=="object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
    else:
        NUMS.append(c)
        train[c] = train[c].fillna(-1)
        test[c] = test[c].fillna(-1)
# print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
# print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        # print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")
        
    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")
    
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [11]:
count = 0
for col in train.columns:
    if train[col].dtype != "object" and len(train[col].unique()) <= 20:
        count += 1
        print(f"{col}: {sorted(train[col].unique())}")

print(count)

dri_score: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
psych_disturb: [0, 1, 2, 3]
cyto_score: [0, 1, 2, 3, 4, 5, 6, 7]
diabetes: [0, 1, 2, 3]
hla_match_c_high: [-1.0, 0.0, 1.0, 2.0]
hla_high_res_8: [-1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
tbi_status: [0, 1, 2, 3, 4, 5, 6, 7]
arrhythmia: [0, 1, 2, 3]
hla_low_res_6: [-1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
graft_type: [0, 1]
vent_hist: [0, 1, 2]
renal_issue: [0, 1, 2, 3]
pulm_severe: [0, 1, 2, 3]
prim_disease_hct: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
hla_high_res_6: [-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0]
cmv_status: [0, 1, 2, 3, 4]
hla_high_res_10: [-1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
hla_match_dqb1_high: [-1.0, 0.0, 1.0, 2.0]
tce_imm_match: [0, 1, 2, 3, 4, 5, 6, 7, 8]
hla_nmdp_6: [-1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
hla_match_c_low: [-1.0, 0.0, 1.0, 2.0]
rituximab: [0, 1, 2]
hla_match_drb1_low: [-1.0, 1.0, 2.0]
hla_match_dqb1_low: [-1.0, 0.0, 1.0, 2.0]
prod_type: [0, 1]
cyto_score_detail: [0, 1, 2, 3, 4, 5]
conditionin

In [14]:
augmented_train = train.copy()
augmented_train["donor_age"] = augmented_train["donor_age"] + np.random.uniform(-1, 1, size=len(augmented_train))
augmented_train["age_at_hct"] = augmented_train["age_at_hct"] + np.random.uniform(-1, 1, size=len(augmented_train))
augmented_train["year_hct"] = augmented_train["age_at_hct"] + np.random.choice([-1, 0, 1], size=len(augmented_train))

In [22]:
new_train = pd.concat([train, augmented_train])
new_train = new_train.reset_index()
new_train["ID"] = new_train["index"]

In [23]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
print("Using LightGBM version",lgb.__version__)

Using LightGBM version 4.5.0


In [24]:
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
oof_lgb = np.zeros(len(new_train))
pred_lgb = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(new_train)):
    print(f"Fold {i+1}")
    
    x_train = new_train.loc[train_index,FEATURES].copy()
    y_train = new_train.loc[train_index,"y"]    
    x_valid = new_train.loc[test_index,FEATURES].copy()
    y_valid = new_train.loc[test_index,"y"]
    x_test = test[FEATURES].copy()

    model_lgb = LGBMRegressor(
        device="gpu", 
        max_depth=3, 
        colsample_bytree=0.4,  
        subsample=0.9, 
        n_estimators=2500, 
        learning_rate=0.02, 
        objective="regression", 
        verbose=-1, 
        early_stopping_rounds=25,
    )
    model_lgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
    )
    
    # INFER OOF
    oof_lgb[test_index] = model_lgb.predict(x_valid)
    # INFER TEST
    pred_lgb += model_lgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_lgb /= FOLDS

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10


In [25]:
y_true = new_train[["ID","efs","efs_time","race_group"]].copy()
y_pred = new_train[["ID"]].copy()
y_pred["prediction"] = oof_lgb
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for LightGBM KaplanMeier =",m)

  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)



Overall CV for LightGBM KaplanMeier = 0.6869253456166916


In [29]:
prediction = model_lgb.predict(train[FEATURES])
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = prediction
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for LightGBM KaplanMeier =",m)

  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)



Overall CV for LightGBM KaplanMeier = 0.707889399249549
