In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lifelines.utils import concordance_index
from metric import score

In [2]:
# Input files, given relative to the notebook's working directory.
train_path, test_path = "data/train.csv", "data/test.csv"
sample_path = "data/sample_submission.csv"
data_dict_path = "data/data_dictionary.csv"

# Read the train, test and data-dictionary CSVs (in that order).
# sample_path is only recorded here; nothing in this cell reads it.
train, test, data_dict_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, data_dict_path)
)

from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Evaluate a Kaplan-Meier survival function at each row's own time.

    A ``KaplanMeierFitter`` is fitted on ``df[time_col]`` (durations) and
    ``df[event_col]`` (event indicator), and the fitted survival function is
    then queried at every value of ``df[time_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the duration and event columns.
    time_col : str, default 'efs_time'
        Name of the duration column.
    event_col : str, default 'efs'
        Name of the event-indicator column.

    Returns
    -------
    The ``.values`` of the object returned by
    ``survival_function_at_times`` -- one entry per queried time, in the
    row order of ``df``.
    """
    durations = df[time_col]
    estimator = KaplanMeierFitter()
    estimator.fit(durations, df[event_col])
    return estimator.survival_function_at_times(durations).values
# Regression target "y": the survival-function value returned for each training row's own efs_time.
train["y"] = transform_survival_probability(df=train, event_col='efs', time_col='efs_time')

In [4]:
# The row id, the two outcome columns and the derived target are kept out of the model inputs.
RMV = ["ID", "efs", "efs_time", "y"]
FEATURES = [c for c in train.columns if c not in RMV]
# print(f"Total features: {len(FEATURES)} - {FEATURES}")

CATS = []
NULLS = ["Not done", "Not tested", "N/A", "N/A, Mel not given", "No drugs reported"]
NUMS = []

for c in FEATURES:
    if train[c].dtype != "object":
        # Non-object column: recorded as numeric, gaps become -1 in both frames.
        NUMS.append(c)
        train[c] = train[c].fillna(-1)
        test[c] = test[c].fillna(-1)
        continue

    # Object column: when the training data already contains one of the NULLS
    # tokens, reuse it for the missing entries of train and test. Once a token
    # has been applied nothing is left to fill, so later matches are no-ops.
    seen = train[c].unique()
    for null in NULLS:
        if null in seen:
            train[c] = train[c].fillna(null)
            test[c] = test[c].fillna(null)

    # Anything still missing gets the generic "NAN" label.
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")
    CATS.append(c)

In [5]:
# List the object columns with at most 20 distinct values that still carry
# the generic "NAN" label, and count them.
count = 0
for col in train.columns:
    if train[col].dtype != "object":
        continue
    levels = train[col].unique()
    if len(levels) <= 20 and "NAN" in levels:
        print("#" * 50)
        print(f"{col}: {levels}")
        count += 1

print(count)

##################################################
dri_score: ['N/A - non-malignant indication' 'Intermediate' 'High' 'Low'
 'N/A - disease not classifiable' 'N/A - pediatric' 'TBD cytogenetics'
 'Intermediate - TED AML case <missing cytogenetics' 'NAN'
 'High - TED AML case <missing cytogenetics' 'Very high'
 'Missing disease status']
##################################################
vent_hist: ['No' 'Yes' 'NAN']
##################################################
cmv_status: ['+/+' '-/+' '-/-' 'NAN' '+/-']
##################################################
tce_imm_match: ['NAN' 'P/P' 'G/B' 'H/B' 'G/G' 'P/H' 'P/B' 'H/H' 'P/G']
##################################################
rituximab: ['No' 'NAN' 'Yes']
##################################################
ethnicity: ['Not Hispanic or Latino' 'Hispanic or Latino' 'NAN'
 'Non-resident of the U.S.']
##################################################
mrd_hct: ['NAN' 'Positive' 'Negative']
#################################################

In [7]:
CAT_SIZE = []
CAT_EMB = []
NUMS = []

# Stack train on top of test so a category value gets the same integer code in both frames.
n_train_rows = len(train)
combined = pd.concat([train, test], axis=0, ignore_index=True)

for c in FEATURES:
    if c not in CATS:
        # Numeric feature: store 64-bit columns as 32-bit.
        if combined[c].dtype == "float64":
            combined[c] = combined[c].astype("float32")
        elif combined[c].dtype == "int64":
            combined[c] = combined[c].astype("int32")
        NUMS.append(c)
        continue

    # LABEL ENCODE: integer codes shifted so the smallest is 0, held as an
    # int32-backed pandas category column.
    codes, _ = combined[c].factorize()
    combined[c] = (
        pd.Series(codes - codes.min(), index=combined.index)
        .astype("int32")
        .astype("category")
    )

# Split back at the original train length; test gets a fresh 0-based index.
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[n_train_rows:].reset_index(drop=True).copy()

In [8]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
# Print the installed LightGBM version so it is recorded next to the results below.
print("Using LightGBM version",lgb.__version__)

Using LightGBM version 4.5.0


In [9]:
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_lgb: out-of-fold prediction for every training row.
# pred_lgb: running sum of the per-fold test predictions (averaged after the loop).
oof_lgb = np.zeros(len(train))
pred_lgb = np.zeros(len(test))

# Loop-invariant inputs, built once instead of once per fold.
x_all = train[FEATURES]
y_all = train["y"]
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(f"Fold {i+1}")

    # KFold.split yields positional row numbers, so rows are selected with
    # .iloc; label-based .loc is only equivalent while `train` carries a
    # default 0..n-1 index.
    x_train = x_all.iloc[train_index]
    y_train = y_all.iloc[train_index]
    x_valid = x_all.iloc[test_index]
    y_valid = y_all.iloc[test_index]

    # NOTE(review): LightGBM documents that row bagging is only enabled when
    # subsample_freq (bagging_freq) is non-zero, so subsample=0.9 probably has
    # no effect here. Left unchanged to keep the reported CV comparable; set
    # subsample_freq=1 if bagging is intended.
    # NOTE(review): device="gpu" needs a GPU-enabled LightGBM build; switch to
    # "cpu" to run elsewhere.
    # NOTE(review): early stopping watches the same fold that is scored as
    # out-of-fold below, which may make the CV slightly optimistic.
    model_lgb = LGBMRegressor(
        device="gpu",
        max_depth=3,
        colsample_bytree=0.4,
        subsample=0.9,
        n_estimators=2500,
        learning_rate=0.02,
        objective="regression",
        verbose=-1,
        early_stopping_rounds=25,
    )
    model_lgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
    )

    # INFER OOF
    oof_lgb[test_index] = model_lgb.predict(x_valid)
    # INFER TEST
    pred_lgb += model_lgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_lgb /= FOLDS

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10


In [10]:
# Evaluate the out-of-fold LightGBM predictions with `score` from the local
# `metric` module; "ID" is passed as its third argument.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_lgb)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for LightGBM KaplanMeier =", m)

  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)



Overall CV for LightGBM KaplanMeier = 0.671764155740704
