In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.preprocessing import LabelEncoder

def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y

In [24]:
train_path = "data/train.csv"
test_path = "data/test.csv"
sample_path = "data/sample_submission.csv"
data_dict = "data/data_dictionary.csv"

train_data = pd.read_csv(train_path)

In [15]:
data_info_df = pd.read_csv(data_dict)
label_encoder = LabelEncoder()

for index, row in data_info_df.iterrows():
    if row["type"] == "Categorical":
        train_data[row["variable"]] = label_encoder.fit_transform(train_data[row["variable"]])
    else:
        train_data[row["variable"]] = train_data[row["variable"]].fillna(-1)

train_data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,7,0,7,0,-1.0,-1.0,0,0,6.0,...,4,2,1,8.0,0,2.0,0,10.0,0,42.356
1,1,2,0,1,0,2.0,8.0,6,0,6.0,...,3,1,1,8.0,0,2.0,2,10.0,1,4.672
2,2,7,0,7,0,2.0,8.0,0,0,6.0,...,3,1,1,8.0,0,2.0,0,10.0,0,19.793
3,3,0,0,1,0,2.0,8.0,0,0,6.0,...,3,2,1,8.0,0,2.0,0,10.0,0,102.349
4,4,0,0,7,0,2.0,8.0,0,0,6.0,...,3,1,0,8.0,0,2.0,0,10.0,0,16.223


In [16]:
cph = CoxPHFitter()
cph.fit(train_data, duration_col='efs_time', event_col='efs')

<lifelines.CoxPHFitter: fitted with 28800 total observations, 13268 right-censored observations>

In [19]:
test_df = pd.read_csv(test_path)

for index, row in data_info_df.iterrows():
    if row["variable"] == "efs" or row["variable"] == "efs_time":
        continue
    if row["type"] == "Categorical":
        test_df[row["variable"]] = label_encoder.fit_transform(test_df[row["variable"]])
    else:
        test_df[row["variable"]] = test_df[row["variable"]].fillna(-1)
        
test_df.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,28800,1,0,1,0,-1.0,-1.0,0,0,6.0,...,90.0,0,1,1,0,8.0,0,2.0,0,10.0
1,28801,0,0,0,0,2.0,8.0,1,0,6.0,...,90.0,0,0,0,0,8.0,0,2.0,1,10.0
2,28802,1,0,1,0,2.0,8.0,0,0,6.0,...,90.0,0,0,0,0,8.0,0,2.0,0,10.0


In [22]:
risk_scores = cph.predict_partial_hazard(test_df)
print(risk_scores.head())

0    0.427494
1    1.716417
2    0.404897
dtype: float64
