In [None]:
import pandas as pd
import numpy as np

# Columns to load from the transplant extract, grouped by role.
# The concatenation order below is the column order requested from the file.
outcome_cols = [
    "PTIME",    # Target
    "PSTATUS",  # most recent patient status (1-dead,0-alive)
]

history_cols = [
    "NPKID",  # num of tx
]

# Recipient info
recipient_cols = [
    "AGE",
    "AGE_GROUP",
    "GENDER",
    "ABO",
    "DIAG_KI",
    "ON_DIALYSIS",
    "BMI_CALC",
    "WGT_KG_CALC",  # weight
    "HGT_CM_CALC",  # height (alternative column: "INIT_HGT_CM")
]

# Donor info
donor_cols = [
    "AGE_DON",
    "GENDER_DON",
    "ABO_DON",
    "DON_TY",
    "BMI_DON_CALC",
    "WGT_KG_DON_CALC",
    "HGT_CM_DON_CALC",
]

matching_cols = [
    "ABO_MAT",
]

# Candidate columns that are currently NOT loaded (kept here as a reference):
#   GTIME_KI (graft survival), PREV_TX (the same organ tx),
#   PREV_TX_ANY (any previous tx), LIV_DON_TY (living donor relation to recipient)
#   identification of recipients who passed away:
#     COD_KI, COMPOSITE_DEATH_DATE, END_DATE (removal from waiting list),
#     FAILDATE_KI (kidney fail date), GSTATUS_KI (graft failed, 1 - yes)
#   AMIS, BMIS, DRMIS, HLAMIS - columns that break everything
#   CREAT_DON, DIABETES_DON, TX_DATE
#   CMV_IGG, CMV_IGM, CMV_STATUS
#   PRE_TX_TXFUS (pretransplant transfusions), COLD_ISCH_KI, WARM_ISCH_TM_DON
#   A1, A2, B1, B2, DR1, DR2, BW4, BW6, C1, C2, DR51, DR51_2, DR52,
#   DR52_2, DR53, DR53_2, DQ1, DQ2, DA1, DA2, DB1, DB2, DDR1, DDR2, RA1,
#   RA2, RB1, RB2, RDR1, RDR2
#   KDPI, KDRI_MED, KDRI_RAO
#   WL_ID_CODE (registration identifier), DONOR_ID, PT_CODE (recipient id)
cols = outcome_cols + history_cols + recipient_cols + donor_cols + matching_cols

# Read only the selected columns from the parquet file.
parquet_path = "/mnt/lustre/helios-home/stadnkyr/Kidney_transplants.parquet"
dats = pd.read_parquet(parquet_path, engine='auto', columns=cols)

In [None]:
# Preview the first rows of the freshly loaded frame.
dats.head()

In [None]:
# Lift the column display limit so wide frames are rendered without truncation.
pd.options.display.max_columns = None

# Summary statistics of the loaded frame.
display(dats.describe())

# dataframe

In [None]:
# Keep only rows with a recorded patient status.
# The explicit .copy() makes `df` an independent frame: later cells assign into
# it, and without the copy pandas treats those writes as chained assignment on
# a selection of `dats` (SettingWithCopyWarning, or a silent no-op under
# Copy-on-Write).
df = dats[dats["PSTATUS"].notnull()].copy()
df.shape

In [None]:
# Preview the first rows after filtering on PSTATUS.
display(df.head())

In [None]:
# Row count per AGE_GROUP value.
display(df["AGE_GROUP"].value_counts())

In [None]:
# Fill missing ON_DIALYSIS with 'N' and drop rows lacking donor gender, donor
# blood group or ABO match level.
# A new frame is built instead of calling fillna/dropna with inplace=True:
# an in-place fillna on a column selection (df[col].fillna(..., inplace=True))
# operates on an intermediate object and is not guaranteed to update `df`.
df = (
    df
    .assign(ON_DIALYSIS=df['ON_DIALYSIS'].fillna('N'))  # could be a bad idea
    .dropna(subset=["GENDER_DON", "ABO_DON", "ABO_MAT"])
)
df.shape


In [None]:
# Columns treated as categorical features.
categorical_values = [
    "AGE_GROUP",
    "GENDER",
    "ABO",
    "DON_TY",
    "ON_DIALYSIS",
    "GENDER_DON",
    "ABO_DON",
    "ABO_MAT",
]

# Columns cast to float further down; the list also carries the two outcome
# columns PTIME and PSTATUS. "DIAG_KI" is intentionally left out.
numerical_values = [
    "NPKID",
    "AGE",
    "BMI_CALC",
    "WGT_KG_CALC",
    "HGT_CM_CALC",
    "AGE_DON",
    "BMI_DON_CALC",
    "WGT_KG_DON_CALC",
    "HGT_CM_DON_CALC",
    "PTIME",
    "PSTATUS",
]

In [None]:
import numpy as np

# Cast every column listed in numerical_values to float.
# The builtin `float` is used because the `np.float` alias was removed from
# NumPy (1.24+) and raises AttributeError there. A single astype call with a
# dtype mapping returns a new frame instead of assigning column by column.
df = df.astype({column: float for column in numerical_values})

In [None]:
# Per categorical column: shape, dtype and number of missing values.
for name in categorical_values:
    series = df[name]
    missing = series.isna().sum()
    print(series.shape, series.dtypes, missing)

In [None]:
# Gender-specific median imputation of missing donor height, weight and BMI.
donor_is_male = df["GENDER_DON"] == "M"
donor_is_female = df["GENDER_DON"] == "F"

# DONOR HEIGHT
median_adult_men_donor_height = df.loc[donor_is_male, "HGT_CM_DON_CALC"].astype(float).median()
median_adult_women_donor_height = df.loc[donor_is_female, "HGT_CM_DON_CALC"].astype(float).median()

donor_height_missing = df["HGT_CM_DON_CALC"].isnull()
df.loc[donor_is_male & donor_height_missing, "HGT_CM_DON_CALC"] = median_adult_men_donor_height
df.loc[donor_is_female & donor_height_missing, "HGT_CM_DON_CALC"] = median_adult_women_donor_height

# DONOR WEIGHT
median_adult_men_donor_weight = df.loc[donor_is_male, "WGT_KG_DON_CALC"].astype(float).median()
median_adult_women_donor_weight = df.loc[donor_is_female, "WGT_KG_DON_CALC"].astype(float).median()

donor_weight_missing = df["WGT_KG_DON_CALC"].isnull()
df.loc[donor_is_male & donor_weight_missing, "WGT_KG_DON_CALC"] = median_adult_men_donor_weight
df.loc[donor_is_female & donor_weight_missing, "WGT_KG_DON_CALC"] = median_adult_women_donor_weight

# DONOR BMI
median_adult_men_donor_bmi = df.loc[donor_is_male, "BMI_DON_CALC"].astype(float).median()
# NOTE(review): `men` is a second name for the same value, kept because the
# original cell defined it; no use of it is visible here.
men = median_adult_men_donor_bmi
median_adult_women_donor_bmi = df.loc[donor_is_female, "BMI_DON_CALC"].astype(float).median()

donor_bmi_missing = df["BMI_DON_CALC"].isnull()
# COULDDO: calculate individual BMI instead of using the group median
df.loc[donor_is_male & donor_bmi_missing, "BMI_DON_CALC"] = median_adult_men_donor_bmi
df.loc[donor_is_female & donor_bmi_missing, "BMI_DON_CALC"] = median_adult_women_donor_bmi

In [None]:
# Gender-specific median imputation of missing height, weight and BMI for
# ADULT recipients (AGE_GROUP == 'A').
#
# The medians are computed from the recipient columns of adult recipients of
# the matching gender. Previously they were taken from the donor columns
# (GENDER_DON / *_DON_CALC), so recipients were imputed with donor statistics.
adult_recipient = df['AGE_GROUP'] == 'A'
adult_men = adult_recipient & (df['GENDER'] == 'M')
adult_women = adult_recipient & (df['GENDER'] == 'F')

# RECIPIENT HEIGHT
median_adult_men_recipient_height = df.loc[adult_men, 'HGT_CM_CALC'].astype(float).median()
median_adult_women_recipient_height = df.loc[adult_women, 'HGT_CM_CALC'].astype(float).median()

df.loc[adult_men & df['HGT_CM_CALC'].isnull(), 'HGT_CM_CALC'] = median_adult_men_recipient_height
df.loc[adult_women & df['HGT_CM_CALC'].isnull(), 'HGT_CM_CALC'] = median_adult_women_recipient_height

# RECIPIENT WEIGHT
median_adult_men_recipient_weight = df.loc[adult_men, 'WGT_KG_CALC'].astype(float).median()
median_adult_women_recipient_weight = df.loc[adult_women, 'WGT_KG_CALC'].astype(float).median()

df.loc[adult_men & df['WGT_KG_CALC'].isnull(), 'WGT_KG_CALC'] = median_adult_men_recipient_weight
df.loc[adult_women & df['WGT_KG_CALC'].isnull(), 'WGT_KG_CALC'] = median_adult_women_recipient_weight

# RECIPIENT BMI
median_adult_men_recipient_bmi = df.loc[adult_men, 'BMI_CALC'].astype(float).median()
median_adult_women_recipient_bmi = df.loc[adult_women, 'BMI_CALC'].astype(float).median()

# COULDDO: calculate individual BMI instead of using the group median
df.loc[adult_men & df['BMI_CALC'].isnull(), 'BMI_CALC'] = median_adult_men_recipient_bmi
df.loc[adult_women & df['BMI_CALC'].isnull(), 'BMI_CALC'] = median_adult_women_recipient_bmi

# Rows still missing any of these values (e.g. non-adult recipients, which are
# not imputed above) are removed. Reassigning avoids an in-place drop on a
# frame that may be a selection of another frame.
df = df.dropna(subset=["BMI_CALC", "WGT_KG_CALC", "HGT_CM_CALC", "AGE_DON", "PTIME"])

In [None]:
# Frame dimensions after imputation and row removal.
df.shape

In [None]:
# Turn +/-inf into NaN and renumber the rows 0..n-1.
# reset_index returns a new frame; the original call discarded that result, so
# the index was never reset. drop=True keeps the old index from being added as
# an extra "index" column.
df = df.replace([np.inf, -np.inf], np.nan).reset_index(drop=True)
df.head()

In [None]:
# Per numerical column: name, shape, dtype and number of missing values.
for name in numerical_values:
    series = df[name]
    missing = series.isna().sum()
    print(name, series.shape, series.dtypes, missing)

In [None]:
import numpy as np

# construct Y
from sksurv.util import Surv

# Follow-up time and event indicator taken from the frame.
# The builtin `bool` replaces `np.bool`, which was removed from NumPy (1.24+)
# and raises AttributeError there.
survival_time = df['PTIME'].astype(np.float64)
event = df['PSTATUS'].astype(float).astype(bool)

# Store the boolean event flag back on the frame; assign() returns a new frame
# rather than writing into a possibly shared column.
df = df.assign(PSTATUS=event)

# Structured survival array built from the PSTATUS (event) and PTIME (time)
# columns of the frame.
y = Surv.from_dataframe('PSTATUS', 'PTIME', df)
y

In [None]:
# Second name bound to the same DataFrame object as `df` (no copy is made).
# NOTE(review): no use of `y_oth` is visible in this part of the notebook.
y_oth = df

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sksurv.preprocessing import OneHotEncoder
from sksurv.column import encode_categorical
from sksurv.column import standardize

# Encode the columns named in `categorical_values` with sksurv's
# encode_categorical; all other columns of `df` are passed in as well.
# NOTE(review): presumably a one-hot style encoding that leaves the remaining
# columns untouched — confirm against the installed scikit-survival version.
data_x = encode_categorical(df, categorical_values)

In [None]:
# Build the model input matrix: standardized numerical columns next to the
# encoded categorical columns.
categorical = df[categorical_values]
numerical = df[numerical_values]

numerical_x = standardize(numerical)
categorical_x = encode_categorical(categorical)

# `numerical_values` also lists the outcome columns PTIME and PSTATUS. They
# must not be part of the feature matrix, otherwise the model is fitted on the
# very quantities it is supposed to predict (target leakage).
outcome_columns = ['PTIME', 'PSTATUS']
df_prep = (
    pd.concat([numerical_x, categorical_x], axis=1)
    .drop(columns=outcome_columns, errors='ignore')
)

df_prep

In [None]:
import numpy as np

# construct Y
from sksurv.util import Surv

# Follow-up time and event indicator read from the encoded frame.
# The builtin `bool` replaces `np.bool`, which was removed from NumPy (1.24+)
# and raises AttributeError there.
survival_time = data_x['PTIME'].astype(np.float64)
event = data_x['PSTATUS'].astype(float).astype(bool)

# Structured survival array with fields "Status" (event) and "Days" (time).
y = Surv.from_arrays(event, survival_time, "Status", "Days")

# The outcome columns are removed from data_x once y has been built.
data_x = data_x.drop(['PTIME', 'PSTATUS'], axis=1)


In [None]:
# Dimensions of the encoded frame after the outcome columns were dropped
# (a data_x.head() preview is disabled here).
data_x.shape

In [None]:
# Work on the first 10000 rows only; the same slice is applied to the feature
# matrix and to the survival array so that they stay aligned.
# (Alternative source for the features: data_x instead of df_prep.)
head_rows = slice(None, 10000)

df_prepared_small = df_prep[head_rows]
y_small = y[head_rows]

df_prepared_small

In [None]:
# Sanity check of both fields of the survival array: for each field print
# whether any value is NaN, then whether all values are finite.
for field in ('Status', 'Days'):
    values = y_small[field]
    print(np.any(np.isnan(values)))
    print(np.all(np.isfinite(values)))


In [None]:
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis

# Disabled experiment: an additional standardize() pass over the subset
# (neco = standardize(df_prepared_small)).

# Fit a Coxnet survival model with default constructor arguments on the
# 10000-row subset.
# NOTE(review): the variable is called `cox_ph`, but the estimator created here
# is CoxnetSurvivalAnalysis, not CoxPHSurvivalAnalysis.
cox_ph = CoxnetSurvivalAnalysis()
cox_ph.fit(df_prepared_small, y_small)