### This notebook performs some basic initial data processing on SRTR data.
### Processing tasks in this notebook are as follows:
    -We select candidates who were waitlisted at the age of 18 or older for heart transplants (excluding heart-lung transplants).
    
    -We search for unrealistic outliers in each variable (e.g. negative hemodynamic scores, or physically impossible BMI). 
    
    -Individuals with unrealistic outlier variables are eliminated from the data. 
        *Note that this step is interactive and involves human judgment!*  
        *Cases with outliers are printed to the screen for human decision-making in whether it is acceptable to simply eliminate these cases.*   
        *Any user applying this code to a new dataset or problem should view the notebook output and may need to add additional code to eliminate other cases.*
    
    -Some variables with many possible values in the raw data are condensed into a smaller number of meaningful values. 
        e.g. Many possible ventricular assist devices are described according to only three categories: left ventricular assist device, other ventricular assist device, or none. 
    
    -Physiological variables not directly available in the raw data are derived (glomerular filtration rate, cardiac index). 
    
    -Information on death dates, listing dates, and time of removal from the waitlist are used to derive the "time" variable for survival analysis algorithms. 
    
    -Information on death dates and transplants are used to derive the "status" variable for survival analysis. 
    




In [1]:
#Import necessary libraries
import datetime
import math
import os

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import iqr
from scipy.stats import ranksums

In [2]:
#Import data in csv form
#The location defaults to the original author's path; set the SRTR_CAND_THOR_CSV
#environment variable to point the notebook at the file on another machine.
DATA_PATH = os.environ.get('SRTR_CAND_THOR_CSV', '/Users/kevinz94/Desktop/HeartDataPipeline/cand_thor_selected.csv')

df = pd.read_csv(DATA_PATH)

In [3]:
#Blank csv cells are already read in as NaN or NaT
#Cells entered as the literal string "." also denote missing data, so they are mapped to NaN here
#The later datetime conversion turns NaN into NaT as appropriate

missing_marker = '.'
df = df.replace(missing_marker, np.nan)

In [4]:
#Convert date variables to datetime form (the csv stores dates as mm/dd/yy)
date_columns = ['CAN_REM_DT', 'PERS_OPTN_DEATH_DT', 'PERS_RESTRICT_DEATH_DT', 'PERS_SSA_DEATH_DT',
                'CAN_LAST_ACT_STAT_DT', 'CAN_LAST_INACT_STAT_DT', 'CAN_LISTING_DT']
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column],format="%m/%d/%y")

#Create variable that is just year of listing
df['listing_year'] = pd.DatetimeIndex(df['CAN_LISTING_DT']).year

#Choose the window of times for listing that we want to work with
#Two windows are kept: cutoff1-cutoff2 and cutoff3-cutoff4 (both ends inclusive)
cutoff1 = pd.to_datetime('01/01/2010', format='%m/%d/%Y')
cutoff2 = pd.to_datetime('12/31/2017', format='%m/%d/%Y')
cutoff3 = pd.to_datetime('11/01/2018', format='%m/%d/%Y')
cutoff4 = pd.to_datetime('03/01/2020', format='%m/%d/%Y')
condition_time1 = (df['CAN_LISTING_DT'] >= cutoff1) & (df['CAN_LISTING_DT'] <= cutoff2)
condition_time2 = (df['CAN_LISTING_DT'] >= cutoff3) & (df['CAN_LISTING_DT'] <= cutoff4)
condition_time = condition_time1 | condition_time2
#.copy() makes the filtered frame independent of the unfiltered one, so the column
#assignment below does not write into a slice (pandas' SettingWithCopyWarning)
df = df[condition_time].copy()

#Create age variable in years (rounding down to integer value)
df['age'] = np.floor(df['CAN_AGE_IN_MONTHS_AT_LISTING']/12)

#Eliminate candidates under the age of 18 at listing
condition_choose =  (df['age'] >= 18)
df = df[condition_choose]

#Eliminate all but one entry for people listed twice
#NOTE(review): this keeps the first row per PX_ID; confirm that PX_ID (rather than a
#person-level identifier) is the right key for "people listed twice"
df = df.drop_duplicates(subset=['PX_ID'])



In [5]:
#Tally the remaining candidates by waitlisted organ type
wl_org_counts = df['WL_ORG'].value_counts()
wl_org_counts

HR    32891
LU    24344
HL      418
Name: WL_ORG, dtype: int64

In [6]:
#For Powerpoint flow chart: number of heart (HR) plus heart-lung (HL) candidates
#Computed from the data rather than from hand-copied counts, so it stays correct if the input changes
int(df['WL_ORG'].isin(['HR', 'HL']).sum())

33309

In [7]:
#Keep only heart listings (WL_ORG == 'HR'); lung and heart-lung listings are dropped
condition = df['WL_ORG'].eq('HR')
df = df.loc[condition]

In [8]:
#Display any cases whose age at listing is implausibly high (more than 1200 months, i.e. 100 years)

check_age = df.loc[df['CAN_AGE_IN_MONTHS_AT_LISTING'].gt(1200)]
check_age

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [9]:
#Display any cases whose recorded height is implausibly small (under 30 cm)

check_short = df.loc[df['CAN_HGT_CM'].lt(30)]
check_short


Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age
20882,1277322,HR,F,O,16.0,NaT,NaT,NaT,2019-05-24,NaT,...,,N,714.0,2120.0,2019-06-12,2021-03-02,1.0,3.0,2019.0,59.0
21584,1278646,HR,F,O,8.0,NaT,NaT,NaT,2019-06-04,NaT,...,,N,341.0,2160.0,2020-07-27,2021-03-02,1.0,2.0,2019.0,28.0
101864,1278785,HR,M,A,8.0,NaT,NaT,NaT,2019-06-04,2019-06-12,...,,Y,594.0,2120.0,2019-06-12,NaT,1.0,3.0,2019.0,49.0
102031,1281255,HR,M,O,8.0,NaT,NaT,NaT,2019-06-18,2019-06-21,...,,N,716.0,2120.0,2019-06-21,NaT,1.0,1.0,2019.0,59.0
103147,1282871,HR,M,A,8.0,NaT,NaT,NaT,2019-06-26,2019-08-22,...,,N,705.0,2120.0,2019-08-22,2019-08-07,1.0,1.0,2019.0,58.0
112365,1321330,HR,M,O,8.0,NaT,NaT,NaT,2020-01-28,2021-02-28,...,,N,675.0,2160.0,2021-02-28,NaT,1.0,1.0,2020.0,56.0


In [10]:
#Drop the implausibly short cases; rows with a missing height are kept

condition = df['CAN_HGT_CM'].ge(30) | df['CAN_HGT_CM'].isna()
df = df.loc[condition]


In [11]:
#Display any cases whose recorded height is implausibly large (over 250 cm)

check_tall = df.loc[df['CAN_HGT_CM'].gt(250)]
check_tall

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [12]:
#Display any cases whose recorded weight is implausibly low (under 15 kg)

check_light = df.loc[df['CAN_WGT_KG'].lt(15)]
check_light

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [13]:
#Drop cases with an implausibly low weight; rows with a missing weight are kept
#Candidates under 18 were already excluded by the age filter earlier in this notebook

condition = df['CAN_WGT_KG'].ge(15) | df['CAN_WGT_KG'].isna()
df = df.loc[condition]

In [14]:
#Display any cases whose recorded weight is implausibly high (over 450 kg)

check_heavy = df.loc[df['CAN_WGT_KG'].gt(450)]
check_heavy

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [15]:
#Display any cases whose recorded BMI is implausibly low (under 5)

check_low_BMI = df.loc[df['CAN_BMI'].lt(5)]
check_low_BMI

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [16]:
#Display any cases whose recorded BMI is implausibly high (over 100)

check_high_BMI = df.loc[df['CAN_BMI'].gt(100)]
check_high_BMI

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age
24834,1174043,HR,M,A,16.0,NaT,NaT,NaT,2017-10-05,2017-11-01,...,,N,577.0,2020.0,2017-10-10,2017-11-01,1.0,3.0,2017.0,48.0
76591,1005342,HR,M,A,8.0,NaT,NaT,NaT,2014-11-21,2014-12-05,...,,Y,279.0,2030.0,2014-12-05,NaT,1.0,3.0,2014.0,23.0


In [17]:
#Drop cases with an implausibly high BMI; rows with a missing BMI are kept

condition = df['CAN_BMI'].le(100) | df['CAN_BMI'].isna()
df = df.loc[condition]

In [18]:
#Display any cases with a negative cardiac output

check_cardiac = df.loc[df['CAN_CARDIAC_OUTPUT'].lt(0)]
check_cardiac

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [19]:
#Display any cases whose most recent creatinine value is implausibly low (under 0.1)

check_creat = df.loc[df['CAN_MOST_RECENT_CREAT'].lt(0.1)]
check_creat

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age
2100,1304376,HR,M,A,16.0,NaT,NaT,NaT,2019-10-22,NaT,...,,N,301.0,2140.0,2020-09-03,2020-11-22,2.0,4.0,2019.0,25.0
87906,1130847,HR,M,A,8.0,NaT,NaT,NaT,2017-01-27,2017-02-09,...,,N,656.0,2020.0,2017-02-09,NaT,2.0,3.0,2017.0,54.0


In [20]:
#Drop cases with an implausibly low creatinine value; rows with a missing value are kept

condition = df['CAN_MOST_RECENT_CREAT'].ge(0.1) | df['CAN_MOST_RECENT_CREAT'].isna()
df = df.loc[condition]

In [21]:
#Display any cases with a negative pulmonary capillary wedge pressure

check_pcw = df.loc[df['CAN_PCW_MEAN'].lt(0)]
check_pcw

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [22]:
#Display any cases with a negative pulmonary artery mean pressure

check_pulm = df.loc[df['CAN_PULM_ART_MEAN'].lt(0)]
check_pulm

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [23]:
#Display anyone recorded with blood type Z (which, per the original note, means in utero)
check_blood = df.loc[df['CAN_ABO'].eq('Z')]
check_blood

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,CAN_OTHER_TOBACCO_USE,CAN_MALIG,CAN_AGE_IN_MONTHS_AT_LISTING,CAN_INIT_STAT,CAN_LAST_ACT_STAT_DT,CAN_LAST_INACT_STAT_DT,CAN_VAD_TY,CAN_PRIMARY_PAY,listing_year,age


In [24]:
#Race variable one-hot encoding
#Each CAN_RACE code listed here gets its own 0/1 indicator column

race_codes = {'race_white': 8, 'race_black': 16, 'race_asian': 64, 'race_hispanic': 2000}
for race_column, race_code in race_codes.items():
    df = df.assign(**{race_column: df['CAN_RACE'].eq(race_code).astype('int64')})

#race_other is 1 for every row that matched none of the codes above (this includes a missing CAN_RACE)
condition = df[list(race_codes)].eq(0).all(axis=1)
df = df.assign(race_other=condition.astype('int64'))

In [25]:
#Cardiac index is cardiac output divided by body surface area (BSA), i.e. output per square meter of BSA
#BSA comes from Mosteller's formula: sqrt(weight in kg * height in cm / 3600)

body_surface_area = ((df["CAN_WGT_KG"]*df["CAN_HGT_CM"])/3600)**0.5
df['cardiac_index'] = df["CAN_CARDIAC_OUTPUT"]/body_surface_area

In [26]:
#Re-code variables recorded as Y/N/U/NaN into 0/1 indicators: 'Y' becomes 1, anything else becomes 0
#Keys are the new indicator columns, values are the raw columns they are derived from

yes_no_columns = {
    'defib': 'CAN_IMPLANT_DEFIB',
    'periph_vasc': 'CAN_PERIPH_VASC',
    'hypertension': 'CAN_DRUG_TREAT_HYPERTEN',
    'cereb_vasc': 'CAN_CEREB_VASC',
    'anti_arrythm': 'CAN_ANTI_ARRYTHM',
    'malig': 'CAN_MALIG',
}
for flag_column, source_column in yes_no_columns.items():
    df = df.assign(**{flag_column: df[source_column].eq('Y').astype('int64')})


In [27]:
#Indicator for any type of diabetes: CAN_DIAB_TY codes above 1 and up to 5 count as diabetic

diabetes_type = df['CAN_DIAB_TY']
condition_diabetes = diabetes_type.gt(1) & diabetes_type.le(5)
df = df.assign(diabetes=condition_diabetes.astype('int64'))

In [28]:
#Create glomerular filt variable using 4-variable MDRD equation
#From the paper "Using Standardized Serum Creatinine Values in the Modification of Diet in Renal Disease Study Equation for Estimating Glomerular Filtration Rate"
#Ann Intern Med. 2006 Aug 15;145(4):247-54.

#The two multipliers are built directly as float columns. Initialising them as the integer 1
#and then writing 1.212 / 0.742 through .loc relies on a silent int-to-float upcast that
#pandas has deprecated.

#Multiplier is 1.212 if the patient is black, 1 otherwise
condition_black = (df['race_black'] == 1)
#Multiplier is 0.742 if the patient is female, 1 otherwise
condition_female = (df['CAN_GENDER'] == 'F')

df = df.assign(
    black_factor=np.where(condition_black, 1.212, 1.0),
    female_factor=np.where(condition_female, 0.742, 1.0),
)

df['gfr']= 175 * df["CAN_MOST_RECENT_CREAT"]**(-1.154) * df["age"]**(-0.203) * df["black_factor"] * df["female_factor"]

In [29]:
#Eliminate people with GFR > 150; rows with a missing GFR are kept

condition = df['gfr'].le(150) | df['gfr'].isna()

#Report how many cases are about to be cut
print(int((~condition).sum()))

#Keep only the rows that pass the condition
df = df.loc[condition]


472


In [30]:
#Indicator for any tobacco use: cigarette history or other tobacco use recorded as 'Y'

condition_tobacco = df['CAN_HIST_CIGARETTE'].eq('Y') | df['CAN_OTHER_TOBACCO_USE'].eq('Y')
df = df.assign(tobacco=condition_tobacco.astype('int64'))

In [31]:
#One-hot encoding of blood type, aggregating the CAN_ABO codes into four categories
#Keys are the new indicator columns, values are the CAN_ABO codes folded into each one

blood_groups = {
    'blood_type_A': ['A', 'A1', 'A2'],
    'blood_type_AB': ['AB', 'A1B', 'A2B'],
    'blood_type_B': ['B'],
    'blood_type_O': ['O'],
}
for blood_column, abo_codes in blood_groups.items():
    df = df.assign(**{blood_column: df['CAN_ABO'].isin(abo_codes).astype('int64')})



In [32]:
#One-hot encoding for diagnosis, aggregating the CAN_DGN codes into seven categories

diagnosis = df['CAN_DGN']
diagnosis_groups = {
    #Dilated cardiomyopathy: 1000-1049, except 1007 which is counted as ischemic
    'diag_dilated_CM': diagnosis.between(1000, 1049) & diagnosis.ne(1007),
    #Restricted myopathy: 1050-1099
    'diag_restricted': diagnosis.between(1050, 1099),
    #Hypertrophic
    'diag_hypertrophic': diagnosis.eq(1201),
    #Valvular
    'diag_valvular': diagnosis.eq(1202),
    #Congenital: 1203 and 1205-1209
    'diag_congenital': diagnosis.eq(1203) | diagnosis.between(1205, 1209),
    #Ischemic
    'diag_ischemic': diagnosis.eq(1200) | diagnosis.eq(1007),
}
for diagnosis_column, diagnosis_mask in diagnosis_groups.items():
    df = df.assign(**{diagnosis_column: diagnosis_mask.astype('int64')})

#Everything that matched none of the categories above (including a missing CAN_DGN)
condition_OTHER = df[list(diagnosis_groups)].eq(0).all(axis=1)
df = df.assign(diag_OTHER=condition_OTHER.astype('int64'))


In [33]:
#One-hot coding for ventricular assist devices (VADs): no_vad, lvad, other_vad

vad_type = df['CAN_VAD_TY']
vad_brand = df['CAN_VAD1']

#no_vad: CAN_VAD_TY equal to 1 ("NONE" in the documentation) or missing
condition_NO_VAD = vad_type.eq(1) | vad_type.isna()

#lvad: CAN_VAD1 holds a code corresponding to a left ventricular assist device
#The list is long because some LVAD brands are listed more than once with different numerical codes
lvad_brand_codes = [205, 206, 208, 210, 216, 217, 223, 224, 230, 231, 232, 233, 236,
                    305, 306, 312, 313, 314, 316, 319, 324, 325, 326, 327, 330]
condition_LVAD = vad_brand.isin(lvad_brand_codes)

#Also lvad: listed as having an LVAD (CAN_VAD_TY == 2) but the brand is 999 or missing
condition_LVAD2 = vad_type.eq(2) & (vad_brand.eq(999) | vad_brand.isna())

#NOTE(review): no_vad and lvad are set independently, so a row can carry both flags
df = df.assign(
    no_vad=condition_NO_VAD.astype('int64'),
    lvad=(condition_LVAD | condition_LVAD2).astype('int64'),
)

#other_vad: anything that fit neither of the two categories above
condition_OTHER_VAD = df['no_vad'].eq(0) & df['lvad'].eq(0)
df = df.assign(other_vad=condition_OTHER_VAD.astype('int64'))

In [34]:
#death_date is the earliest of the three death dates that can be included in the SRTR database
death_date_columns = ['PERS_OPTN_DEATH_DT','PERS_RESTRICT_DEATH_DT','PERS_SSA_DEATH_DT']

#censoring_date is the latest of the three removal/status dates that can be included in the SRTR database
removal_date_columns = ['CAN_LAST_ACT_STAT_DT','CAN_LAST_INACT_STAT_DT','CAN_REM_DT']

#The time and event variables start at 0 and are filled in by the next cell
df = df.assign(
    death_date=df[death_date_columns].min(axis=1),
    censoring_date=df[removal_date_columns].max(axis=1),
    time=0,
    transplanted=0,
    dead=0,
    died_noT=0,
)


In [35]:
#Identify candidates who received transplants, flag deaths, and derive the event and time variables

#Several CAN_REM_CD values correspond to different transplant situations
transplant_removal_codes = [4, 14, 18, 19, 21, 22]
condition_transplanted = df['CAN_REM_CD'].isin(transplant_removal_codes)
df.loc[condition_transplanted,'transplanted'] = 1

#This identifies everyone who has died, with or without transplant (any of the three death dates present)
condition_dead = df[['PERS_SSA_DEATH_DT','PERS_OPTN_DEATH_DT','PERS_RESTRICT_DEATH_DT']].notna().any(axis=1)
df.loc[condition_dead,'dead'] = 1

#This defines our event variable, died_noT: died without having been transplanted
condition_died = (df['transplanted']==0) & (df['dead']==1)
df.loc[condition_died, 'died_noT'] = 1

#Everyone who does not have died_noT=1 is censored
condition_censored = (df['died_noT']==0)

#time is listing-to-death for died_noT=1 and listing-to-censoring for everyone else.
#The column is built in one step as a timedelta column. Writing Timedelta values through .loc
#into the integer-initialised 'time' column relies on a dtype upcast that pandas has deprecated
#and leaves an object column.
time_to_death = df['death_date']-df['CAN_LISTING_DT']
time_to_censoring = df['censoring_date']-df['CAN_LISTING_DT']
df['time'] = time_to_death.where(condition_died, time_to_censoring)




In [36]:
#Display any cases with bad data for death date (death occurring prior to listing date)

check_time = df.loc[df['death_date'].lt(df['CAN_LISTING_DT'])]
check_time

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,diag_OTHER,no_vad,lvad,other_vad,death_date,censoring_date,time,transplanted,dead,died_noT
32328,1003837,HR,M,B,16.0,NaT,2014-10-10,NaT,2014-11-12,2014-11-12,...,1,1,0,0,2014-10-10,2014-11-12,-33 days +00:00:00,0,1,1


In [37]:
#Drop any cases with bad data for death date (death occurring prior to listing date)

#The comparison is False wherever death_date is NaT (everyone that didn't die),
#so negating it keeps those rows instead of selecting on NaT values
condition_dont_choose = df['death_date'].lt(df['CAN_LISTING_DT'])
df = df.loc[~condition_dont_choose]

In [38]:
#Display any cases with bad data for censoring date (censoring occurring prior to listing date)

check_time = df.loc[df['censoring_date'].lt(df['CAN_LISTING_DT'])]
check_time

Unnamed: 0,PX_ID,WL_ORG,CAN_GENDER,CAN_ABO,CAN_RACE,PERS_SSA_DEATH_DT,PERS_OPTN_DEATH_DT,PERS_RESTRICT_DEATH_DT,CAN_LISTING_DT,CAN_REM_DT,...,diag_OTHER,no_vad,lvad,other_vad,death_date,censoring_date,time,transplanted,dead,died_noT


In [39]:
#Cut people in the later cohort (listed between cutoff3 and cutoff4) who are missing a 6-status ranking
before_cutting = len(df)
condition_later = df['CAN_LISTING_DT'].between(cutoff3, cutoff4)
#A valid 6-status value is above 2100 and at most 2160
condition_has_status = df['CAN_INIT_STAT'].gt(2100) & df['CAN_INIT_STAT'].le(2160)
condition_cut = condition_later & ~condition_has_status
df = df.loc[~condition_cut]
#Number of cases removed by this step
number_cut_stat = before_cutting - len(df)
number_cut_stat

114

In [40]:
#Split data into earlier and later cohorts
#The listing windows (cutoff1-cutoff2 and cutoff3-cutoff4) were set earlier in the file and are reused here.
#Re-declaring cutoff1/cutoff2 with the later-cohort dates would duplicate the dates and silently
#change what those names mean for any cell run afterwards.

#Earlier cohort: listed between cutoff1 and cutoff2 (inclusive)
condition_time = (df['CAN_LISTING_DT'] >= cutoff1) & (df['CAN_LISTING_DT'] <= cutoff2)
df_early = df[condition_time]

#Later cohort: listed between cutoff3 and cutoff4 (inclusive)
condition_time = (df['CAN_LISTING_DT'] >= cutoff3) & (df['CAN_LISTING_DT'] <= cutoff4)
df_later = df[condition_time]

In [41]:
#Convert the time variable into an integer number of days for both cohorts
#This is done with one vectorised helper instead of a row-by-row loop per cohort,
#so the new column t gets a numeric dtype rather than object

def _timedelta_to_days(durations):
    """Return the whole-day component of each duration.

    Parameters
    ----------
    durations : pandas.Series
        Values that ``pd.to_timedelta`` accepts (e.g. Timedelta objects).

    Returns
    -------
    pandas.Series
        Same index as ``durations``. Integer days, or float with NaN if any
        duration is missing (NaT).
    """
    return pd.to_timedelta(durations).dt.days

#Early cohort
for_np_early = df_early[["died_noT","time"]]
days_early = _timedelta_to_days(for_np_early["time"])
#time_early keeps the (died_noT, days) array layout in case later cells use it
time_early = for_np_early.to_numpy()
time_early[:,1] = days_early.to_numpy()

#Later cohort
for_np_later = df_later[["died_noT","time"]]
days_later = _timedelta_to_days(for_np_later["time"])
time_later = for_np_later.to_numpy()
time_later[:,1] = days_later.to_numpy()

#Add the newly processed time variable to the df_early and df_later Pandas dataframes
df_early = df_early.assign(t=days_early)
df_later = df_later.assign(t=days_later)

In [42]:
#Count how many later-cohort candidates have each initial 6-status value (for a plot later)
init_status_counts = df_later['CAN_INIT_STAT'].value_counts()
init_status_counts

2140.0    2011
2160.0    1058
2120.0    1019
2130.0     643
2110.0     212
2150.0     151
Name: CAN_INIT_STAT, dtype: int64