### This notebook calculates descriptive statistics for each variable.
> Categorical variables are described using the total count and percentage of individuals with that characteristic. <br>
> Continuous variables are described using the median (50th percentile), 25th percentile, and 75th percentile. <br>
> To compare the early cohort to the later cohort, Wilcoxon rank-sum tests are performed for continuous variables. <br>
> For categorical variables, chi-squared tests are used to compare the two cohorts. <br>

### *Before running this notebook, it is necessary to run Survival_Data_Initial_Processing_UC.ipynb*


In [1]:
#Run data processing notebook
#This notebook imports nothing itself: the later cells use df, df_early, df_later, np, pd,
#chi2_contingency and ranksums, so they are presumably all defined by the notebook run here
#NOTE(review): the path below is absolute and machine-specific - it must be edited before running elsewhere
%run /Users/kevinz94/Desktop/HeartDataPipeline/Survival_Data_Initial_Processing_UC.ipynb

472


In [2]:
#Diagnosis: build the 2 x 7 contingency table (diag_ct) of patient counts per diagnosis

#Every diagnosis column holds 1 for patients with that diagnosis and 0 otherwise,
#so summing a column gives the number of patients with that diagnosis
#Table columns, in order:
#  diag_dilated_CM   - nonischemic dilated cardiomyopathy
#  diag_ischemic     - ischemic cardiomyopathy
#  diag_congenital   - congenital heart disease
#  diag_restricted   - restrictive cardiomyopathy
#  diag_hypertrophic - hypertrophic cardiomyopathy
#  diag_valvular     - valvular heart disease
#  diag_OTHER        - other diagnosis
#Table rows: row 0 is the early cohort, row 1 is the later cohort
diag_cols = ["diag_dilated_CM","diag_ischemic","diag_congenital","diag_restricted","diag_hypertrophic","diag_valvular","diag_OTHER"]
cohort_labels = ["early","later"]

diag_ct = np.array(
    [[cohort[col].sum() for col in diag_cols] for cohort in (df_early, df_later)],
    dtype=float,
)

#Write the counts to csv
total_diag = pd.DataFrame(diag_ct, index=cohort_labels, columns=diag_cols)
total_diag.to_csv('total_diagnosis.csv')

#Express each count as a percentage of its row total and write to csv
row_totals = diag_ct.sum(axis=1, keepdims=True)
perc = 100 * (diag_ct / row_totals)
perc_diag = pd.DataFrame(perc, index=cohort_labels, columns=diag_cols)
perc_diag.to_csv('percentages_diagnosis.csv')

#Chi-squared test of the table; element 1 of the result is the p-value, stored as a 1 x 1 table and written to csv
diag_p = np.array([[chi2_contingency(diag_ct)[1]]])
p_val_diag = pd.DataFrame(diag_p, columns=['p-value of diagnosis'])
p_val_diag.to_csv('p_val_diagnosis.csv')



In [3]:
#Diabetes: build the 2 x 2 contingency table (diab_ct) of patients with and without diabetes

#Rows: early cohort, then later cohort
#Columns: patients with diabetes, then patients without
#"diabetes" is 1 for a diabetic patient and 0 otherwise, so its sum counts the diabetic patients;
#the rest of the cohort is counted as not diabetic
#NOTE(review): a missing value in "diabetes" would also end up in the "not diabetic" count - confirm there are none
diab_cols = ["diabetic","not_diabetic"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
diab_yes = np.array([df_early["diabetes"].sum(), df_later["diabetes"].sum()], dtype=float)
diab_ct = np.column_stack((diab_yes, cohort_sizes - diab_yes))

#Write the counts to csv
total_diab = pd.DataFrame(diab_ct, index=cohort_labels, columns=diab_cols)
total_diab.to_csv('total_diabetes.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (diab_ct / diab_ct.sum(axis=1, keepdims=True))
perc_diab = pd.DataFrame(perc, index=cohort_labels, columns=diab_cols)
perc_diab.to_csv('percentages_diabetes.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
diab_p = np.array([[chi2_contingency(diab_ct)[1]]])
p_val_diab = pd.DataFrame(diab_p, columns=['p-value of diabetes'])
p_val_diab.to_csv('p_val_diabetes.csv')

In [4]:
#Ventricular assist devices: build the 2 x 3 contingency table (vad_ct)

#Each treatment column holds 1 for patients with that treatment and 0 otherwise,
#so summing a column gives the number of patients with that treatment
#Table columns, in order: LVAD, other VAD, no VAD
#Table rows: row 0 is the early cohort, row 1 is the later cohort
vad_cols = ["lvad","other_vad","no_vad"]
cohort_labels = ["early","later"]

vad_ct = np.array(
    [[cohort[col].sum() for col in vad_cols] for cohort in (df_early, df_later)],
    dtype=float,
)

#Write the counts to csv
total_vad = pd.DataFrame(vad_ct, index=cohort_labels, columns=vad_cols)
total_vad.to_csv('total_vad.csv')

#Express each count as a percentage of its row total and write to csv
row_totals = vad_ct.sum(axis=1, keepdims=True)
perc = 100 * (vad_ct / row_totals)
perc_vad = pd.DataFrame(perc, index=cohort_labels, columns=vad_cols)
perc_vad.to_csv('percentages_vad.csv')

#Chi-squared test of the table; element 1 of the result is the p-value
vad_p = np.array([[chi2_contingency(vad_ct)[1]]])
p_val_vad = pd.DataFrame(vad_p, columns=['p-value of VADs'])
p_val_vad.to_csv('p_val_vad.csv')

In [5]:
#History of malignancy: build the 2 x 2 contingency table (malig_ct)

#Rows: early cohort, then later cohort
#Columns: patients with a history of malignancy, then patients without
#"malig" is 1 for malignancy and 0 otherwise, so its sum counts the patients with a history of malignancy;
#the rest of the cohort is counted as having no malignancy
#NOTE(review): a missing value in "malig" would also end up in the "no malignancy" count - confirm there are none

#One shared list of labels for both csv files: previously the counts file used "no_malignancy"
#while the percentages file used "no malignancy", so the two headers did not match
malig_cols = ["malignancy","no_malignancy"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
malig_yes = np.array([df_early["malig"].sum(), df_later["malig"].sum()], dtype=float)
malig_ct = np.column_stack((malig_yes, cohort_sizes - malig_yes))

#Write the counts to csv
total_malig = pd.DataFrame(malig_ct, index=cohort_labels, columns=malig_cols)
total_malig.to_csv('total_malig.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (malig_ct / malig_ct.sum(axis=1, keepdims=True))
perc_malig = pd.DataFrame(perc, index=cohort_labels, columns=malig_cols)
perc_malig.to_csv('percentages_malignancy.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
malig_p = np.array([[chi2_contingency(malig_ct)[1]]])
p_val_malig = pd.DataFrame(malig_p, columns=['p-value of malignancy'])
p_val_malig.to_csv('p_val_malignancy.csv')

In [6]:
#History of tobacco use: build the 2 x 2 contingency table (tobac_ct)

#Rows: early cohort, then later cohort
#Columns: patients with a history of tobacco use, then patients without
#"tobacco" is 1 for tobacco use and 0 otherwise, so its sum counts the tobacco users;
#the rest of the cohort is counted as non-users
#NOTE(review): a missing value in "tobacco" would also end up in the "no tobacco" count - confirm there are none
tobac_cols = ["tobacco","no_tobacco"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
tobac_yes = np.array([df_early["tobacco"].sum(), df_later["tobacco"].sum()], dtype=float)
tobac_ct = np.column_stack((tobac_yes, cohort_sizes - tobac_yes))

#Write the counts to csv
total_tobac = pd.DataFrame(tobac_ct, index=cohort_labels, columns=tobac_cols)
total_tobac.to_csv('total_tobacco.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (tobac_ct / tobac_ct.sum(axis=1, keepdims=True))
perc_tobac = pd.DataFrame(perc, index=cohort_labels, columns=tobac_cols)
perc_tobac.to_csv('percentages_tobacco.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
tobac_p = np.array([[chi2_contingency(tobac_ct)[1]]])
p_val_tobac = pd.DataFrame(tobac_p, columns=['p-value of tobacco'])
p_val_tobac.to_csv('p_val_tobacco.csv')

In [7]:
#Blood type: build the 2 x 4 contingency table (blood_ct) of patient counts per blood type

#Each blood type column holds 1 for patients with that type and 0 otherwise,
#so summing a column gives the number of patients with that blood type
#Table columns, in order: A, AB, B, O
#Table rows: row 0 is the early cohort, row 1 is the later cohort
blood_cols = ["blood_type_A","blood_type_AB","blood_type_B","blood_type_O"]
cohort_labels = ["early","later"]

blood_ct = np.array(
    [[cohort[col].sum() for col in blood_cols] for cohort in (df_early, df_later)],
    dtype=float,
)

#Write the counts to csv
total_blood = pd.DataFrame(blood_ct, index=cohort_labels, columns=blood_cols)
total_blood.to_csv('total_bloodtype.csv')

#Express each count as a percentage of its row total and write to csv
row_totals = blood_ct.sum(axis=1, keepdims=True)
perc = 100 * (blood_ct / row_totals)
perc_blood = pd.DataFrame(perc, index=cohort_labels, columns=blood_cols)
perc_blood.to_csv('percentages_bloodtype.csv')

#Chi-squared test of the table; element 1 of the result is the p-value
blood_p = np.array([[chi2_contingency(blood_ct)[1]]])
p_val_blood = pd.DataFrame(blood_p, columns=['p-value of blood type'])
p_val_blood.to_csv('p_val_bloodtype.csv')




In [8]:
#IABP treatment: build the 2 x 2 contingency table (iabp_ct)

#Rows: early cohort, then later cohort
#Columns: patients with IABP treatment, then patients without
#"CAN_IABP" is 1 for IABP and 0 otherwise, so its sum counts the treated patients;
#the rest of the cohort is counted as untreated
#NOTE(review): a missing value in "CAN_IABP" would also end up in the "No_IABP" count - confirm there are none
iabp_cols = ["IABP","No_IABP"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
iabp_yes = np.array([df_early["CAN_IABP"].sum(), df_later["CAN_IABP"].sum()], dtype=float)
iabp_ct = np.column_stack((iabp_yes, cohort_sizes - iabp_yes))

#Write the counts to csv
total_iabp = pd.DataFrame(iabp_ct, index=cohort_labels, columns=iabp_cols)
total_iabp.to_csv('total_IABP.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (iabp_ct / iabp_ct.sum(axis=1, keepdims=True))
perc_iabp = pd.DataFrame(perc, index=cohort_labels, columns=iabp_cols)
perc_iabp.to_csv('percentages_IABP.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
iabp_p = np.array([[chi2_contingency(iabp_ct)[1]]])
p_val_iabp = pd.DataFrame(iabp_p, columns=['p-value of IABP'])
p_val_iabp.to_csv('p_val_IABP.csv')

In [9]:
#IV inotrope treatment: build the 2 x 2 contingency table (inotrop_ct)

#Rows: early cohort, then later cohort
#Columns: patients with IV inotrope treatment, then patients without
#"CAN_IV_INOTROP" is 1 for inotropes and 0 otherwise, so its sum counts the treated patients;
#the rest of the cohort is counted as untreated
#NOTE(review): a missing value in "CAN_IV_INOTROP" would also end up in the "No_Inotropes" count - confirm there are none

#One shared list of labels for both csv files: previously the counts file header was misspelled
#"Intropes" while the percentages file used "Inotropes"
inotrop_cols = ["Inotropes","No_Inotropes"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
inotrop_yes = np.array([df_early["CAN_IV_INOTROP"].sum(), df_later["CAN_IV_INOTROP"].sum()], dtype=float)
inotrop_ct = np.column_stack((inotrop_yes, cohort_sizes - inotrop_yes))

#Write the counts to csv
total_inotrop = pd.DataFrame(inotrop_ct, index=cohort_labels, columns=inotrop_cols)
total_inotrop.to_csv('total_inotropes.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (inotrop_ct / inotrop_ct.sum(axis=1, keepdims=True))
perc_inotrop = pd.DataFrame(perc, index=cohort_labels, columns=inotrop_cols)
perc_inotrop.to_csv('percentages_inotropes.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
inotrop_p = np.array([[chi2_contingency(inotrop_ct)[1]]])
p_val_inotrop = pd.DataFrame(inotrop_p, columns=['p-value of IV Inotropes'])
p_val_inotrop.to_csv('p_val_inotropes.csv')

In [10]:
#ECMO treatment: build the 2 x 2 contingency table (ecmo_ct)

#Rows: early cohort, then later cohort
#Columns: patients with ECMO treatment, then patients without
#"CAN_ECMO" is 1 for ECMO and 0 otherwise, so its sum counts the treated patients;
#the rest of the cohort is counted as untreated
#NOTE(review): a missing value in "CAN_ECMO" would also end up in the "No_ECMO" count - confirm there are none
ecmo_cols = ["ECMO","No_ECMO"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
ecmo_yes = np.array([df_early["CAN_ECMO"].sum(), df_later["CAN_ECMO"].sum()], dtype=float)
ecmo_ct = np.column_stack((ecmo_yes, cohort_sizes - ecmo_yes))

#Write the counts to csv
total_ecmo = pd.DataFrame(ecmo_ct, index=cohort_labels, columns=ecmo_cols)
total_ecmo.to_csv('total_ECMO.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (ecmo_ct / ecmo_ct.sum(axis=1, keepdims=True))
perc_ecmo = pd.DataFrame(perc, index=cohort_labels, columns=ecmo_cols)
perc_ecmo.to_csv('percentages_ECMO.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
ecmo_p = np.array([[chi2_contingency(ecmo_ct)[1]]])
p_val_ecmo = pd.DataFrame(ecmo_p, columns=['p-value of ECMO'])
p_val_ecmo.to_csv('p_val_ECMO.csv')

In [11]:
#History of cerebral vascular disease: build the 2 x 2 contingency table (cereb_ct)

#Rows: early cohort, then later cohort
#Columns: patients with a history of cerebral vascular disease, then patients without
#"cereb_vasc" is 1 for cerebral vascular disease and 0 otherwise, so its sum counts the affected patients;
#the rest of the cohort is counted as unaffected
#NOTE(review): a missing value in "cereb_vasc" would also end up in the "No_..." count - confirm there are none
cereb_cols = ["Cerebral_Vascular_Disease","No_Cerebral_Vascular_Disease"]
cohort_labels = ["early","later"]

cohort_sizes = np.array([df_early.shape[0], df_later.shape[0]], dtype=float)
cereb_yes = np.array([df_early["cereb_vasc"].sum(), df_later["cereb_vasc"].sum()], dtype=float)
cereb_ct = np.column_stack((cereb_yes, cohort_sizes - cereb_yes))

#Write the counts to csv
total_cereb = pd.DataFrame(cereb_ct, index=cohort_labels, columns=cereb_cols)
total_cereb.to_csv('total_cerebral_vascular.csv')

#Express each count as a percentage of its row total and write to csv
perc = 100 * (cereb_ct / cereb_ct.sum(axis=1, keepdims=True))
perc_cereb = pd.DataFrame(perc, index=cohort_labels, columns=cereb_cols)
perc_cereb.to_csv('percentages_cerebral_vascular.csv')

#Chi-squared test; element 1 of the result is the p-value
#(for a 2 x 2 table scipy applies Yates' continuity correction by default)
cereb_p = np.array([[chi2_contingency(cereb_ct)[1]]])
p_val_cereb = pd.DataFrame(cereb_p, columns=['p-value of cerebral vascular disease'])
p_val_cereb.to_csv('p_val_cerebral_vascular.csv')

In [12]:
#Descriptive statistics for age

#Count the missing values in each cohort, then keep only the rows where age is recorded
missing_early = df_early["age"].isna().sum()
missing_later = df_later["age"].isna().sum()
age_early = df_early[["age"]].dropna()
age_later = df_later[["age"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
age_e = age_early.to_numpy().ravel()
age_l = age_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(age_e, age_l)

#Label/value pairs in the order they are written to the report
age_report = [
    ('p-value is', p),
    ('Median early is', np.median(age_e)),
    ('25th percentile, early cohort, is', np.percentile(age_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(age_e, 75)),
    ('Median later is', np.median(age_l)),
    ('25th percentile, later cohort, is', np.percentile(age_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(age_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('age_stats.txt', 'w') as f:
    for label, value in age_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [13]:
#Descriptive statistics for pulmonary capillary wedge pressure

#Count the missing values in each cohort, then keep only the rows where the variable is recorded
missing_early = df_early["CAN_PCW_MEAN"].isna().sum()
missing_later = df_later["CAN_PCW_MEAN"].isna().sum()
pcw_early = df_early[["CAN_PCW_MEAN"]].dropna()
pcw_later = df_later[["CAN_PCW_MEAN"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
pcw_e = pcw_early.to_numpy().ravel()
pcw_l = pcw_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(pcw_e, pcw_l)

#Label/value pairs in the order they are written to the report
pcw_report = [
    ('p-value is', p),
    ('Median early is', np.median(pcw_e)),
    ('25th percentile, early cohort, is', np.percentile(pcw_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(pcw_e, 75)),
    ('Median later is', np.median(pcw_l)),
    ('25th percentile, later cohort, is', np.percentile(pcw_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(pcw_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('pcw_stats.txt', 'w') as f:
    for label, value in pcw_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [14]:
#Descriptive statistics for BMI

#Count the missing values in each cohort, then keep only the rows where BMI is recorded
missing_early = df_early["CAN_BMI"].isna().sum()
missing_later = df_later["CAN_BMI"].isna().sum()
bmi_early = df_early[["CAN_BMI"]].dropna()
bmi_later = df_later[["CAN_BMI"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
bmi_e = bmi_early.to_numpy().ravel()
bmi_l = bmi_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(bmi_e, bmi_l)

#Label/value pairs in the order they are written to the report
bmi_report = [
    ('p-value is', p),
    ('Median early is', np.median(bmi_e)),
    ('25th percentile, early cohort, is', np.percentile(bmi_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(bmi_e, 75)),
    ('Median later is', np.median(bmi_l)),
    ('25th percentile, later cohort, is', np.percentile(bmi_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(bmi_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('bmi_stats.txt', 'w') as f:
    for label, value in bmi_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [15]:
#Descriptive statistics for glomerular filtration rate

#Count the missing values in each cohort, then keep only the rows where the variable is recorded
missing_early = df_early["gfr"].isna().sum()
missing_later = df_later["gfr"].isna().sum()
gfr_early = df_early[["gfr"]].dropna()
gfr_later = df_later[["gfr"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
gfr_e = gfr_early.to_numpy().ravel()
gfr_l = gfr_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(gfr_e, gfr_l)

#Label/value pairs in the order they are written to the report
gfr_report = [
    ('p-value is', p),
    ('Median early is', np.median(gfr_e)),
    ('25th percentile, early cohort, is', np.percentile(gfr_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(gfr_e, 75)),
    ('Median later is', np.median(gfr_l)),
    ('25th percentile, later cohort, is', np.percentile(gfr_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(gfr_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('gfr_stats.txt', 'w') as f:
    for label, value in gfr_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [16]:
#Descriptive statistics for pulmonary artery mean pressure

#Count the missing values in each cohort, then keep only the rows where the variable is recorded
missing_early = df_early["CAN_PULM_ART_MEAN"].isna().sum()
missing_later = df_later["CAN_PULM_ART_MEAN"].isna().sum()
pam_early = df_early[["CAN_PULM_ART_MEAN"]].dropna()
pam_later = df_later[["CAN_PULM_ART_MEAN"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
pam_e = pam_early.to_numpy().ravel()
pam_l = pam_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(pam_e, pam_l)

#Label/value pairs in the order they are written to the report
pam_report = [
    ('p-value is', p),
    ('Median early is', np.median(pam_e)),
    ('25th percentile, early cohort, is', np.percentile(pam_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(pam_e, 75)),
    ('Median later is', np.median(pam_l)),
    ('25th percentile, later cohort, is', np.percentile(pam_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(pam_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('pam_stats.txt', 'w') as f:
    for label, value in pam_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [17]:
#Descriptive statistics for cardiac index

#Count the missing values in each cohort, then keep only the rows where the variable is recorded
missing_early = df_early["cardiac_index"].isna().sum()
missing_later = df_later["cardiac_index"].isna().sum()
ci_early = df_early[["cardiac_index"]].dropna()
ci_later = df_later[["cardiac_index"]].dropna()

#One-dimensional numpy arrays for the rank-sum test and the quantiles
ci_e = ci_early.to_numpy().ravel()
ci_l = ci_later.to_numpy().ravel()

#Wilcoxon rank-sum test, early cohort versus later cohort (only the p-value is reported)
w, p = ranksums(ci_e, ci_l)

#Label/value pairs in the order they are written to the report
ci_report = [
    ('p-value is', p),
    ('Median early is', np.median(ci_e)),
    ('25th percentile, early cohort, is', np.percentile(ci_e, 25)),
    ('75th percentile, early cohort, is', np.percentile(ci_e, 75)),
    ('Median later is', np.median(ci_l)),
    ('25th percentile, later cohort, is', np.percentile(ci_l, 25)),
    ('75th percentile, later cohort, is', np.percentile(ci_l, 75)),
]

#Each label goes on its own line followed by its value; the missing-value counts close the file
with open('cardiac_index_stats.txt', 'w') as f:
    for label, value in ci_report:
        print(label, file=f)
        print(value, file=f)
    print('Number of missing values in early cohort was %6.0f' % missing_early, file=f)
    print('Number of missing values in later cohort was %6.0f' % missing_later, file=f)

In [18]:
#Mean age over both cohorts combined, using the non-missing ages (age_e, age_l) from the age statistics cell
age_total = np.concatenate((age_e, age_l))
mean_age = age_total.mean()
print(mean_age)

52.9690035300675


In [19]:
#Share of rows in df with CAN_GENDER equal to 'M'
#The printed value is a fraction between 0 and 1, not a 0-100 percentage
is_male = df['CAN_GENDER'].eq('M')
male = df.loc[is_male]
perc_male = len(male) / len(df)
print(perc_male)

0.7373815569455626


In [20]:
#Number of patients who were transplanted and died (dead == 1 and transplanted == 1)
condition = df['dead'].eq(1) & df['transplanted'].eq(1)
count_deadT = df.loc[condition]
len(count_deadT)

4740

In [21]:
#Number of patients who were transplanted and are alive (dead == 0 and transplanted == 1)
condition = df['dead'].eq(0) & df['transplanted'].eq(1)
count_AT = df.loc[condition]
len(count_AT)

17307

In [22]:
#Number of patients who were not transplanted and died (dead == 1 and transplanted == 0)
condition = df['dead'].eq(1) & df['transplanted'].eq(0)
count_deadN = df.loc[condition]
len(count_deadN)

5500

In [23]:
#Number of patients who were not transplanted and are alive (dead == 0 and transplanted == 0)
condition = df['dead'].eq(0) & df['transplanted'].eq(0)
count_AN = df.loc[condition]
len(count_AN)

4747

In [24]:
#Number of patients who were transplanted, regardless of outcome (transplanted == 1)
condition = df['transplanted'].eq(1)
count_T = df.loc[condition]
len(count_T)

22047

In [25]:
#Total number of rows (patients) in df, i.e. the number analyzed
len(df)

32294