In [None]:
import pandas as pd
from scipy import stats
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

In [None]:
# Read the input table that the group-comparison cells below work from.
# Point this path at your own copy of the data.
data = pd.read_csv(filepath_or_buffer='./data/all_demo_2year.csv')

In [None]:
# Age: independent-samples t-test of AGE between the two Outcome groups.
group1 = data.loc[data['Outcome'] == 0, 'AGE']  # AGE for rows with Outcome == 0
group2 = data.loc[data['Outcome'] == 1, 'AGE']  # AGE for rows with Outcome == 1

# No keyword overrides are passed, so scipy's defaults apply.
# NOTE(review): the default assumes equal variances in both groups; consider
# equal_var=False (Welch) if the groups differ a lot in size or spread.
t_test_result = stats.ttest_ind(a=group1, b=group2)
print(f"T-Test for Age: {t_test_result}")

In [None]:
# Gender: chi-square test of independence between Outcome and the Female column.
contingency_table_female = pd.crosstab(
    index=data['Outcome'],
    columns=data['Female'],
)
# chi2_contingency yields (statistic, p-value, degrees of freedom, expected counts).
chi2_female, p_female, dof_female, ex_female = stats.chi2_contingency(
    observed=contingency_table_female
)
print(f"Chi-Square Test for Female: Chi2 = {chi2_female}, p-value = {p_female}")

In [None]:
# Industry
# Indicator columns, one per industry category (defined once and reused below).
industry_columns = ['industry/1', 'industry/2', 'industry/3', 'industry/4', 'industry/5',
                    'industry/6', 'industry/7', 'industry/A', 'industry/C', 'industry/W']

# Per-category tests: Outcome vs. each individual indicator column.
for col in industry_columns:
    contingency_table_industry = pd.crosstab(data['Outcome'], data[col])
    chi2_ind, p_ind, dof_ind, ex_ind = stats.chi2_contingency(contingency_table_industry)
    print(f"Chi-Square Test for {col}: Chi2 = {chi2_ind}, p-value = {p_ind}")

# Combine Industry Columns into One Variable
# idxmax returns the first column label for a row in which no indicator is set, which
# would silently count that row as 'industry/1'. Mask such rows to NaN instead, so that
# pd.crosstab (which drops NaN by default) leaves them out of the combined test.
# Rows with more than one indicator set still resolve to the first such column.
has_industry = data[industry_columns].any(axis=1)
data['Industry'] = data[industry_columns].idxmax(axis=1).where(has_industry)
if not has_industry.all():
    print(f"Note: {(~has_industry).sum()} rows have no industry indicator set "
          f"and are excluded from the combined Industry test")

# Chi-Square Test for Industry
contingency_table_industry = pd.crosstab(data['Outcome'], data['Industry'])
chi2_industry, p_industry, dof_industry, ex_industry = stats.chi2_contingency(contingency_table_industry)
print(f"Chi-Square Test for Industry: Chi2 = {chi2_industry}, p-value = {p_industry}")


In [None]:
# Region
# Indicator columns, one per region category (defined once and reused below).
region_columns = ['region/1', 'region/2', 'region/3', 'region/4', 'region/5']

# Per-category tests: Outcome vs. each individual indicator column.
for col in region_columns:
    contingency_table_region = pd.crosstab(data['Outcome'], data[col])
    chi2_reg, p_reg, dof_reg, ex_reg = stats.chi2_contingency(contingency_table_region)
    print(f"Chi-Square Test for {col}: Chi2 = {chi2_reg}, p-value = {p_reg}")

# Combine the indicator columns into one categorical variable.
# idxmax returns the first column label for a row in which no indicator is set, which
# would silently count that row as 'region/1'. Mask such rows to NaN instead, so that
# pd.crosstab (which drops NaN by default) leaves them out of the combined test.
# Rows with more than one indicator set still resolve to the first such column.
has_region = data[region_columns].any(axis=1)
data['Region'] = data[region_columns].idxmax(axis=1).where(has_region)
if not has_region.all():
    print(f"Note: {(~has_region).sum()} rows have no region indicator set "
          f"and are excluded from the combined Region test")

# Chi-Square Test for Region
contingency_table_region = pd.crosstab(data['Outcome'], data['Region'])
chi2_region, p_region, dof_region, ex_region = stats.chi2_contingency(contingency_table_region)
print(f"Chi-Square Test for Region: Chi2 = {chi2_region}, p-value = {p_region}")

Risk Scores: bootstrap comparison of AUROC between the chad and chad_vasc scores

In [None]:
# Load the risk-score result tables.
# Every path below is an empty placeholder: fill in the CSV location for each table
# before running this cell. The comparison cells read an 'outcome' column from every
# table, 'chad_scores' from the chad* tables and 'chad_vasc_scores' from the
# chad_vasc* tables.
chad = pd.read_csv(filepath_or_buffer='')              # chad results, all patients
chad_vasc = pd.read_csv(filepath_or_buffer='')         # chad_vasc results, all patients
chad_female = pd.read_csv(filepath_or_buffer='')       # chad results, female patients
chad_male = pd.read_csv(filepath_or_buffer='')         # chad results, male patients
chad_vasc_female = pd.read_csv(filepath_or_buffer='')  # chad_vasc results, female patients
chad_vasc_male = pd.read_csv(filepath_or_buffer='')    # chad_vasc results, male patients

In [None]:
def bootstrap_aucroc(df, score_col, outcome_col, n_iterations=1000, random_state=None):
    """Bootstrap the AUROC of ``score_col`` against ``outcome_col``.

    Rows of ``df`` are resampled with replacement (same size as ``df``) and the
    AUROC is recomputed on every resample.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the score and outcome columns.
    score_col : str
        Column with the score passed to ``roc_auc_score`` as ``y_score``.
    outcome_col : str
        Column with the labels passed to ``roc_auc_score`` as ``y_true``.
    n_iterations : int, default 1000
        Number of bootstrap AUROC values to produce.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the resampling. ``None`` keeps the previous
        behaviour (NumPy's global random state is used); pass an int to make
        the whole sequence of resamples reproducible.

    Returns
    -------
    list of float
        Exactly ``n_iterations`` bootstrap AUROC values.

    Raises
    ------
    ValueError
        If ``outcome_col`` holds fewer than two distinct classes, in which case
        the AUROC is undefined.
    """
    if n_iterations > 0 and df[outcome_col].nunique() < 2:
        raise ValueError(
            f"AUROC is undefined: column {outcome_col!r} contains fewer than two classes"
        )

    # Build one generator up front so that successive resamples differ from each other
    # while the sequence as a whole is reproducible. Passing the same int to every
    # resample() call would return the identical sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    aucroc_scores = []
    while len(aucroc_scores) < n_iterations:
        boot_sample = resample(df, replace=True, n_samples=len(df), random_state=rng)
        # A resample can end up with a single outcome class (likely with rare outcomes or
        # small tables); roc_auc_score raises on that, so draw again instead of aborting.
        if boot_sample[outcome_col].nunique() < 2:
            continue
        aucroc_scores.append(roc_auc_score(boot_sample[outcome_col], boot_sample[score_col]))
    return aucroc_scores


In [None]:
# All patients: bootstrap the AUROC of each score and compare chad_vasc with chad.
n_iterations = 1000
aucroc_df1 = bootstrap_aucroc(chad, 'chad_scores', 'outcome', n_iterations)
aucroc_df2 = bootstrap_aucroc(chad_vasc, 'chad_vasc_scores', 'outcome', n_iterations)

# NOTE(review): the two tables are resampled independently. If both hold the same
# patients row-for-row, a paired bootstrap (same resampled rows for both scores) would
# account for their correlation -- confirm the table layout before switching.
aucroc_differences = np.array(aucroc_df2) - np.array(aucroc_df1)  # chad_vasc minus chad
conf_interval = np.percentile(aucroc_differences, [2.5, 97.5])

# Mean of the bootstrap differences (a point summary; not used for the p-value).
observed_mean_difference = np.mean(aucroc_differences)

# Two-sided bootstrap p-value for H0 "no difference in AUROC": twice the smaller tail of
# the bootstrap differences on either side of zero, capped at 1. The +1 terms keep the
# estimate from being exactly 0 with a finite number of resamples.
# (Comparing the differences with their own mean, as done previously, always gives a
# value near 0.5 regardless of the data.)
n_boot = len(aucroc_differences)
lower_tail = (np.sum(aucroc_differences <= 0) + 1) / (n_boot + 1)
upper_tail = (np.sum(aucroc_differences >= 0) + 1) / (n_boot + 1)
p_value = min(1.0, 2 * min(lower_tail, upper_tail))

print(f"95% Confidence Interval: {conf_interval}")
print(f"P-value: {p_value}")

In [None]:
# Female patients: bootstrap the AUROC of each score and compare chad_vasc with chad.
n_iterations = 1000
aucroc_df1 = bootstrap_aucroc(chad_female, 'chad_scores', 'outcome', n_iterations)
aucroc_df2 = bootstrap_aucroc(chad_vasc_female, 'chad_vasc_scores', 'outcome', n_iterations)

# NOTE(review): the two tables are resampled independently. If both hold the same
# patients row-for-row, a paired bootstrap (same resampled rows for both scores) would
# account for their correlation -- confirm the table layout before switching.
aucroc_differences = np.array(aucroc_df2) - np.array(aucroc_df1)  # chad_vasc minus chad
conf_interval = np.percentile(aucroc_differences, [2.5, 97.5])

# Mean of the bootstrap differences (a point summary; not used for the p-value).
observed_mean_difference = np.mean(aucroc_differences)

# Two-sided bootstrap p-value for H0 "no difference in AUROC": twice the smaller tail of
# the bootstrap differences on either side of zero, capped at 1. The +1 terms keep the
# estimate from being exactly 0 with a finite number of resamples.
# (Comparing the differences with their own mean, as done previously, always gives a
# value near 0.5 regardless of the data.)
n_boot = len(aucroc_differences)
lower_tail = (np.sum(aucroc_differences <= 0) + 1) / (n_boot + 1)
upper_tail = (np.sum(aucroc_differences >= 0) + 1) / (n_boot + 1)
p_value = min(1.0, 2 * min(lower_tail, upper_tail))

print(f"95% Confidence Interval: {conf_interval}")
print(f"P-value: {p_value}")

In [None]:
# Male patients: bootstrap the AUROC of each score and compare chad_vasc with chad.
n_iterations = 1000
aucroc_df1 = bootstrap_aucroc(chad_male, 'chad_scores', 'outcome', n_iterations)
aucroc_df2 = bootstrap_aucroc(chad_vasc_male, 'chad_vasc_scores', 'outcome', n_iterations)

# NOTE(review): the two tables are resampled independently. If both hold the same
# patients row-for-row, a paired bootstrap (same resampled rows for both scores) would
# account for their correlation -- confirm the table layout before switching.
aucroc_differences = np.array(aucroc_df2) - np.array(aucroc_df1)  # chad_vasc minus chad
conf_interval = np.percentile(aucroc_differences, [2.5, 97.5])

# Mean of the bootstrap differences (a point summary; not used for the p-value).
observed_mean_difference = np.mean(aucroc_differences)

# Two-sided bootstrap p-value for H0 "no difference in AUROC": twice the smaller tail of
# the bootstrap differences on either side of zero, capped at 1. The +1 terms keep the
# estimate from being exactly 0 with a finite number of resamples.
# (Comparing the differences with their own mean, as done previously, always gives a
# value near 0.5 regardless of the data.)
n_boot = len(aucroc_differences)
lower_tail = (np.sum(aucroc_differences <= 0) + 1) / (n_boot + 1)
upper_tail = (np.sum(aucroc_differences >= 0) + 1) / (n_boot + 1)
p_value = min(1.0, 2 * min(lower_tail, upper_tail))

print(f"95% Confidence Interval: {conf_interval}")
print(f"P-value: {p_value}")