In [7]:
import pandas as pd
from lifelines import CoxPHFitter
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
combined_data = pd.read_parquet('combined_data.parquet')
print("Columns with NaN values:")
print(combined_data.isnull().sum())
combined_data_clean = combined_data.dropna()

Columns with NaN values:
patient_id         0
event              0
T                  0
sex                0
race               0
ethnicity          0
min_travel_time    0
SDOH               0
education          0
income             0
mi                 0
chf                0
pvd                0
cevd               0
dementia           0
copd               0
rheumd             0
pud                0
mld                0
msld               0
diab               0
dia_w_c            0
hp                 0
mrend              0
srend              0
aids               0
hiv                0
mst                0
mal                0
Obesity            0
WL                 0
Alcohol            0
Drug               0
Psycho             0
Dep                0
dtype: int64


In [12]:
def run_cox_analysis(data):

    # Travel Time
    travel_covariates = ['min_travel_time']
    
    # Demographics
    demo_covariates = [col for col in data.columns if col.startswith(('sex_', 'race_', 'ethnicity_'))]
    
    # Comorbidity
    charlson_covariates = [
    'mi', 'chf', 'pvd', 'cevd', 'dementia', 'copd', 'rheumd', 'pud', 
    'mld', 'msld', 'diab', 'dia_w_c', 'hp', 'mrend', 'srend', 
    'aids', 'hiv', 'mst', 'mal', 'Obesity', 'WL', 'Alcohol', 'Drug', 'Psycho', 'Dep'
    ]
    
    # SDOH
    sdoh_covariates = [col for col in data.columns 
                      if col.startswith('education_')] + ['SDOH']
    
    covariates = travel_covariates + demo_covariates + charlson_covariates + sdoh_covariates

    cph = CoxPHFitter(penalizer=0.1)

    cox_data = data[covariates + ['T', 'event']]

    cph.fit(cox_data, duration_col='T', event_col='event')
    return cph

print("Event distribution:")
print(combined_data_clean['event'].value_counts())
print("\nTime variable statistics:")
print(combined_data_clean['T'].describe())

model = run_cox_analysis(combined_data_clean)

print("Cox Model Summary:")
model.print_summary()

Event distribution:
event
1    51
0    49
Name: count, dtype: int64

Time variable statistics:
count    100.000000
mean      15.284100
std        8.573717
min       -1.720000
25%        8.935000
50%       16.000000
75%       21.285000
max       34.810000
Name: T, dtype: float64
Cox Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'T'
event col,'event'
penalizer,0.1
l1 ratio,0.0
baseline estimation,breslow
number of observations,100
number of events observed,51
partial log-likelihood,-175.63
time fit was run,2025-02-20 02:48:15 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
min_travel_time,-0.01,0.99,0.0,-0.01,0.0,0.99,1.0,0.0,-1.12,0.26,1.93
mi,-0.35,0.7,0.7,-1.72,1.02,0.18,2.76,0.0,-0.5,0.61,0.7
chf,0.12,1.12,0.67,-1.19,1.43,0.3,4.16,0.0,0.17,0.86,0.21
pvd,-0.79,0.45,0.96,-2.67,1.09,0.07,2.97,0.0,-0.83,0.41,1.29
cevd,-0.03,0.97,0.73,-1.46,1.4,0.23,4.06,0.0,-0.04,0.96,0.05
dementia,-0.93,0.39,0.66,-2.23,0.37,0.11,1.44,0.0,-1.41,0.16,2.64
copd,-0.14,0.87,0.56,-1.24,0.96,0.29,2.61,0.0,-0.25,0.8,0.32
rheumd,-0.76,0.47,0.82,-2.38,0.85,0.09,2.34,0.0,-0.93,0.35,1.49
pud,0.59,1.8,0.65,-0.69,1.86,0.5,6.43,0.0,0.9,0.37,1.45
mld,0.54,1.72,0.78,-0.99,2.08,0.37,7.97,0.0,0.7,0.49,1.04

0,1
Concordance,0.72
Partial AIC,405.26
log-likelihood ratio test,20.81 on 27 df
-log2(p) of ll-ratio test,0.33
