In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import CoxPHFitter, KaplanMeierFitter
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Step 1: Read the dataset from disk
data = pd.read_csv('Dialysis.csv')

# Step 2: Data Exploration
# Descriptive statistics of the numeric columns
summary_stats = data.describe()
print(summary_stats)

# Number of missing values in each column
missing_counts = data.isnull().sum()
print(missing_counts)

# Histogram with a KDE overlay for the 'num_age' column, then for 'time';
# each figure is titled and shown before the next one is drawn.
for column, title in (
    ('num_age', 'Age Distribution'),
    ('time', 'Survival Time Distribution'),
):
    sns.histplot(data[column], kde=True)
    plt.title(title)
    plt.show()

# Step 3: Handle Missing Data (if any)
# This example assumes the dataset has no missing values. If the check above
# reports any, possible remedies include:
# data.ffill(inplace=True)  # Forward fill (fillna(method='ffill') is deprecated in recent pandas)
# data.dropna(inplace=True)  # Drop rows with missing values

# Step 4: One-Hot Encode the 'fac_disease' column
# drop='first' omits the first category, leaving it as the implicit reference
# level for the remaining indicator columns.
encoder = OneHotEncoder(drop='first')
encoded_disease = encoder.fit_transform(data[['fac_disease']]).toarray()
disease_columns = encoder.get_feature_names_out(['fac_disease'])
# Give the encoded frame the same index as `data`: pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex would misalign rows (and introduce NaNs)
# whenever `data` no longer has a 0..n-1 index, e.g. after dropping rows with
# missing values.
encoded_df = pd.DataFrame(encoded_disease, columns=disease_columns, index=data.index)
data = pd.concat([data, encoded_df], axis=1).drop(columns=['fac_disease'])

# Step 5: Feature Scaling (optional, depends on the model)
# Standardise the two numeric covariates and write them back into `data`.
numeric_columns = ['num_age', 'num_begin']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Step 6: Train/Test Split
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Select the duration, event indicator and covariates used by the Cox model
cox_columns = ['time', 'event', 'num_age', 'num_begin', 'fac_center', *disease_columns]
df_train_for_cox = train_data[cox_columns]

# Step 7: Fit the Cox Proportional Hazards Model
# NOTE(review): the recorded run of this script ended with
# "AttributeError: module 'numpy' has no attribute 'msort'". `np.msort` no
# longer exists in recent NumPy releases, so this presumably comes from an
# older lifelines version that still calls it. Upgrading lifelines (or pinning
# an older NumPy) should resolve it -- confirm against the full traceback.
cph = CoxPHFitter()
cph.fit(df_train_for_cox, event_col='event', duration_col='time')

# Print the fitted model's summary table
print(cph.summary)

# Step 8: Model Validation using Cross-Validation
# Every fold fits a *fresh* CoxPHFitter on the training part and scores it on
# the held-out part. Two things are deliberately avoided here:
#   * reading `concordance_index_`, which is the concordance on the data the
#     model was fitted to and therefore not a validation score;
#   * refitting the shared `cph`, which would silently replace the Step 7 model
#     (trained on `train_data` only) with one trained on rows that include the
#     test set used in Step 9.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_columns = ['time', 'event', 'num_age', 'num_begin', 'fac_center'] + list(disease_columns)
concordance_indices = []

for train_index, test_index in kf.split(data):
    df_fold_train = data.iloc[train_index][cv_columns]
    df_fold_test = data.iloc[test_index][cv_columns]

    fold_model = CoxPHFitter()
    fold_model.fit(df_fold_train, duration_col='time', event_col='event')

    # Out-of-fold concordance: the held-out frame must carry 'time' and 'event'.
    concordance_indices.append(
        fold_model.score(df_fold_test, scoring_method='concordance_index')
    )

print("Average Concordance Index:", np.mean(concordance_indices))

# Step 9: Predict Survival Probabilities on Test Data
# Work on an explicit copy so that adding a column below does not assign into
# a frame derived from `data` (avoids pandas' SettingWithCopyWarning).
test_data = test_data.copy()
df_test_for_cox = test_data[['num_age', 'num_begin', 'fac_center'] + list(disease_columns)]

# predict_survival_function returns one row per time point and one column per
# patient. Taking `.iloc[0]` of the full table would report survival at the
# earliest time point only (close to 1 for everyone), so evaluate the curve at
# an explicit horizon instead: the median observed time in the training set.
prediction_horizon = train_data['time'].median()
test_data['predicted_survival'] = (
    cph.predict_survival_function(df_test_for_cox, times=[prediction_horizon])
    .iloc[0]
    .values
)

# Visualize survival curve for an example patient (first row of the test set)
new_patient = df_test_for_cox.iloc[0:1]
survival_function = cph.predict_survival_function(new_patient)
plt.plot(survival_function)
plt.title('Survival Function for an Example Patient')
plt.xlabel('Time')
plt.ylabel('Survival Probability')
plt.show()

# Step 10: Compare with Kaplan-Meier Estimator
# Fit a Kaplan-Meier estimator to the training durations and event flags,
# then draw its survival curve.
kmf = KaplanMeierFitter()
kmf.fit(durations=train_data['time'], event_observed=train_data['event'])
km_axes = kmf.plot_survival_function()
km_axes.set_title('Kaplan-Meier Survival Curve')
plt.show()

# Step 11: Model Interpretation
# Plot the fitted Cox model's coefficients
coefficient_axes = cph.plot()
coefficient_axes.set_title('Cox Model Coefficients')
plt.show()

# Step 12: Saving and Deploying the Model
# lifelines fitters do not provide save_model()/load_model(); the library's
# documentation recommends serialising the fitted object with pickle (or
# joblib) instead.
import pickle

# Save the model for later use
with open('cox_model.pkl', 'wb') as model_file:
    pickle.dump(cph, model_file)

# Load and use the saved model
# WARNING: unpickling executes arbitrary code -- only load files you trust.
# with open('cox_model.pkl', 'rb') as model_file:
#     cph_loaded = pickle.load(model_file)


AttributeError: module 'numpy' has no attribute 'msort'