# Fine-Gray Competing Risk Model Example

This notebook demonstrates how to use the `r_fine_gray.py` module to:
1. Fit Fine-Gray models (one for dialysis, one for death) on training data
2. Save the fitted models
3. Load the saved models and make predictions on spatial and temporal test datasets

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Add the project root directory to the Python path.
# The project root is taken to be the nearest directory, starting at the current
# working directory and walking upward, that contains a `src` directory. If none
# is found, fall back to the directory two levels above the working directory
# (the location this notebook originally hard-coded).
_cwd = Path.cwd()
project_root = next(
    (str(candidate) for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    os.path.abspath(os.path.dirname(os.path.dirname(os.getcwd()))),
)
# Guard against piling up duplicate entries when the cell is re-run in the same kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the r_fine_gray module
from src.r_fine_gray import run_baseline_cif, load_and_predict
from src.util import load_yaml_file

ModuleNotFoundError: No module named 'src'

## 1. Load Datasets

First, let's load the training and test datasets. In this example, we'll load them from ZenML artifacts, but you can modify this to load from your preferred source.

In [None]:
# Pull the three datasets out of the ZenML artifact store
from zenml.client import Client


def _fetch_artifact(label, artifact_id):
    """Look up one artifact version by ID, load it, and report the loaded shape.

    Returns the artifact version object together with the loaded data.
    """
    print(f"Loading {label} data...")
    artifact = Client().get_artifact_version(artifact_id)
    frame = artifact.load()
    print(f"{label.capitalize()} data loaded with shape: {frame.shape}")
    return artifact, frame


# Replace each ID below with your own artifact ID
train_artifact, train_df = _fetch_artifact(
    "training", "1bc9d8ef-eb17-474d-bbb0-46f62ce033ef"
)
spatial_artifact, spatial_test_df = _fetch_artifact(
    "spatial test", "your-spatial-test-artifact-id"
)
temporal_artifact, temporal_test_df = _fetch_artifact(
    "temporal test", "your-temporal-test-artifact-id"
)

Alternatively, you can load the datasets from CSV files:

In [None]:
# Optional CSV loading path.
# Set LOAD_FROM_CSV to True (and point the paths at your files) to load the
# datasets from CSV files. While it is False this cell does nothing, so the
# datasets loaded earlier in the notebook are left untouched and the cell no
# longer echoes a block of quoted code as its output.
LOAD_FROM_CSV = False

if LOAD_FROM_CSV:
    # Load datasets from CSV files
    train_df = pd.read_csv("path/to/train_df.csv")
    spatial_test_df = pd.read_csv("path/to/spatial_test_df.csv")
    temporal_test_df = pd.read_csv("path/to/temporal_test_df.csv")

    print(f"Training data loaded with shape: {train_df.shape}")
    print(f"Spatial test data loaded with shape: {spatial_test_df.shape}")
    print(f"Temporal test data loaded with shape: {temporal_test_df.shape}")

## 2. Examine Data Structure

Let's examine the structure of the training dataset to ensure it has the required columns.

In [None]:
# Preview the top five rows of the training data
train_df.head(n=5)

In [None]:
# Check column names
print("Training dataset columns:")
print(train_df.columns.tolist())

# Load mapping configuration.
# NOTE(review): the path is relative; presumably it is resolved against the
# working directory — confirm against load_yaml_file.
mapping_file = "src/default_master_df_mapping.yml"
# Treat a falsy result (e.g. an empty mapping file, presumably returned as None)
# as an empty mapping so the lookups below do not fail.
mapping = load_yaml_file(mapping_file) or {}

# Get required columns
duration_col = mapping.get('duration')
event_col = mapping.get('event')
# `or []` also covers a `features` key that is present but has no value;
# `.get('features', [])` would hand back None in that case and break the
# column list built below.
feature_cols = mapping.get('features') or []

print("\nRequired columns:")
print(f"Duration column: {duration_col}")
print(f"Event column: {event_col}")
print(f"Feature columns: {feature_cols}")

# Report unset mapping keys on their own, so they do not surface below as a
# confusing "missing column" named None.
unset_keys = [key for key in ('duration', 'event') if mapping.get(key) is None]
if unset_keys:
    print(f"\nWARNING: Mapping file does not define: {unset_keys}")

# Check for missing required columns
required_cols = [
    col for col in [duration_col, event_col, *feature_cols] if col is not None
]
missing_cols = [col for col in required_cols if col not in train_df.columns]

if missing_cols:
    print(f"\nWARNING: Missing required columns: {missing_cols}")
else:
    print("\nAll required columns are present in the dataset.")

## 3. Fit Fine-Gray Models on Training Data

Now let's fit the Fine-Gray models on the training data.

In [None]:
# Directory that receives every file produced by this notebook
output_path = Path("./fine_gray_output")
output_path.mkdir(exist_ok=True)

# Train the Fine-Gray models on the training data
print("Fitting Fine-Gray models on training data...")

fit_options = dict(
    df=train_df,
    feature_cols=feature_cols,
    output_path=str(output_path),
    seed=42,
    n_threads=None,  # Auto-detect
    silent=False,
)
train_results = run_baseline_cif(**fit_options)

print("Model fitting completed successfully")

## 4. Examine Training Results

Let's examine the results of the model fitting on the training data.

In [None]:
# Risks reported by the fit, one section per outcome (dialysis first, then death)
for heading, result_key in (
    ("Dialysis risks:", 'dialysis_risks'),
    ("\nDeath risks:", 'death_risks'),
):
    print(heading)
    for entry in train_results[result_key]:
        print(f"  {entry['horizon_days']} days: {entry['risk_pct']:.2f}%")

# Where the fit wrote its outputs
print("\nSaved files:")
print(f"  Visualization: {train_results['visualization_path']}")
print(f"  CSV: {train_results['csv_path']}")
saved_model_paths = train_results['model_paths']
print(f"  Dialysis model: {saved_model_paths['dialysis_model']}")
print(f"  Death model: {saved_model_paths['death_model']}")
print(f"  Metadata: {saved_model_paths['metadata']}")

## 5. Display Visualization

Let's display the visualization created by the model.

In [None]:
# Render the figure that the fit saved to disk
from IPython.display import Image

visualization_file = train_results['visualization_path']
Image(filename=visualization_file)

## 6. Make Predictions on Test Datasets

Now let's load the saved models and make predictions on the spatial and temporal test datasets.

In [None]:
# Locate the models persisted during training
saved_models = train_results['model_paths']
dialysis_model_path = saved_models['dialysis_model']
death_model_path = saved_models['death_model']

# Prediction horizons in days: 365, 730, 1095, 1460, 1825
time_horizons = [365 * years for years in range(1, 6)]  # 1-5 years

# Score the spatial test dataset with both models
print("Predicting on spatial test dataset...")

spatial_predict_options = dict(
    df=spatial_test_df,
    feature_cols=feature_cols,
    time_horizons=time_horizons,
    seed=42,
)
spatial_dialysis_predictions = load_and_predict(
    model_path=dialysis_model_path, **spatial_predict_options
)
spatial_death_predictions = load_and_predict(
    model_path=death_model_path, **spatial_predict_options
)

# Column-wise mean of the predictions, scaled by 100 for display as a percentage
spatial_dialysis_risks = spatial_dialysis_predictions.mean() * 100
spatial_death_risks = spatial_death_predictions.mean() * 100

for heading, mean_risks in (
    ("Spatial test dataset - Dialysis risks:", spatial_dialysis_risks),
    ("\nSpatial test dataset - Death risks:", spatial_death_risks),
):
    print(heading)
    for horizon, risk in mean_risks.items():
        # Strip the 't' characters from the column label to leave the day count
        print(f"  {horizon.replace('t', '')} days: {risk:.2f}%")

In [None]:
# Score the temporal test dataset with both saved models
print("Predicting on temporal test dataset...")

temporal_predict_options = dict(
    df=temporal_test_df,
    feature_cols=feature_cols,
    time_horizons=time_horizons,
    seed=42,
)
temporal_dialysis_predictions = load_and_predict(
    model_path=dialysis_model_path, **temporal_predict_options
)
temporal_death_predictions = load_and_predict(
    model_path=death_model_path, **temporal_predict_options
)

# Column-wise mean of the predictions, scaled by 100 for display as a percentage
temporal_dialysis_risks = temporal_dialysis_predictions.mean() * 100
temporal_death_risks = temporal_death_predictions.mean() * 100

for heading, mean_risks in (
    ("Temporal test dataset - Dialysis risks:", temporal_dialysis_risks),
    ("\nTemporal test dataset - Death risks:", temporal_death_risks),
):
    print(heading)
    for horizon, risk in mean_risks.items():
        # Strip the 't' characters from the column label to leave the day count
        print(f"  {horizon.replace('t', '')} days: {risk:.2f}%")

## 7. Compare Risks Across Datasets

Let's build a table comparing the 1-year and 5-year risks across the three datasets.

In [None]:
# Extract 1-year and 5-year risks.
# Training risks are looked up by their `horizon_days` value rather than by list
# position, so the values stay correctly labelled (or fail loudly with a
# KeyError) if the fit reports a different set or order of horizons. This also
# matches how the test-set risks below are selected by horizon label.
# NOTE(review): assumes `horizon_days` is numeric (or a numeric string) — confirm.
ONE_YEAR_DAYS = 365
FIVE_YEAR_DAYS = 1825


def _risk_pct_by_horizon(risk_entries):
    """Map each entry's horizon (in whole days) to its `risk_pct` value."""
    return {int(entry['horizon_days']): entry['risk_pct'] for entry in risk_entries}


train_dialysis_by_horizon = _risk_pct_by_horizon(train_results['dialysis_risks'])
train_death_by_horizon = _risk_pct_by_horizon(train_results['death_risks'])

train_dialysis_1y = train_dialysis_by_horizon[ONE_YEAR_DAYS]
train_dialysis_5y = train_dialysis_by_horizon[FIVE_YEAR_DAYS]
train_death_1y = train_death_by_horizon[ONE_YEAR_DAYS]
train_death_5y = train_death_by_horizon[FIVE_YEAR_DAYS]

spatial_dialysis_1y = spatial_dialysis_risks['t365']
spatial_dialysis_5y = spatial_dialysis_risks['t1825']
spatial_death_1y = spatial_death_risks['t365']
spatial_death_5y = spatial_death_risks['t1825']

temporal_dialysis_1y = temporal_dialysis_risks['t365']
temporal_dialysis_5y = temporal_dialysis_risks['t1825']
temporal_death_1y = temporal_death_risks['t365']
temporal_death_5y = temporal_death_risks['t1825']

# Create comparison DataFrame: one row per dataset, one column per outcome/horizon
comparison_df = pd.DataFrame({
    'Dataset': ['Train', 'Spatial Test', 'Temporal Test'],
    'Dialysis 1-year (%)': [train_dialysis_1y, spatial_dialysis_1y, temporal_dialysis_1y],
    'Dialysis 5-year (%)': [train_dialysis_5y, spatial_dialysis_5y, temporal_dialysis_5y],
    'Death 1-year (%)': [train_death_1y, spatial_death_1y, temporal_death_1y],
    'Death 5-year (%)': [train_death_5y, spatial_death_5y, temporal_death_5y]
})

# Display comparison table
comparison_df

In [None]:
# Persist the comparison table next to the other outputs
comparison_csv_path = output_path / "risk_comparison.csv"
comparison_df.to_csv(comparison_csv_path, index=False)
print(f"Comparison table saved to {comparison_csv_path}")

## 8. Visualize Comparison

Let's create a bar chart to visualize the comparison.

In [None]:
# Side-by-side bar charts: dialysis risks on the left, death risks on the right
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

panel_specs = (
    # (axes, title, columns to plot, bar colors)
    (ax[0], 'Dialysis Risks', ['Dialysis 1-year (%)', 'Dialysis 5-year (%)'], ['#3366CC', '#6699FF']),
    (ax[1], 'Death Risks', ['Death 1-year (%)', 'Death 5-year (%)'], ['#FF9933', '#FFCC99']),
)

for panel, panel_title, risk_columns, bar_colors in panel_specs:
    comparison_df.plot(
        x='Dataset',
        y=risk_columns,
        kind='bar',
        ax=panel,
        color=bar_colors,
    )
    panel.set_title(panel_title)
    panel.set_ylabel('Risk (%)')
    panel.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig(output_path / "risk_comparison.png", dpi=300)
plt.show()

## 9. Save Patient-Level Predictions

Finally, let's save the patient-level predictions for further analysis.

In [None]:
# Write each patient-level prediction table to its own CSV in the output
# directory (spatial test first, then temporal test)
prediction_tables = {
    "spatial_dialysis_predictions.csv": spatial_dialysis_predictions,
    "spatial_death_predictions.csv": spatial_death_predictions,
    "temporal_dialysis_predictions.csv": temporal_dialysis_predictions,
    "temporal_death_predictions.csv": temporal_death_predictions,
}

for file_name, predictions in prediction_tables.items():
    predictions.to_csv(output_path / file_name)

print("Patient-level predictions saved successfully")

## 10. Conclusion

In this notebook, we've demonstrated how to:
1. Fit Fine-Gray competing risk models on training data
2. Save the models for future use
3. Load the models and make predictions on spatial and temporal test datasets
4. Compare the risks across the three datasets

The saved models can be reused for future predictions without having to refit them.