# Exercise 4 Solution: Calibrate and Project an SEIR Model

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ngozzi/tech-transfer-epdemix/blob/main/sessions/session-4/solutions/python/exercise_4_seir_calibration.ipynb)

Calibrate an SEIR model to realistic (synthetic) incidence data and generate projections.

In [None]:
# Colab installation (skip if running locally)
import sys, os, subprocess
# Install dependencies only inside Colab: either the google.colab module is
# already loaded or the COLAB_RELEASE_TAG environment variable is set.
if "google.colab" in sys.modules or os.getenv("COLAB_RELEASE_TAG"):
    # check=True raises CalledProcessError when pip exits non-zero, so a failed
    # install is reported here instead of as an ImportError in a later cell.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-r",
         "https://raw.githubusercontent.com/epistorm/epydemix/refs/heads/main/tutorials/colab_requirements.txt"],
        check=True,
    )

## Task 1: Load and Prepare Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Palette shared by the figures below
colors = sns.color_palette("Dark2")

# Read the incidence table and convert its "date" column to timestamps
DATA_URL = 'https://raw.githubusercontent.com/epistorm/epydemix/refs/heads/main/tutorials/data/incidence_data.csv'
data = pd.read_csv(DATA_URL)
data["date"] = pd.to_datetime(data["date"])

# The first `calibration_days` rows are used for fitting; the remainder is held out
calibration_days = 80
data_validation = data.iloc[calibration_days:]
data_calibration = data.iloc[:calibration_days]

print(f"Calibration period: {data_calibration.date.iloc[0].date()} to {data_calibration.date.iloc[-1].date()} ({len(data_calibration)} days)")
print(f"Validation period: {data_validation.date.iloc[0].date()} to {data_validation.date.iloc[-1].date()} ({len(data_validation)} days)")

In [None]:
# Plot fitted vs held-out observations and mark where the split happens
fig, ax = plt.subplots(figsize=(10, 4), dpi=150)

held_out_style = {"color": "gray", "markersize": 4, "alpha": 0.5,
                  "label": "Validation data (held out)"}
ax.plot(data_calibration["date"], data_calibration["data"], "ko", markersize=4, label="Calibration data")
ax.plot(data_validation["date"], data_validation["data"], "o", **held_out_style)

# Dashed vertical line at the last calibration date
cutoff_date = data_calibration["date"].iloc[-1]
ax.axvline(cutoff_date, color="red", linestyle="--", alpha=0.5, label="Calibration cutoff")

ax.set(
    xlabel="Date",
    ylabel="New Infections",
    title="Incidence Data: Calibration vs Validation Split",
)
ax.legend()

plt.tight_layout()

## Task 2: Set Up the SEIR Model

In [None]:
from epydemix import EpiModel
from epydemix.population import load_epydemix_population
from epydemix.utils import compute_simulation_dates

# Four-compartment SEIR model
model = EpiModel(name="SEIR", compartments=["S", "E", "I", "R"])

# S→E is a mediated transition (rate "beta", mediator "I");
# E→I and I→R are spontaneous transitions with rates "sigma" and "gamma".
model.add_transition(source="S", target="E", params=("beta", "I"), kind="mediated")
for origin, destination, rate_name in (("E", "I", "sigma"), ("I", "R", "gamma")):
    model.add_transition(source=origin, target=destination, params=rate_name, kind="spontaneous")

# Default parameter values (will be overridden during calibration)
default_rates = {"beta": 0.02, "sigma": 0.2, "gamma": 0.15}
for rate_name, rate_value in default_rates.items():
    model.add_parameter(rate_name, rate_value)

# Attach the Indonesia population to the model
model.import_epydemix_population(population_name="Indonesia")

print(model)

In [None]:
# Initial conditions: 0.05% of each entry of Nk starts in I (truncated to int);
# E and R start at zero and S holds the remainder.
group_sizes = model.population.Nk
seeded_infections = (group_sizes * 0.05 / 100).astype(int)
initial_conditions = {
    "S": group_sizes - seeded_infections,
    "E": np.zeros(len(group_sizes)),
    "I": seeded_infections,
    "R": np.zeros(len(group_sizes)),
}

# Date boundaries shared by the simulation settings and the date grids below
calibration_start = data_calibration.date.values[0]
calibration_end = data_calibration.date.values[-1]
validation_end = data_validation.date.values[-1]

# Simulation settings used during calibration
parameters = {
    "initial_conditions_dict": initial_conditions,
    "epimodel": model,
    "start_date": calibration_start,
    "end_date": calibration_end,
}

# Date grids: calibration window only, and calibration + validation window
simulation_dates_calibration = compute_simulation_dates(
    start_date=calibration_start,
    end_date=calibration_end,
)
simulation_dates_full = compute_simulation_dates(
    start_date=calibration_start,
    end_date=validation_end,
)

## Tasks 3 & 4: Define Priors and Run Calibration

In [None]:
from scipy import stats

def prior_bounds(dist):
    """Return the ``(lower, upper)`` support of a frozen scipy distribution.

    The bounds come from ``dist.support()`` rather than ``dist.args``, so the
    result does not depend on whether the distribution was created with
    positional or keyword ``loc``/``scale`` arguments.
    """
    lower, upper = dist.support()
    return float(lower), float(upper)


# Define priors. stats.uniform(loc, scale) is uniform on [loc, loc + scale].
priors = {
    "beta": stats.uniform(0.01, 0.02),     # U(0.01, 0.03)
    "sigma": stats.uniform(0.1, 0.2),      # U(0.1, 0.3) → roughly 3-10 day latent period
    "gamma": stats.uniform(0.1, 0.1)       # U(0.1, 0.2) → 5-10 day infectious period
}

print("Prior distributions:")
for param, dist in priors.items():
    lower, upper = prior_bounds(dist)
    print(f"  {param}: U({lower:.2f}, {upper:.2f})")

In [None]:
from epydemix import simulate
from epydemix.calibration import ABCSampler, rmse

def simulate_wrapper(parameters):
    """Simulate the SEIR model and return its S→E transition series.

    ``parameters`` is unpacked as keyword arguments to ``simulate``. The
    result's ``"S_to_E_total"`` transitions (new exposures) are returned under
    the key ``"data"``.
    """
    simulation_output = simulate(**parameters)
    new_exposures = simulation_output.transitions["S_to_E_total"]
    return {"data": new_exposures}

# Set up the ABC sampler: simulated "data" is compared with the observed
# calibration incidence using the rmse distance.
observed_incidence = data_calibration["data"].values
abc_sampler = ABCSampler(
    distance_function=rmse,
    observed_data=observed_incidence,
    parameters=parameters,
    priors=priors,
    simulation_function=simulate_wrapper,
)

In [None]:
# Calibrate with the "smc" strategy: 100 particles over 5 generations
smc_settings = {"num_particles": 100, "num_generations": 5}
results_seir = abc_sampler.calibrate(strategy="smc", **smc_settings)

### Visualize Posterior Distributions

In [None]:
from epydemix.visualization import plot_posterior_distribution, plot_posterior_distribution_2d

# Posterior samples from the SEIR calibration
posterior = results_seir.get_posterior_distribution()

# One KDE panel per calibrated parameter, each with its own palette color
fig, axes = plt.subplots(1, 3, figsize=(12, 3), dpi=150)

panel_specs = (
    ("beta", "Transmission Rate (β)"),
    ("sigma", "Latent Rate (σ)"),
    ("gamma", "Recovery Rate (γ)"),
)
for panel, panel_color, (param_name, panel_title) in zip(axes, colors, panel_specs):
    plot_posterior_distribution(posterior, param_name, ax=panel, kind="kde",
                                title=panel_title, color=panel_color)

plt.tight_layout()

In [None]:
# Median and interquartile range of each calibrated parameter
print("Posterior parameter estimates (median [IQR]):")
for param in ("beta", "sigma", "gamma"):
    values = posterior[param].values
    q25, q75 = np.percentile(values, [25, 75])
    med = np.median(values)
    print(f"  {param}: {med:.4f} [{q25:.4f}, {q75:.4f}]")

# Durations are the reciprocals of the sampled rates
sigma_draws = posterior["sigma"].values
gamma_draws = posterior["gamma"].values
latent_period = 1 / sigma_draws
infectious_period = 1 / gamma_draws

print("\nDerived quantities:")
print(f"  Latent period: {np.median(latent_period):.1f} days [{np.percentile(latent_period, 25):.1f}, {np.percentile(latent_period, 75):.1f}]")
print(f"  Infectious period: {np.median(infectious_period):.1f} days [{np.percentile(infectious_period, 25):.1f}, {np.percentile(infectious_period, 75):.1f}]")

## Tasks 5 & 6: Run Projections and Compare to Validation Data

In [None]:
# Same settings as calibration, but with the end date moved to the last
# validation date (the original `parameters` dict is left untouched)
projection_parameters = {**parameters, "end_date": data_validation.date.values[-1]}

# Run projections with the calibrated sampler
results_with_projections = abc_sampler.run_projections(projection_parameters)

In [None]:
from epydemix.visualization import plot_quantiles

# Quantile tables for the calibration fit and for the projection
df_calibration = results_with_projections.get_calibration_quantiles(simulation_dates_calibration)
df_projection = results_with_projections.get_projection_quantiles(simulation_dates_full)

fig, ax = plt.subplots(figsize=(12, 5), dpi=150)

# Model output: calibration fit first, then the projection
for quantile_frame, band_color, band_label in (
    (df_calibration, colors[1], "Calibration fit"),
    (df_projection, colors[0], "Projection"),
):
    plot_quantiles(quantile_frame, columns="data", ax=ax,
                   colors=band_color, show_data=False, labels=[band_label])

# Observations: black for calibration, gray for held-out validation
ax.plot(data_calibration["date"], data_calibration["data"], "ko", markersize=4, label="Calibration data")
ax.plot(data_validation["date"], data_validation["data"], "o", color="gray",
        markersize=4, alpha=0.6, label="Validation data")

# Dashed line at the last calibration date
cutoff_date = data_calibration["date"].iloc[-1]
ax.axvline(cutoff_date, color="red", linestyle="--", alpha=0.5)

ax.set(ylabel="New Infections", title="SEIR Model: Calibration and Projection")
ax.legend(loc="upper right")

plt.tight_layout()

## Bonus: Compare SEIR to SIR Calibration

Let's calibrate an SIR model to the same data and compare.

In [None]:
from epydemix import load_predefined_model

# Predefined SIR model with the same Indonesia population as the SEIR model
model_sir = load_predefined_model("SIR")
model_sir.import_epydemix_population(population_name="Indonesia")

# Initial conditions for SIR: 0.05% of each entry of Nk starts Infected
sir_group_sizes = model_sir.population.Nk
sir_seeded_infections = (sir_group_sizes * 0.05 / 100).astype(int)
initial_conditions_sir = {
    "Susceptible": sir_group_sizes - sir_seeded_infections,
    "Infected": sir_seeded_infections,
    "Recovered": np.zeros(len(sir_group_sizes)),
}

# Simulation settings over the calibration window
parameters_sir = {
    "initial_conditions_dict": initial_conditions_sir,
    "epimodel": model_sir,
    "start_date": data_calibration.date.values[0],
    "end_date": data_calibration.date.values[-1],
}

# SIR priors (stats.uniform(loc, scale) is uniform on [loc, loc + scale])
# NOTE(review): recovery_rate is U(0.1, 0.25), wider than the SEIR gamma prior
# U(0.1, 0.2) — confirm this difference is intended for the comparison.
priors_sir = {
    "transmission_rate": stats.uniform(0.01, 0.02),
    "recovery_rate": stats.uniform(0.1, 0.15),
}

def simulate_wrapper_sir(parameters):
    """Simulate the SIR model and return its Susceptible→Infected series.

    ``parameters`` is unpacked as keyword arguments to ``simulate``. The
    result's ``"Susceptible_to_Infected_total"`` transitions are returned under
    the key ``"data"``.
    """
    simulation_output = simulate(**parameters)
    new_infections = simulation_output.transitions["Susceptible_to_Infected_total"]
    return {"data": new_infections}

# ABC sampler for the SIR model: same observed data and distance as for SEIR
abc_sampler_sir = ABCSampler(
    distance_function=rmse,
    observed_data=data_calibration["data"].values,
    parameters=parameters_sir,
    priors=priors_sir,
    simulation_function=simulate_wrapper_sir,
)

# Calibrate with the same SMC settings as the SEIR run (100 particles, 5 generations)
sir_smc_settings = {"num_particles": 100, "num_generations": 5}
results_sir = abc_sampler_sir.calibrate(strategy="smc", **sir_smc_settings)

In [None]:
from epydemix.visualization import plot_distance_distribution

# Overlay the distance (RMSE) distributions of the two calibrations
fig, ax = plt.subplots(figsize=(8, 4), dpi=150)

for model_label, calibration_result, curve_color in (
    ("SEIR", results_seir, colors[0]),
    ("SIR", results_sir, colors[1]),
):
    plot_distance_distribution(calibration_result.get_distances(), ax=ax, kind="kde",
                               color=curve_color, label=model_label, xlabel="RMSE")

ax.set_title("Calibration Performance: SEIR vs SIR")
ax.legend()

# Median distance for each model
print(f"Median RMSE - SEIR: {np.median(results_seir.get_distances()):.0f}")
print(f"Median RMSE - SIR: {np.median(results_sir.get_distances()):.0f}")

In [None]:
# SIR projections: calibration settings with the end date moved to the last validation date
projection_parameters_sir = {**parameters_sir, "end_date": data_validation.date.values[-1]}
results_sir_proj = abc_sampler_sir.run_projections(projection_parameters_sir)

# Quantile tables for the SIR calibration fit and projection
df_sir_calib = results_sir_proj.get_calibration_quantiles(simulation_dates_calibration)
df_sir_proj = results_sir_proj.get_projection_quantiles(simulation_dates_full)

# Side-by-side panels with a shared y-axis: SEIR on the left, SIR on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 4), dpi=150, sharey=True)

cutoff_date = data_calibration["date"].iloc[-1]
panel_specs = (
    (axes[0], "SEIR Model", df_calibration, df_projection),
    (axes[1], "SIR Model", df_sir_calib, df_sir_proj),
)
for panel, panel_title, fit_quantiles, projected_quantiles in panel_specs:
    plot_quantiles(fit_quantiles, columns="data", ax=panel,
                   colors=colors[1], show_data=False, labels=["Calibration"])
    plot_quantiles(projected_quantiles, columns="data", ax=panel,
                   colors=colors[0], show_data=False, labels=["Projection"])
    # All observations (calibration + validation) and the calibration cutoff
    panel.plot(data["date"], data["data"], "ko", markersize=3, alpha=0.5)
    panel.axvline(cutoff_date, color="red", linestyle="--", alpha=0.5)
    panel.set_title(panel_title)
    panel.legend()

# Only the left panel carries the y-axis label
axes[0].set_ylabel("New Infections")

plt.tight_layout()

## Discussion

**How do the posterior distributions compare?**

- The SEIR model has an additional parameter (σ, the latent rate) that the SIR model lacks
- This gives SEIR more flexibility to capture the delay between infection and becoming infectious
- The SEIR β estimate may differ slightly from the SIR estimate because the SEIR model accounts for the latent period

**Does adding the E compartment improve the fit?**

The improvement depends on:
1. **The disease:** If there's a significant latent period, SEIR is more appropriate
2. **The data:** The original data was generated from an SIR model, so SIR may fit equally well
3. **Model complexity:** SEIR has more parameters, which could lead to overfitting if not constrained

**Key takeaways:**
- Model selection should be guided by biological plausibility, not just fit
- More complex models aren't always better—they require more data to constrain
- The latent period matters for forecasting because it affects the timing of epidemic waves
- Validation on held-out data is essential for assessing projection accuracy