# Importance of informative covariates in the Pogit model

If the covariates perfectly predict the true rate lambda and reporting rate p, then the Pogit model successfully separates these two functions. On the other hand, if lambda or p are noisy functions of their covariates, then the model's recovery gets worse. We demonstrate how strong priors/constraints can be helpful in this setting to recover the true functions using synthetic data on road injuries.

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from xspline import XSpline

from regmod.data import Data
from regmod.variable import Variable, SplineVariable
from regmod.prior import GaussianPrior, UniformPrior, LinearUniformPrior, SplineUniformPrior, LinearGaussianPrior
from regmod.models import PogitModel, PoissonModel, BinomialModel
from regmod.utils import SplineSpecs
from regmod.optimizer import scipy_optimize

## Define utility functions

In [None]:
def logit(p):
    """Return the log-odds ``log(p / (1 - p))`` of a probability ``p``.

    The computation is elementwise, so ``p`` may be a scalar or a NumPy array.
    """
    odds = p / (1 - p)
    return np.log(odds)

def expit(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The computation is elementwise, so ``x`` may be a scalar or an array.

    Only ``exp(-|x|)`` is ever evaluated, so the exponential cannot overflow:
    the naive ``exp(x) / (1 + exp(x))`` turns into ``inf / inf = nan`` for
    large positive ``x``, whereas this form saturates cleanly at 1.0 (and at
    0.0 for large negative ``x``).
    """
    # z = exp(-|x|) always lies in (0, 1].
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    numerator = np.where(np.asarray(x) >= 0, 1.0, z)
    return numerator / (1.0 + z)

def get_logit_p(df, coefs):
    """Evaluate the linear predictor of the reporting rate on the logit scale.

    Parameters
    ----------
    df
        Data frame with a ``seatbeltUse_synthetic`` column.
    coefs
        Mapping with the keys ``"Intercept"`` and ``"seatbeltUse_synthetic"``.

    Returns
    -------
    One value per row of ``df``: intercept plus seatbelt-use effect.
    """
    seatbelt_use = df["seatbeltUse_synthetic"].values
    return coefs["Intercept"] + seatbelt_use * coefs["seatbeltUse_synthetic"]

def get_log_lam(df, coefs, age_spline):
    """Evaluate the linear predictor of the injury rate on the log scale.

    Parameters
    ----------
    df
        Data frame with ``age`` and ``sex`` columns.
    coefs
        Mapping with the keys ``"Intercept"``, ``"age"`` (one coefficient per
        column of the spline design matrix) and ``"sex"``.
    age_spline
        Object whose ``design_mat(ages)`` returns the spline design matrix.

    Returns
    -------
    One value per row of ``df``: intercept + age spline effect + sex effect.
    """
    age_basis = age_spline.design_mat(df.age.values)
    age_effect = age_basis.dot(coefs["age"])
    sex_effect = df.sex.values * coefs["sex"]
    return coefs["Intercept"] + age_effect + sex_effect

def generate_lam(df, coefs, age_spline, noise_sd=0.0):
    """Return the expected injury count for every row of ``df``.

    Gaussian noise with standard deviation ``noise_sd`` is added to the log
    rate before exponentiating, and the resulting rate is scaled by the row's
    ``sample_size``. The noise comes from NumPy's global random state.
    """
    # The noise is drawn even when noise_sd is 0 (it is then all zeros).
    log_noise = np.random.normal(loc=0.0, scale=noise_sd, size=len(df))
    noisy_log_rate = get_log_lam(df, coefs, age_spline) + log_noise
    rate = np.exp(noisy_log_rate)
    return rate * df.sample_size.values

def generate_p(df, coefs, noise_sd=0.0):
    """Return the reporting probability for every row of ``df``.

    Gaussian noise with standard deviation ``noise_sd`` is added on the logit
    scale before mapping back to a probability. The noise comes from NumPy's
    global random state.
    """
    # The noise is drawn even when noise_sd is 0 (it is then all zeros).
    logit_noise = np.random.normal(loc=0.0, scale=noise_sd, size=len(df))
    noisy_logit = get_logit_p(df, coefs) + logit_noise
    return expit(noisy_logit)

def synthesize_data(df, p_coefs, lam_coefs, age_spline, p_noise_sd=0.0, lam_noise_sd=0.0):
    """Simulate true and observed injury counts for every row of ``df``.

    For each row a reporting probability (``pStar``) and an expected injury
    count (``lamStar``) are generated from the given coefficients, optionally
    with noise. The true number of injuries is then drawn from a Poisson
    distribution with mean ``lamStar``, and the observed number from a binomial
    distribution with that many trials and success probability ``pStar``.

    Returns
    -------
    A copy of ``df`` with the extra columns ``true_injuries``,
    ``observed_injuries``, ``pStar`` and ``lamStar``; ``df`` is not modified.
    """
    # All draws use NumPy's global random state, so the order of these calls
    # determines the random stream.
    reporting_prob = generate_p(df, p_coefs, noise_sd=p_noise_sd)
    expected_count = generate_lam(df, lam_coefs, age_spline, noise_sd=lam_noise_sd)

    true_count = np.random.poisson(expected_count)
    observed_count = np.random.binomial(true_count, reporting_prob)

    return df.assign(
        true_injuries=true_count,
        observed_injuries=observed_count,
        pStar=reporting_prob,
        lamStar=expected_count,
    )

In [None]:
# plotting function
def plot_lam_curve(coefs, age_spline, linestyle="-", label_prefix="", ax=None):
    """Plot the noise-free injury-rate curve against age, one line per sex.

    The curve is evaluated on 100 evenly spaced ages between the first and last
    knot of ``age_spline`` with a sample size of 1. A new figure is created
    when ``ax`` is not given. Returns the axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    age_grid = np.linspace(age_spline.knots[0], age_spline.knots[-1], 100)
    grid = pd.DataFrame({"age": age_grid, "sample_size": 1})

    # Male (sex == 1) is drawn in blue, female (sex == 0) in red.
    for sex_code, line_color, sex_name in ((1, "b", "Male"), (0, "r", "Female")):
        grid["sex"] = sex_code
        rate = generate_lam(grid, coefs, age_spline)
        ax.plot(grid.age, rate,
                color=line_color, label=label_prefix + sex_name, linestyle=linestyle)
    return ax

def plot_p_curve(coefs, ax=None, label="", linestyle="-"):
    """Plot the noise-free reporting probability against seatbelt use.

    The curve is evaluated on 100 evenly spaced seatbelt-use values in [0, 1]
    and drawn in black. A new figure is created when ``ax`` is not given.
    Returns the axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    grid = pd.DataFrame({"seatbeltUse_synthetic": np.linspace(0.0, 1.0, 100)})
    reporting_prob = generate_p(grid, coefs)
    ax.plot(grid.seatbeltUse_synthetic, reporting_prob,
            color="k", label=label, linestyle=linestyle)
    return ax

def plot_lam_data(df, ax=None):
    """Scatter the per-row true injury rate against age, split by sex.

    The rate is ``true_injuries / sample_size``. Rows with ``sex == 1`` are
    drawn as blue "+" markers and rows with ``sex == 0`` as red "x" markers.
    A new figure is created when ``ax`` is not given. Returns the axes that
    were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    true_rate = df.true_injuries / df.sample_size
    for sex_code, point_color, point_marker in ((1, "b", "+"), (0, "r", "x")):
        rows = df["sex"] == sex_code
        ax.scatter(df.age[rows], true_rate[rows], color=point_color, marker=point_marker)
    return ax

def plot_p_data(df, ax=None):
    """Scatter the per-row reported fraction against seatbelt use.

    The fraction is ``observed_injuries / true_injuries``, drawn as gray
    points. A new figure is created when ``ax`` is not given.

    Returns
    -------
    The axes that were drawn on, like the other plotting helpers in this
    notebook, so the result can be used when ``ax`` is omitted.
    """
    if ax is None:
        _, ax = plt.subplots()

    # Rows with true_injuries == 0 give a non-finite ratio.
    # NOTE(review): matplotlib presumably skips such points - confirm.
    reported_fraction = df.observed_injuries / df.true_injuries
    ax.scatter(df.seatbeltUse_synthetic, reported_fraction, color="gray")
    return ax

## Load data and specs

In [None]:
# Directory with the road-injury input files, relative to the current
# working directory.
data_dir = Path("..") / "data"

In [None]:
# Covariate table that the synthetic data is generated from.
roadInj_data = pd.read_csv(data_dir.joinpath("roadInj_data.csv"))

In [None]:
# Load the generating specification: spline settings and true coefficients.
# NOTE(review): yaml.full_load can construct more than plain data types; if the
# specs file holds only plain mappings/lists/numbers, yaml.safe_load is the
# safer choice - confirm against the file before switching.
with (data_dir / "roadInj_specs.yml").open("r") as f:
    roadInj_specs = yaml.full_load(f)

p_coefs, lam_coefs = (roadInj_specs[key] for key in ("p_coefs", "lam_coefs"))
age_spline = XSpline(**roadInj_specs["age_spline_specs"])

## True parameters for synthetic data
We will generate synthetic data for the true rate of road injuries (lambda) and the rate at which road injuries go to inpatient care (p).

In [None]:
# Generating injury-rate curves by age, one line per sex.
ax = plot_lam_curve(lam_coefs, age_spline)
ax.set(
    xlabel="Age",
    ylabel="Total Injury Rate Per Person Per Year",
    title="Synthetic generating function for true rate of road injuries data",
)
ax.legend()

In [None]:
# Generating reporting-rate curve as a function of seatbelt use.
ax = plot_p_curve(p_coefs)
ax.set(
    xlabel="Seatbelt Use",
    ylabel="Fraction of Injuries Treated as Inpatient",
)
ax.set_title("Synthetic generating function for rate of inpatient care")

## Generate synthetic data
Generate data for inpatient and outpatient injuries across all age, sex, and seat belt use categories.

We assume that seat belt use is independent of age and sex.
For each row of data we already have, we will assign some fraction of seatbelt-wearing (uniformly in [0, 1]), and then use this to come up with a "true rate" and a "reporting rate" for that row of data.

In [None]:
# Draw one synthetic data set without extra noise on lambda or p.
syn_df = synthesize_data(roadInj_data, p_coefs, lam_coefs, age_spline)

fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(4, 8))

# Top panel: generating injury-rate curves with the simulated true rates.
plot_lam_curve(lam_coefs, age_spline, ax=ax[0])
plot_lam_data(syn_df, ax=ax[0])
ax[0].set(xlabel="Age", ylabel="Total Injury Rate\nPer Person Per Year")
ax[0].legend()

# Middle panel: generating reporting-rate curve with the simulated fractions.
plot_p_curve(p_coefs, ax=ax[1])
plot_p_data(syn_df, ax=ax[1])
ax[1].set(xlabel="Seatbelt Use", ylabel="Fraction of Injuries\nTreated as Inpatient")

# Bottom panel: observed injuries per unit sample size against seatbelt use.
observed_rate = syn_df.observed_injuries / syn_df.sample_size
ax[2].scatter(syn_df.seatbeltUse_synthetic, observed_rate, color="k")
ax[2].set(xlabel="Seatbelt Use", ylabel="Observed (Inpatient) Injury Rate\nPer Person Per Year")

fig.tight_layout()
plt.show()

## Test recovery across different amounts of noise

In [None]:
# Plain covariates shared by the model parameters.
varInt = Variable(name="(Intercept)")
varAge = Variable(name="age")
varSex = Variable(name="sex")

# Seatbelt-use covariate with a uniform prior on [-3.5, -2.5].
# NOTE(review): presumably this bounds the fitted coefficient - confirm
# against the regmod prior documentation.
varSeatbelt = Variable(
    name="seatbeltUse_synthetic",
    priors=[UniformPrior(lb=-3.5, ub=-2.5)],
)

# Degree-2 age spline with knots at the minimum age, 16 and the maximum age.
# NOTE(review): knots_type="abs" presumably means the knots are given in age
# units rather than as fractions or quantiles - confirm.
varAgeSpline2Knot16 = SplineVariable(
    name="age",
    spline_specs=SplineSpecs(
        knots=np.array([roadInj_data.age.min(), 16, roadInj_data.age.max()]),
        degree=2,
        knots_type="abs",
        include_first_basis=False,
    ),
)

In [None]:
def fit_model(df):
    """Fit a Pogit model to ``df`` and return it with its coefficients.

    The reporting rate ``p`` is modelled with an intercept and seatbelt use;
    the injury rate ``lam`` with an intercept, the age spline and sex. The
    model is fitted with ``scipy_optimize``.

    Returns
    -------
    model
        The fitted ``PogitModel``.
    p_coefs
        Dict with the keys ``"Intercept"`` and ``"seatbeltUse_synthetic"``.
    lam_coefs
        Dict with the keys ``"Intercept"``, ``"age"`` (the spline
        coefficients) and ``"sex"``.
    age_spline
        The spline object of the fitted age variable.
    """
    data = Data(
        col_obs="observed_injuries",
        col_covs=["age", "sex", "(Intercept)", "seatbeltUse_synthetic"],
        df=df,
    )

    param_specs = {
        "p": {"variables": [varInt, varSeatbelt]},
        "lam": {"use_offset": True, "variables": [varInt, varAgeSpline2Knot16, varSex]},
    }
    model = PogitModel(data, param_specs=param_specs)
    result = scipy_optimize(model)

    # Split the flat coefficient vector into its p and lam parts.
    p_vec, lam_vec = model.split_coefs(result["coefs"])
    fitted_p = {"Intercept": p_vec[0], "seatbeltUse_synthetic": p_vec[1]}
    # lam layout follows the variable order: intercept, spline terms, sex.
    fitted_lam = {"Intercept": lam_vec[0], "age": lam_vec[1:-1], "sex": lam_vec[-1]}

    # Second parameter (lam), second variable (age spline).
    fitted_spline = model.params[1].variables[1].spline

    return model, fitted_p, fitted_lam, fitted_spline

In [None]:
# One column per noise level: top row shows lambda, bottom row shows p.
fig, ax = plt.subplots(2, 4, figsize=(12, 6))
models = []
for j, (lamNoise, pNoise) in enumerate([(0, 0), (0.2, 0.2), (0.5, 0.5), (1, 1)]):
    ax_lam, ax_p = ax[0, j], ax[1, j]
    ax_lam.set_title(f"Lambda noise {lamNoise}, p noise {pNoise}")

    # Simulate a data set at this noise level and fit the Pogit model to it.
    syn_df = synthesize_data(roadInj_data, p_coefs, lam_coefs, age_spline,
                             p_noise_sd=pNoise, lam_noise_sd=lamNoise)
    syn_model, syn_p_coefs, syn_lam_coefs, syn_age_spline = fit_model(syn_df)
    models.append(syn_model)

    # True (solid) versus fitted (dashed) injury-rate curves.
    plot_lam_curve(lam_coefs, age_spline, ax=ax_lam, label_prefix="True ")
    plot_lam_curve(syn_lam_coefs, syn_age_spline, ax=ax_lam, linestyle="--", label_prefix="Fit ")
    plot_lam_data(syn_df, ax=ax_lam)
    ax_lam.set_xlabel("Age")

    # True (solid) versus fitted (dashed) reporting-rate curves.
    plot_p_curve(p_coefs, ax=ax_p, label="True")
    plot_p_curve(syn_p_coefs, ax=ax_p, label="Fit", linestyle="--")
    plot_p_data(syn_df, ax=ax_p)
    ax_p.set_xlabel("Seatbelt Use")

# Legends and y-labels only on the first column to keep the grid uncluttered.
ax[0, 0].legend()
ax[1, 0].legend()
ax[0, 0].set_ylabel("Total Injury Rate\nPer Person Per Year")
ax[1, 0].set_ylabel("Fraction of Injuries\nTreated as Inpatient")

fig.tight_layout()
plt.show()