# Importance of informative covariates in the Pogit model

If the covariates perfectly predict the true rate lambda and reporting rate p, then the Pogit model successfully separates these two functions. On the other hand, if lambda or p are noisy functions of their covariates, then the model's recovery gets worse. We demonstrate how strong priors/constraints can be helpful in this setting to recover the true functions using synthetic data on road injuries.

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from xspline import XSpline

from regmod.data import Data
from regmod.variable import Variable, SplineVariable
from regmod.prior import GaussianPrior, UniformPrior, LinearUniformPrior, SplineUniformPrior, LinearGaussianPrior
from regmod.models import PogitModel, PoissonModel, BinomialModel
from regmod.utils import SplineSpecs
from regmod.optimizer import scipy_optimize

## Define utility functions

In [None]:
def logit(p):
    """Return the log-odds ``log(p / (1 - p))`` of ``p``.

    The arithmetic and ``np.log`` are applied elementwise, so ``p`` may be a
    scalar or an array-like of probabilities.
    """
    odds = p / (1 - p)
    return np.log(odds)

def expit(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    Evaluated as ``exp(-log(1 + exp(-x)))`` using ``np.logaddexp`` so that
    large ``|x|`` saturates cleanly to 0 or 1. The naive form
    ``exp(x) / (1 + exp(x))`` overflows to ``inf / inf = nan`` for large
    positive ``x``.

    Only NumPy ufuncs are used, so ``x`` may be a scalar or an array-like and
    the computation is elementwise.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed
    # without forming exp(-x) explicitly.
    return np.exp(-np.logaddexp(0, -x))

def generate_lam(df, specs, noise_sd=0.0):
    """Return the expected number of injuries for every row of ``df``.

    The log rate is the sum of an intercept, a spline function of ``df.age``,
    a linear ``df.sex`` effect and Gaussian noise with standard deviation
    ``noise_sd``. The exponentiated rate is scaled by ``df.sample_size``.

    Parameters
    ----------
    df
        Frame with ``age``, ``sex`` and ``sample_size`` columns.
    specs
        Mapping with a ``"coefs"`` entry (keys ``"Intercept"``, ``"age"``,
        ``"sex"``) and an ``"age_spline"`` object exposing ``design_mat``.
    noise_sd
        Standard deviation of the noise added on the log scale.
    """
    coefs = specs["coefs"]
    n_rows = df.shape[0]

    age_basis = specs["age_spline"].design_mat(df.age.values)
    eps = np.random.normal(loc=0.0, scale=noise_sd, size=n_rows)

    log_rate = (
        coefs["Intercept"]
        + age_basis.dot(coefs["age"])
        + df.sex.values * coefs["sex"]
        + eps
    )
    return df.sample_size.values * np.exp(log_rate)

def get_true_logit_p(seatbeltuse):
    """Return the noise-free logit of the reporting rate for ``seatbeltuse``.

    The relationship is linear: the logit equals -1 at ``seatbeltuse == 1``
    and changes by -3 per unit of seatbelt use.
    """
    slope = -3
    logit_at_full_use = -1
    return slope * (seatbeltuse - 1) + logit_at_full_use

def generate_p(df, noise_sd=0.0):
    """Return the reporting rate for every row of ``df``.

    The noise-free logit is taken from ``get_true_logit_p`` applied to
    ``df.seatbeltUse_synthetic``. Gaussian noise with standard deviation
    ``noise_sd`` is added on the logit scale before mapping back with
    ``expit``.
    """
    n_rows = df.shape[0]
    eps = np.random.normal(loc=0.0, scale=noise_sd, size=n_rows)
    clean_logit_p = get_true_logit_p(df.seatbeltUse_synthetic)
    return expit(clean_logit_p + eps)

def synthesize_data(df, specs, p_noise_sd=0.0, lam_noise_sd=0.0):
    """Simulate true and observed injury counts for every row of ``df``.

    A reporting rate (``generate_p``) and an expected injury count
    (``generate_lam``) are generated per row. True injuries are then drawn
    from a Poisson distribution with that expectation, and observed injuries
    from a binomial distribution over the true injuries with the reporting
    rate as success probability.

    Returns a copy of ``df`` with the added columns ``true_injuries``,
    ``observed_injuries``, ``pStar`` and ``lamStar``; ``df`` itself is not
    modified.
    """
    # The order of the random draws is kept fixed: p noise, lambda noise,
    # Poisson counts, binomial thinning.
    report_rate = generate_p(df, noise_sd=p_noise_sd)
    expected_count = generate_lam(df, specs, noise_sd=lam_noise_sd)

    n_true = np.random.poisson(expected_count)
    n_observed = np.random.binomial(n_true, report_rate)

    out = df.copy()
    new_columns = (
        ("true_injuries", n_true),
        ("observed_injuries", n_observed),
        ("pStar", report_rate),
        ("lamStar", expected_count),
    )
    for col, values in new_columns:
        out[col] = values
    return out

## Load data and specs

In [None]:
# Directory holding the input files, given as a relative path.
data_dir = Path("..") / "data"

In [None]:
# Input rows for the simulation; later cells read the age, sex, sample_size
# and seatbeltUse_synthetic columns.
roadInj_data = pd.read_csv(data_dir.joinpath("roadInj_data.csv"))

In [None]:
# Load the generating-function specs (coefficients and spline settings).
# NOTE(review): yaml.full_load is only appropriate for trusted files - confirm
# this specs file is always project-controlled.
specs_path = data_dir / "roadInj_specs.yml"
with specs_path.open("r") as f:
    roadInj_specs = yaml.full_load(f)
# Build the age spline object from its keyword specs and store it next to them.
roadInj_specs["age_spline"] = XSpline(**roadInj_specs["age_spline_specs"])

## True parameters for synthetic data
We will generate synthetic data for the true rate of road injuries (lambda) and the rate at which road injuries go to inpatient care (p).

In [None]:
# Plot the noise-free injury rate per person against age, one curve per sex.
age_grid = np.linspace(roadInj_data.age.min(), roadInj_data.age.max(), 100)
pdGrid = pd.DataFrame({"age": age_grid, "sex": 0, "sample_size": 1e8})

sex_styles = {1: ('b', "Male"), 0: ('r', "Female")}
for sex, (color, name) in sex_styles.items():
    pdGrid["sex"] = sex
    # generate_lam returns counts; dividing by sample_size gives a rate.
    rate_per_person = generate_lam(pdGrid, roadInj_specs) / pdGrid.sample_size
    plt.plot(pdGrid.age, rate_per_person, color=color, label=name)

plt.xlabel("Age")
plt.ylabel("Total Injury Rate Per Person Per Year")
plt.title("Synthetic generating function for true rate of road injuries data")
plt.legend()

In [None]:
# Plot the noise-free reporting rate against seatbelt use for every data row.
seatbelt_use = roadInj_data.seatbeltUse_synthetic
reporting_rate = generate_p(roadInj_data)
plt.scatter(seatbelt_use, reporting_rate, color='k')
plt.xlabel("Seatbelt Use")
plt.ylabel("Fraction of Injuries Treated as Inpatient")
plt.title("Synthetic generating function for rate of inpatient care")

## Generate data for inpatient and outpatient injuries across all age, sex, and seat belt use categories
Assuming seat belt use is independent of age and sex

For each row of data we already have, we will assign some fraction of seatbelt-wearing (uniformly in 0,1), and then use this to come up with a 'true rate' and a 'reporting rate' for that row of data

In [None]:
# Draw one noise-free synthetic data set and compare it, panel by panel, with
# the functions that generated it.
sd = synthesize_data(roadInj_data, roadInj_specs)
pdGrid = pd.DataFrame({
    "age": np.linspace(roadInj_data.age.min(), roadInj_data.age.max(), 100),
    "sex": 0,
    # The value cancels out below (generate_lam multiplies by sample_size and
    # the plot divides by it again); 1e8 matches the earlier age grid.
    "sample_size": 1e8,
})

fig, ax = plt.subplots(3, 1, figsize=(4, 8))

# Top panel: generating injury-rate curve (line) and simulated true injury
# rate (markers) against age, per sex.
for sex, color, marker, name in [(1, 'b', '+', "Male"), (0, 'r', 'x', "Female")]:
    pdGrid["sex"] = sex
    ax[0].plot(pdGrid.age, generate_lam(pdGrid, roadInj_specs)/pdGrid.sample_size, color=color, label=name)

    plotIdx = roadInj_data.sex == sex
    ax[0].scatter(roadInj_data[plotIdx].age,
                  sd[plotIdx].true_injuries/roadInj_data[plotIdx].sample_size,
                  color=color, marker=marker)
ax[0].set_xlabel("Age")
ax[0].set_ylabel("Total Injury Rate\nPer Person Per Year")
ax[0].legend()

# Middle panel: generating reporting-rate curve (line) and simulated fraction
# of injuries that were observed (markers) against seatbelt use.
# The line needs x in increasing order. Sort a copy rather than roadInj_data
# itself, so the shared input frame keeps its row order.
curve_data = roadInj_data.sort_values("seatbeltUse_synthetic")
ax[1].plot(curve_data.seatbeltUse_synthetic, generate_p(curve_data), color='k')
ax[1].scatter(sd.seatbeltUse_synthetic, sd.observed_injuries/sd.true_injuries, color='gray')
ax[1].set_xlabel("Seatbelt Use")
ax[1].set_ylabel("Fraction of Injuries\nTreated as Inpatient")

# Bottom panel: observed injury rate against seatbelt use. x and y are both
# taken from sd so that each point pairs values from the same row.
ax[2].scatter(sd.seatbeltUse_synthetic, sd.observed_injuries/sd.sample_size, color='k')
ax[2].set_xlabel("Seatbelt Use")
ax[2].set_ylabel("Observed (Inpatient) Injury Rate\nPer Person Per Year")

plt.tight_layout()

## Test recovery across different amounts of noise

In [None]:
# Plain covariates available to the model, one Variable per column name.
varAge, varSex, varInt, varSeatbelt = (
    Variable(name=cov_name)
    for cov_name in ("age", "sex", "(Intercept)", "seatbeltUse_synthetic")
)

# Degree-2 spline in age with knots at the minimum age, 16, and the maximum
# age (knots_type="abs": presumably absolute covariate values - confirm
# against the SplineSpecs documentation).
age_knots = np.array([roadInj_data.age.min(), 16, roadInj_data.age.max()])
varAgeSpline2Knot16 = SplineVariable(
    name="age",
    spline_specs=SplineSpecs(
        knots=age_knots,
        knots_type="abs",
        degree=2,
        include_first_basis=True,
    ),
)

In [None]:
# Fit the Pogit model to synthetic data at four increasing noise levels.
# Top row: injury rate vs. age; bottom row: reporting fraction vs. seatbelt use.
# Solid lines are the noise-free generating functions, dashed lines the fitted
# model, and markers the simulated data.
fig, ax = plt.subplots(2, 4, figsize=(12,6))

for j, (lamNoise, pNoise) in enumerate([(0, 0), (0.2, 0.2), (0.5, 0.5), (1, 1)]):
    ax[0, j].set_title(f"Lambda noise {lamNoise}, p noise {pNoise}")
    sd = synthesize_data(roadInj_data, roadInj_specs, p_noise_sd=pNoise, lam_noise_sd=lamNoise)
    spd = sd.copy()

    # create data object
    sdata = Data(col_obs='observed_injuries',
                 col_covs=["age", "sex", "(Intercept)", "seatbeltUse_synthetic"],
                 df=spd)

    # create model object
    model = PogitModel(sdata,
                       param_specs={"p": {"variables":[varInt, varSeatbelt]},
                                    "lam": {"use_offset":True, "variables":[varInt, varAgeSpline2Knot16, varSex]}})
    result = scipy_optimize(model)
    # One coefficient vector per model parameter; index 0 is used with
    # model.params[0] (p) and index 1 with model.params[1] (lam) below.
    coefs_by_param = model.split_coefs(result["coefs"])

    # --- bottom row: reporting rate p ---
    # Predict p on an evenly spaced seatbelt-use grid. The prediction runs on
    # its own Data object, so the fitted data object needs no re-attaching.
    df_pred_p = pd.DataFrame({"(Intercept)":1,
                              "seatbeltUse_synthetic":np.linspace(0, 1, 100)})
    data_pred_p = Data(col_covs=list(df_pred_p.columns), df=df_pred_p)
    pred_p = model.params[0].get_param(coefs_by_param[0], data_pred_p)

    # Sort so the generating curve is drawn with increasing x.
    spd.sort_values("seatbeltUse_synthetic", inplace=True)
    ax[1, j].scatter(spd.seatbeltUse_synthetic, spd.observed_injuries/spd.true_injuries, color='gray', alpha=0.4)
    ax[1, j].plot(spd.seatbeltUse_synthetic, generate_p(spd), color='k')
    # Fitted curve is plotted against the grid it was predicted on.
    ax[1, j].plot(df_pred_p.seatbeltUse_synthetic, pred_p, color='k', linestyle='--')
    ax[1, j].set_xlabel("Seatbelt Use")

    # --- top row: injury rate lambda ---
    df_pred_lam = pd.DataFrame({
        "age": np.linspace(roadInj_data.age.min(), roadInj_data.age.max(), 100),
        "sex": 0,
        "sample_size": 1,
        "(Intercept)": 1
    })
    for sex, color, marker, name in [(1, 'b', '+', "Male"), (0, 'r', 'x', "Female")]:
        df_pred_lam["sex"] = sex
        data_pred_lam = Data(col_covs=list(df_pred_lam.columns), df=df_pred_lam)
        ax[0, j].plot(df_pred_lam.age, generate_lam(df_pred_lam, roadInj_specs)/df_pred_lam.sample_size,
                      color=color, label=name)

        plotIdx = roadInj_data.sex == sex
        ax[0, j].scatter(roadInj_data[plotIdx].age, sd[plotIdx].true_injuries/roadInj_data[plotIdx].sample_size,
                         color=color, marker=marker, alpha=0.4)
        pred_lam = model.params[1].get_param(coefs_by_param[1], data_pred_lam)
        ax[0, j].plot(df_pred_lam.age, pred_lam/df_pred_lam.sample_size, color=color, linestyle='--')

    ax[0, j].set_xlabel("Age")

# Shared y-labels and legend only on the leftmost column.
ax[0, 0].set_ylabel("Total Injury Rate\nPer Person Per Year")
ax[0, 0].legend()
ax[1, 0].set_ylabel("Fraction of Injuries\nTreated as Inpatient")

plt.tight_layout()