# Demonstrate the importance of informative covariates in the Pogit model, using synthetic data on road injuries

If the covariates perfectly predict the true rate lambda and the reporting rate p, then the Pogit model successfully separates these two functions. On the other hand, if lambda or p is a noisy function of its covariates, then the model's recovery gets worse. We demonstrate how strong priors/constraints can help recover the true functions in this setting.

In [None]:
from pathlib import Path
# Relative locations of the helper-module directory (src_dir, added to
# sys.path below) and of the pickled input files (data_dir).
src_dir = Path("../src")
data_dir = Path("../data")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from regmod.data import Data
from regmod.variable import Variable, SplineVariable
from regmod.prior import GaussianPrior, UniformPrior, LinearUniformPrior, SplineUniformPrior, LinearGaussianPrior
from regmod.models import PogitModel, PoissonModel, BinomialModel
from regmod.utils import SplineSpecs
from regmod.optimizer import scipy_optimize

import pickle

# Functions for synthetic data generation
import sys
sys.path.insert(0, str(src_dir))
from roadInjTutorial_utils import *

## Load covariates that will be used to model road injuries

In [None]:
with open(data_dir / 'roadInj_pdCovOnly.pickle', 'rb') as f:
    # Load the pickled covariate table that the rest of the notebook works from.
    # pickle.load can execute arbitrary code, so only open files you trust.
    processed_data = pickle.load(f)

In [None]:
# Display the loaded covariate table as the cell output.
processed_data

In [None]:
with open(data_dir / 'roadInj_lamModel.pickle', 'rb') as f:
    # Load the lambda model that we saved earlier; the pickle holds a pair
    # that is unpacked into ml and rl.
    # NOTE(review): presumably ml is the fitted model and rl its optimizer result — confirm.
    # Neither name is referenced again in the cells visible here.
    ml, rl = pickle.load(f)

## We will generate synthetic data for the true rate of road injuries (lambda) and the rate at which road injuries go to inpatient care (p)

In [None]:
# Noise-free generating function for lambda.
synthLamFun = lamFunGenerator()

# One curve per sex: (sex code, line colour, marker, legend label).
sexStyles = [(1, 'b', '+', "Male"), (0, 'r', 'x', "Female")]
for sex, color, marker, name in sexStyles:
    # 100 evenly spaced ages spanning the range seen in processed_data.
    ageGrid = np.linspace(np.min(processed_data["age"]), np.max(processed_data["age"]), 100)
    # One row per grid age, with sex, intercept and sample_size held constant.
    gridValues = np.array(np.meshgrid([sex], ageGrid, [1], [10**8])).T.reshape(-1, 4)
    pdGrid = pd.DataFrame(data=gridValues,
                          columns=['sex', 'age', '(Intercept)', 'sample_size'])
    # NOTE(review): d is not read again in this cell.
    d = Data(col_covs=["age", "sex", "(Intercept)"], df=pdGrid)
    # Dividing by sample_size expresses the generated value per person.
    ages = pdGrid["age"]
    perPersonRate = synthLamFun(pdGrid, data_dir=data_dir) / pdGrid["sample_size"]
    plt.plot(ages, perPersonRate, color=color, label=name)

plt.xlabel("Age")
plt.ylabel("Total Injury Rate Per Person Per Year")
plt.title("Synthetic generating function for true rate of road injuries data")
plt.legend()
plt.show()

In [None]:
# Noise-free generating function for p.
synthPFun = pFunGenerator()

# Evaluate p on every row and plot it against that row's seatbelt use.
seatbeltUse = processed_data.seatbeltUse_synthetic
inpatientFraction = synthPFun(processed_data)
plt.scatter(seatbeltUse, inpatientFraction, color='k')

plt.xlabel("Seatbelt Use")
plt.ylabel("Fraction of Injuries Treated as Inpatient")
plt.title("Synthetic generating function for rate of inpatient care")
plt.show()

## Generate data for inpatient and outpatient injuries across all age, sex, and seat belt use categories
Assuming seat belt use is independent of age and sex

For each row of data we already have, we will assign a seatbelt-wearing fraction (drawn uniformly from [0, 1]), and then use it to come up with a 'true rate' and a 'reporting rate' for that row of data

In [None]:
def synthesizeData(processed_data, pFun, lamFun, data_dir="./", rng=None):
    """Simulate true and observed injury counts for every row of processed_data.

    For each row, a Poisson draw with mean ``lamStar`` gives the true number of
    injuries, and a binomial draw with that many trials and success
    probability ``pStar`` gives the number that are observed.

    Parameters
    ----------
    processed_data : pandas.DataFrame
        Must provide the columns ``age``, ``sex``, ``(Intercept)``,
        ``seatbeltUse_synthetic``, ``sample_size`` and ``offset``.
    pFun : callable
        Called as ``pFun(processed_data)``; its result is used as the per-row
        binomial success probability.
    lamFun : callable
        Called as ``lamFun(processed_data, data_dir=data_dir)``; its result is
        used as the per-row Poisson mean.
    data_dir : path-like, optional
        Forwarded unchanged to ``lamFun``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for the two draws. When omitted, the global
        ``np.random`` state is used (the historical behaviour), so
        ``np.random.seed`` keeps working. Pass a seeded generator for
        reproducible output that does not depend on global state.

    Returns
    -------
    pandas.DataFrame
        The covariate columns listed above plus ``true_injuries``,
        ``observed_injuries``, ``pStar`` and ``lamStar``.
    """
    # The np.random module and Generator objects both expose poisson/binomial.
    sampler = np.random if rng is None else rng

    # pFun is evaluated before lamFun, as in the original, so any random noise
    # the two functions draw internally keeps the same order.
    pStar = pFun(processed_data)
    lamStar = lamFun(processed_data, data_dir=data_dir)

    trueInjuries = sampler.poisson(lamStar)
    # Only injuries that actually happened can be observed.
    observedInjuries = sampler.binomial(trueInjuries, pStar)

    return pd.DataFrame({"age": processed_data["age"],
                         "sex": processed_data["sex"],
                         "(Intercept)": processed_data["(Intercept)"],
                         "seatbeltUse_synthetic": processed_data["seatbeltUse_synthetic"],
                         "true_injuries": trueInjuries,
                         "observed_injuries": observedInjuries,
                         "pStar": pStar,
                         "lamStar": lamStar,
                         "sample_size": processed_data["sample_size"],
                         "offset": processed_data["offset"]})

In [None]:
# Noise-free generating functions for p and lambda.
synthPFun = pFunGenerator()
synthLamFun = lamFunGenerator()

# One simulated dataset: true and observed injury counts for every covariate row.
sd = synthesizeData(processed_data, synthPFun, synthLamFun, data_dir=data_dir)

# Three stacked panels: lambda vs. age, p vs. seatbelt use, observed rate vs. seatbelt use.
fig, ax = plt.subplots(3, 1, figsize=(4, 8))

# The age grid is the same for both sexes, so build it once.
ageGrid = np.linspace(np.min(processed_data["age"]), np.max(processed_data["age"]), 100)

# Panel 0: generating curve (line) and simulated true injury rate (markers), per sex.
for sex, color, marker, name in [(1, 'b', '+', "Male"), (0, 'r', 'x', "Female")]:
    pdGrid = pd.DataFrame(data=np.array(np.meshgrid([sex], ageGrid, [1], [1])).T.reshape(-1, 4),
                          columns=['sex', 'age', '(Intercept)', 'sample_size'])
    # NOTE(review): d is not read again in this cell; the construction is kept
    # because any side effects of Data(...) are not visible from here.
    d = Data(col_obs='total_injuries',
             col_covs=["age", "sex", "(Intercept)"],
             df=pdGrid)

    ax[0].plot(pdGrid["age"], synthLamFun(pdGrid, data_dir=data_dir)/pdGrid["sample_size"],
               color=color, label=name)

    plotIdx = processed_data.sex == sex
    ax[0].scatter(processed_data[plotIdx].age,
                  sd[plotIdx].true_injuries/processed_data[plotIdx].sample_size,
                  color=color, marker=marker)

# Labels and legend only need to be set once, after both curves are drawn.
ax[0].set_xlabel("Age")
ax[0].set_ylabel("Total Injury Rate\nPer Person Per Year")
ax[0].legend()

# Panel 1: generating curve for p (sorted so the line is drawn left to right)
# and the simulated observed/true ratio for every row.
sortedBySeatbelt = processed_data.sort_values("seatbeltUse_synthetic")
ax[1].plot(sortedBySeatbelt.seatbeltUse_synthetic, synthPFun(sortedBySeatbelt), color='k')
ax[1].scatter(sd.seatbeltUse_synthetic, sd.observed_injuries/sd.true_injuries, color='gray')
ax[1].set_xlabel("Seatbelt Use")
ax[1].set_ylabel("Fraction of Injuries\nTreated as Inpatient")

# Panel 2: observed injuries per person against seatbelt use.
ax[2].scatter(processed_data.seatbeltUse_synthetic, sd.observed_injuries/sd.sample_size, color='k')
ax[2].set_xlabel("Seatbelt Use")
ax[2].set_ylabel("Observed (Inpatient) Injury Rate\nPer Person Per Year")

plt.tight_layout()
plt.show()

## Test recovery across different amounts of noise

In [None]:
# Covariates that enter the model directly, one Variable per column name.
varAge = Variable(name="age")
varSex = Variable(name="sex")
varInt = Variable(name="(Intercept)")
varSeatbelt = Variable(name="seatbeltUse_synthetic")

# Degree-2 spline in age with knots at the smallest age, 16, and the largest age.
ageKnots = np.array([min(processed_data["age"]), 16, max(processed_data["age"])])
ageSplineSpecs = SplineSpecs(knots=ageKnots, knots_type="abs", degree=2)
varAgeSpline2Knot16 = SplineVariable(name="age", spline_specs=ageSplineSpecs)

In [None]:
# Noise-free generating functions, used below only to draw the solid reference curves.
noiselessP = pFunGenerator()
noiselessLam = lamFunGenerator()

# One column of panels per noise setting: top row is lambda vs. age, bottom row is p vs. seatbelt use.
fig, ax = plt.subplots(2, 4, figsize=(12,6))

# Each pair gives the standard deviations of the Gaussian noise handed to the lambda and p generators.
for j, (lamNoise, pNoise) in enumerate([(0,0), (0.2,0.2), (0.5, 0.5), (1,1)]):
    ax[0,j].set_title("Lambda noise "+str(lamNoise)+", p noise "+str(pNoise))
    
    # Generators that receive a zero-mean Gaussian noise sampler for this column.
    # NOTE(review): how the generators apply the noise is defined outside this notebook
    # (presumably in roadInjTutorial_utils) — confirm there.
    l = lamFunGenerator(lambda size: np.random.normal(loc=0, scale=lamNoise, size=size))
    
    p = pFunGenerator(noise=lambda size: np.random.normal(loc=0, scale=pNoise, size=size))
    
    # Simulate true and observed injury counts from the noisy generators.
    sd = synthesizeData(processed_data, p, lamFun=l, data_dir=data_dir)
    
    # Frame handed to the model: covariates plus the simulated counts.
    # total_injuries is kept for plotting only; the model is fit to reported_injuries.
    spd = pd.DataFrame({"age":processed_data.age,
                      "seatbeltUse_synthetic":processed_data.seatbeltUse_synthetic,
                      "total_injuries":sd.true_injuries,
                      "reported_injuries":sd.observed_injuries,
                      "sex":processed_data.sex,
                       "(Intercept)":1,
                       "sample_size": processed_data.sample_size,
                       "offset": processed_data.offset})
    
    
    sdata = Data(col_obs='reported_injuries',
           col_covs=["age", "sex", "(Intercept)", "seatbeltUse_synthetic"],
           df=spd)


    # Pogit model: p depends on intercept + seatbelt use; lam depends on
    # intercept + age spline + sex and is specified with use_offset=True.
    model = PogitModel(sdata,
                    param_specs={"p": {"variables":[varInt, varSeatbelt]
                                      },
                                 "lam": {"use_offset":True,
                                         "variables":[varInt, varAgeSpline2Knot16, varSex]}})

    result = scipy_optimize(model)
       
    
    # Evenly spaced seatbelt-use values on [0, 1] for drawing the fitted p curve.
    gridData = pd.DataFrame({"(Intercept)":1,
                            "seatbeltUse_synthetic":np.linspace(0, 1, 100)})
    
    # Bottom row: simulated reported/total ratio (gray points), noise-free p (solid)
    # and the fitted curve (dashed).
    # NOTE(review): params[0] and the first coefficient split are presumably the "p" parameter — confirm.
    ax[1,j].scatter(spd.seatbeltUse_synthetic, spd.reported_injuries/spd.total_injuries, color='gray', alpha=0.4)
    ax[1,j].plot(spd.sort_values("seatbeltUse_synthetic").seatbeltUse_synthetic, noiselessP(spd.sort_values("seatbeltUse_synthetic")), color='k')
    ax[1,j].plot(gridData.seatbeltUse_synthetic, model.params[0].get_param(model.split_coefs(result["coefs"])[0], 
                                                            Data(col_obs='reported_injuries',
                                                                 col_covs=["(Intercept)", "seatbeltUse_synthetic"],
                                                                 df=gridData)),
            color='k', linestyle='--')
    ax[1,j].set_xlabel("Seatbelt Use")
    
    # Top row, one colour per sex: noise-free lambda (solid), simulated true
    # injury rate (markers) and the fitted curve (dashed).
    for sex, color, marker, name in [(1, 'b', '+', "Male"), (0, 'r', 'x', "Female")]:
        # Grid of 100 ages across the observed range, with sex, intercept and sample_size held constant.
        pdGrid = pd.DataFrame(data=np.array(np.meshgrid([sex], 
                                               np.linspace(np.min(processed_data["age"]), np.max(processed_data["age"]), 100),
                                               [1], [1])).T.reshape(-1, 4),
                                      columns=['sex', 'age', '(Intercept)', 'sample_size'])
        d = Data(col_obs='total_injuries',
               col_covs=["age", "sex", "(Intercept)"],
               df=pdGrid)

        ax[0,j].plot(pdGrid["age"], noiselessLam(pdGrid, data_dir=data_dir)/pdGrid["sample_size"], 
                color=color, label=name)

        plotIdx = processed_data.sex == sex
        ax[0,j].scatter(processed_data[plotIdx].age, sd[plotIdx].true_injuries/processed_data[plotIdx].sample_size, 
                   color=color, marker=marker, alpha=0.4)
        # NOTE(review): params[1] and the second coefficient split are presumably the "lam" parameter — confirm.
        ax[0,j].plot(pdGrid.age, 
                 model.params[1].get_param(model.split_coefs(result["coefs"])[1], d)/pdGrid.sample_size,
                 color=color, linestyle='--')
    
    ax[0,j].set_xlabel("Age")
# y-axis labels and the legend are set on the first column only.
ax[0,0].set_ylabel("Total Injury Rate\nPer Person Per Year")
ax[0,0].legend()
ax[1,0].set_ylabel("Fraction of Injuries\nTreated as Inpatient")

plt.tight_layout()
plt.show()