In [1]:
from pathlib import Path

import polars as pl
import numpy as np
from scipy import stats 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS, init_to_median
import jax
from jax import random
import jax.numpy as jnp

In [2]:
# Location of the CSV of collected vehicle observations; it is read in the
# "Analysis with Actual Data" section.
collected_vehicle_data_path = Path("..", "raw_data", "collected-data", "collected_vehicle_data.csv")
# Make JAX use 64-bit precision instead of its 32-bit default.
jax.config.update('jax_enable_x64', True)

# Research Question: What is the distribution of vehicle model years in the target population?

Note that we include vehicles that are driven in Utah County without being registered in Utah County.  Thus, we provide added information to what is publicly available from government registration records.

## Strategy
 Use the registration counts as the concentration parameters for a Dirichlet distribution.  Use the technique [here](https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical_or_multinomial) to use these concentration parameters as pseudocounts to be added to our observed counts.  The summed counts can then be used as the concentration parameter for the posterior Dirichlet distribution of the relative frequencies of different vehicle model years in the population.

## ETL
New vehicles are still being sold for 2024, 2025, and 2026 model years, but not for model year 2023.  The registration data that we have is for vehicles registered in 2024 all of the way up to February 17, 2025.  Thus, there may be additional registrations for the newer model year vehicles between February 2025 and March 2025 which are not in our dataset.  We can modify the registration counts for these new model years using the count for model year 2023. 

Assume that if the registration data were to go all of the way up to February 17, 2026 that there would be the same number of registrations expiring for model year 2024 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/12. 

Assume that if the registration data were to go all of the way up to February 17, 2027 that there would be the same number of registrations expiring for model year 2025 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/24. 

Assume that if the registration data were to go all of the way up to February 17, 2028 that there would be the same number of registrations expiring for model year 2026 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/36. 

In [3]:
# Registration counts by model year.  The counts arrive as strings with
# thousands separators, so strip the commas before casting to integers.
source = Path("..", "raw_data", "registrations", "registrations.csv")
registration_count = (
    pl.col("num_registrations")
    .str.replace_all(",", "")
    .cast(pl.Int64)
)

# Materialise once, then go back to a lazy frame for the later cells.
reg = (
    pl.scan_csv(source=source)
    .with_columns(registration_count.alias("num_registrations"))
    .collect()
    .lazy()
)

reg.collect().tail()

model_year,num_registrations
i64,i64
2022,30312
2023,31266
2024,27037
2025,5830
2026,8


In [4]:
# Registration count for model year 2023, used as the reference level for the
# adjustment of the newer model years below.
reg_2023 = (reg
    .filter(pl.col("model_year") == 2023)
    .select("num_registrations")
    .collect()
    .item()
)

# For model years after 2023, move the count toward the 2023 count by
# 1 / (12 * k) of the gap, where k = model_year - 2023 (1/12 for 2024,
# 1/24 for 2025, 1/36 for 2026).  All other model years are left unchanged.
reg_2 = (reg
    .with_columns(
        pl.when(pl.col("model_year") > 2023)
        .then(pl.col("num_registrations") + (reg_2023 - pl.col("num_registrations")) / (12 * (pl.col("model_year") - 2023)))
        .otherwise(pl.col("num_registrations"))
        # The cast applies to the whole when/then/otherwise result, so the
        # fractional part of an adjusted count is dropped.
        .cast(pl.Int64)
        .alias("num_registrations")
    )
)

reg_2.tail().collect()

model_year,num_registrations
i64,i64
2022,30312
2023,31266
2024,27389
2025,6889
2026,876


## Extrapolate the registration counts for pre-1913 model year vehicles
The first steam-powered vehicle dates back to 1672 ([Wikipedia](https://en.wikipedia.org/wiki/History_of_the_automobile#Steam-powered_wheeled_vehicles)).  The Utah registration data starts at model year 1913.  We assume that the number of vehicle registrations is 0 for model years not listed in the government data.  We also assume that the model year of a vehicle must be between 1672 and 2026.

In [5]:
yr_range = range(1672, 2027)

# One row per admissible model year, so that no year is missing downstream.
reg_3 = pl.LazyFrame(
    data={"model_year": list(yr_range)},
    schema={"model_year": pl.Int64},
)

# Right join keeps every year of the grid; years without a registration
# record come back as null and are replaced by 0.
registrations_or_zero = pl.col("num_registrations").fill_null(0)
reg_4 = (
    reg_2
    .join(other=reg_3, on="model_year", how="right")
    .with_columns(registrations_or_zero)
    .sort(by="model_year")
)

reg_4.collect()

num_registrations,model_year
i64,i64
0,1672
0,1673
0,1674
0,1675
0,1676
…,…
30312,2022
31266,2023
27389,2024
6889,2025


In [6]:
# Total number of registrations across every model year in the grid.
reg_4.select(pl.sum("num_registrations")).collect()

num_registrations
i64
584221


## Non-business registrations

In [7]:
def richards_curve(t, A, K, B, nu, Q, C, M):
    """Evaluate the generalised logistic (Richards) function at ``t``.

    Computes ``A + (K - A) / (C + Q * exp(-B * (t - M))) ** (1 / nu)``.

    See https://en.wikipedia.org/wiki/Generalised_logistic_function
    """
    decay = jnp.exp(-B*(t - M))
    denominator = (C + Q*decay)**(1.0/nu)
    return A + (K - A)/denominator

In [8]:
# # For model years earlier than 2005-ish, the registration
# # data probably better captures what is going on in the population
# # because vehicles registered by commercial entities are 
# # more likely to be newer.  For the pre-2005 model years,
# # most were probably registered by actual individuals
# # instead of businesses.

# params = {
#     "t":reg_4.select("model_year").collect().to_series().to_numpy(),
#     "A":1, # 1
#     "K":0.75, # 0.75 to 0.90
#     "B":0.4, # 0.3 to 1
#     "nu":1, # 0.05 to 1
#     "Q":1, # 1
#     "C":1, # 1
#     "M":2008 # 2004-2008
# }

# y = richards_curve(
#     **params
# )
# fig = px.scatter(
#     x=params["t"],
#     y=y
# )

# fig.show()

In [9]:
def non_business_regs(PRNG_key):
    """Sample a curve of non-business registration proportions by model year.

    The curve is a Richards curve over the model years in ``reg_4`` with
    ``A = 1``, ``Q = 1`` and ``C = 1``; it tends to 1 for old model years and
    to ``K`` for recent ones.  ``K``, ``B``, ``nu`` and ``M`` are sample sites
    with uniform priors.

    Args:
        PRNG_key: JAX PRNG key.  It is split into one independent key per
            sample site.

    Returns:
        The value recorded at the ``"regs"`` deterministic site: one
        proportion per model year in ``reg_4``.
    """
    # One key per random parameter.  Passing the same key to every site would
    # make all four direct draws come from the same underlying random number,
    # i.e. K, B, nu and M would always sit at the same quantile of their priors.
    key_K, key_B, key_nu, key_M = random.split(PRNG_key, 4)

    params = {
        "t": reg_4.select("model_year").collect().to_series().to_numpy(),
        "A": 1,
        "K": numpyro.sample("K", dist.Uniform(0.75, 0.85), rng_key=key_K),
        "B": numpyro.sample("B", dist.Uniform(0.3, 1), rng_key=key_B),
        "nu": numpyro.sample("nu", dist.Uniform(0.05, 1), rng_key=key_nu),
        "Q": 1,
        "C": 1,
        "M": numpyro.sample("M", dist.Uniform(2004, 2008), rng_key=key_M),
    }

    regs = numpyro.deterministic(
        name="regs",
        value=richards_curve(
            **params
        )
    )

    return regs

In [10]:
# Root JAX PRNG key; later cells split it to obtain fresh subkeys.
key = random.key(7)
# Run NUTS.
# non_business_regs has no observed sites, so the draws come from its priors.
kernel = NUTS(non_business_regs)
num_samples = 100
mcmc = MCMC(kernel, num_warmup=1000, num_samples=num_samples)
# NOTE(review): the same key is passed as the sampler's rng_key and as the
# model's PRNG_key argument.
mcmc.run(
    rng_key=key,
    PRNG_key=key
)

sample: 100%|██████████| 1100/1100 [00:02<00:00, 456.24it/s, 7 steps of size 7.07e-01. acc. prob=0.89]


In [11]:
# Draws from the run above, keyed by site name (the curves are under "regs").
non_business_regs_samples = mcmc.get_samples()

In [12]:
# Model years of reg_4 as a NumPy array; used as the x-axis of the plots below.
model_years = reg_4.select("model_year").collect().to_series().to_numpy()

In [13]:
# Overlay every sampled curve as a faint black line so that dense regions
# of the prior show up darker.
fig = px.line()

curve_line = dict(color='black')
for i in range(num_samples):
    y = non_business_regs_samples["regs"][i, :]
    sampled_curve = go.Scatter(
        x=model_years,
        y=y,
        opacity=0.05,
        mode="lines",
        line=curve_line,
    )
    fig.add_trace(sampled_curve)

fig.update_layout(
    showlegend=False,
    title=f"{num_samples} Samples from Prior Distribution for Proportion of Non-Business Registrations",
    xaxis=dict(title="Model Year"),
    yaxis=dict(title="Proportion of Utah-County Registrations"),
)

fig.show()

## Dirichlet
Assume that the proportion of vehicles in the population for each model year follows a Dirichlet distribution whose parameters are a function of the registration counts.



In [14]:
def get_alpha(PRNG_key, reg_weight=1):
    """Build the Dirichlet concentration vector from the registration counts.

    Each model year's count in ``reg_4`` is turned into a strictly positive
    pseudocount, scaled by ``reg_weight``, and multiplied by a sampled
    non-business-registration curve from ``non_business_regs``.

    Args:
        PRNG_key: JAX PRNG key; a subkey split from it is passed to
            ``non_business_regs``.
        reg_weight: Weight given to the registrations.  Smaller values mean
            that we trust the usefulness of the registration data less.

    Returns:
        The value recorded at the ``"alpha"`` deterministic site: one
        concentration parameter per model year in ``reg_4``.
    """
    model_year = pl.col("model_year")
    has_no_registrations = pl.col("num_registrations") == 0
    # exp(-sqrt(years before 1913)): a pseudocount that shrinks the further a
    # model year lies before the first year of the registration data.  It is
    # pre-divided by reg_weight so that the scaling in the select() below
    # leaves these values independent of reg_weight.
    early_pseudocount = 1.0/reg_weight * (-((1913 - model_year) ** 0.5)).exp()

    # pre_alpha is not the real alpha.
    pre_alpha = (reg_4
        .cast({"num_registrations": pl.Float64})
        .with_columns(
            pl.when(has_no_registrations & (model_year < 1913))
            .then(early_pseudocount)
            # `>=` rather than `>`: a zero count at exactly 1913 must also be
            # replaced, otherwise it would yield a zero concentration, which
            # is not a valid Dirichlet parameter.
            .when(has_no_registrations & (model_year >= 1913))
            .then(pl.lit(1.0, dtype=pl.Float64))
            .otherwise(pl.col("num_registrations"))
            .alias("num_registrations")
        )
        .select(reg_weight * pl.col("num_registrations"))
        .collect()
        .to_series()
        .to_numpy()
    )
    # Only the second half of the split is consumed here.
    _, subkey = random.split(PRNG_key)
    alpha = numpyro.deterministic(
        name="alpha",
        value=non_business_regs(PRNG_key=subkey) * pre_alpha
    )
    return alpha

In [15]:
# Advance the root key; `subkey` is handed to the model as its PRNG_key.
key, subkey = random.split(key)
# Run NUTS.
# get_alpha has no observed sites, so this yields draws of "alpha" implied by
# the priors on the non-business-registration curve.
kernel = NUTS(model=get_alpha)
num_samples = 100
mcmc = MCMC(kernel, num_warmup=1000, num_samples=num_samples)
mcmc.run(
    rng_key=key,
    PRNG_key=subkey,
    reg_weight=1
)

sample: 100%|██████████| 1100/1100 [00:02<00:00, 393.22it/s, 7 steps of size 6.24e-01. acc. prob=0.93]


In [16]:
# Draws from the get_alpha run above, keyed by site name.
alpha_samples = mcmc.get_samples()

In [17]:
# Sanity check: the smallest concentration value over all draws and model
# years; a Dirichlet distribution needs every concentration to be positive.
alpha_samples["alpha"].min()

Array(1.81107566e-07, dtype=float64)

In [18]:
# Distribution, across draws, of the total concentration (sum over model years).
px.histogram(alpha_samples["alpha"].sum(axis=1))


response variable: vector of means (proportion for each model year)
posterior parameter(s): concentration vector for Dirichlet distribution


## View Samples from Prior Predictive Distribution
The prior predictive distribution shows what we think the proportions of each model year are in the population.
The 1 July, 2024 [estimate](https://www.census.gov/quickfacts/fact/table/utahcountyutah/PST045223) of the number of residents of Utah County, Utah is 747,234.  The five-year 2019-2023 ACS estimate of the number of households in Utah County is 195,602.  According to [Data USA](https://datausa.io/profile/geo/utah), the number of vehicles per household in Utah is about 2.  Note that vehicles can be registered by businesses and not just households.

In [19]:
# Single direct draw of the concentration vector (outside of MCMC).
# NOTE(review): `subkey_2` and this module-level `alpha` do not appear to be
# used by the later cells visible in this notebook — confirm before removing.
key, subkey_1, subkey_2 = random.split(key, 3)
alpha = get_alpha(subkey_1)

In [20]:
def model(PRNG_key):
    """Prior-only model for the model-year proportions.

    Draws a concentration vector with ``get_alpha`` and then samples the
    ``"prior"`` site from the corresponding Dirichlet distribution.

    Args:
        PRNG_key: JAX PRNG key, split into separate keys for the
            concentration vector and for the Dirichlet draw.

    Returns:
        The value sampled at the ``"prior"`` site.
    """
    _, alpha_key, dirichlet_key = random.split(PRNG_key, 3)
    concentration = get_alpha(alpha_key)
    return numpyro.sample(
        name="prior",
        fn=dist.Dirichlet(concentration),
        rng_key=dirichlet_key,
    )

In [21]:
# Advance the root key; `subkey` is handed to the model as its PRNG_key.
key, subkey = random.split(key)
# Run NUTS.
# The chain is initialised at the median of each site's prior (init_to_median).
kernel = NUTS(model=model, init_strategy=init_to_median())
num_samples = 100
mcmc = MCMC(kernel, num_warmup=1000, num_samples=num_samples)

mcmc.run(
    rng_key=key,
    PRNG_key=subkey
)

sample: 100%|██████████| 1100/1100 [00:20<00:00, 53.46it/s, 109 steps of size 4.38e-03. acc. prob=0.95] 


In [22]:
# Get samples
prior_samples = mcmc.get_samples()["prior"]

In [23]:
# One stacked bar chart per prior draw, restricted to model years after 2000.
x = model_years
num_samples = 3
fig = make_subplots(
    rows=num_samples,
    shared_xaxes=True,
    x_title="Model Year",
    y_title="Relative Frequency",
)

# Plot later years
is_later_year = x > 2000
for t in range(num_samples):
    row = t + 1
    y = prior_samples[t, :]
    # https://stackoverflow.com/questions/65910725/plotly-bar-chart-opacity-changes-with-longer-time-range
    bars = go.Bar(
        x=x[is_later_year],
        y=y[is_later_year],
        orientation="v",
    )
    fig.add_trace(bars, row=row, col=1)

fig.update_traces(marker_line_width=0)

# https://stackoverflow.com/questions/56712486/how-to-hide-legend-with-plotly-express-and-plotly
fig.update_layout(
    barmode="overlay",
    bargap=0,
    showlegend=False,
    title="Samples from Prior Predictive Distribution",
)

fig.show()

In [24]:
# Plot earlier years
# Same draws as the previous figure, restricted to model years up to 2000.
fig = make_subplots(
    rows=num_samples,
    shared_xaxes=True,
    x_title="Model Year",
    y_title="Relative Frequency",
)

# Plot earlier years
is_earlier_year = x <= 2000
for t in range(num_samples):
    row = t + 1
    y = prior_samples[t, :]
    # https://stackoverflow.com/questions/65910725/plotly-bar-chart-opacity-changes-with-longer-time-range
    bars = go.Bar(
        x=x[is_earlier_year],
        y=y[is_earlier_year],
        orientation="v",
    )
    fig.add_trace(bars, row=row, col=1)

fig.update_traces(marker_line_width=0)
# https://stackoverflow.com/questions/56712486/how-to-hide-legend-with-plotly-express-and-plotly
fig.update_layout(
    barmode="overlay",
    bargap=0,
    showlegend=False,
    title="Samples from Prior Predictive Distribution",
)

fig.show()

## Analysis with Actual Data

In [52]:
# Lazily scan the collected vehicle observations (the CSV has a header row).
obs_0 = pl.scan_csv(source=collected_vehicle_data_path, has_header=True)

In [53]:
# Observed number of vehicles per model year, aligned with the model-year grid
# of reg_4 so that the counts line up element-wise with the concentration
# vector.
obs = (obs_0
    .group_by("year")
    # Number of non-null `year` entries in each group.
    .agg(pl.count("year").alias("observed_count"))
    .select(pl.col("year").alias("model_year"), "observed_count")
    # Right join keeps every model year of reg_4, including unobserved ones.
    .join(
        other=reg_4,
        on="model_year",
        how="right"
    )
    # Model years that were never observed get a count of 0.
    .with_columns(
        pl.col("observed_count").fill_null(0)
    )
    .select("model_year", "observed_count")
    .sort(by="model_year")
)

In [82]:
def model(obs_counts, PRNG_key):
    """Model for the model-year proportions given the observed counts.

    The observed counts are added to the concentration vector from
    ``get_alpha`` and the ``"proportions"`` site is sampled from the Dirichlet
    distribution with the summed concentration.

    Note: this re-binds the name ``model`` that an earlier cell of the
    notebook uses for the prior-only model.

    Args:
        obs_counts: Observed count per model year, in the same order as the
            output of ``get_alpha``.
        PRNG_key: JAX PRNG key, split into separate keys for the
            concentration vector and for the Dirichlet draw.

    Returns:
        The value sampled at the ``"proportions"`` site.
    """
    _, alpha_key, dirichlet_key = random.split(PRNG_key, 3)
    prior_concentration = get_alpha(alpha_key)
    updated_concentration = prior_concentration + obs_counts
    return numpyro.sample(
        name="proportions",
        fn=dist.Dirichlet(updated_concentration),
        rng_key=dirichlet_key,
    )

In [84]:
# Advance the root key; `subkey` is handed to the model as its PRNG_key.
key, subkey = random.split(key)
# Run NUTS.
kernel = NUTS(model=model, init_strategy=init_to_median())
num_samples = 100
mcmc = MCMC(kernel, num_warmup=1000, num_samples=num_samples)

# obs and reg_4 are both sorted by model_year, so the observed counts line up
# with the concentration vector built inside the model.
mcmc.run(
    rng_key=key,
    PRNG_key=subkey,
    obs_counts=obs.select("observed_count").collect().to_series().to_numpy()
)

sample: 100%|██████████| 1100/1100 [00:22<00:00, 49.63it/s, 13 steps of size 1.71e-02. acc. prob=0.77] 


In [85]:
# Get samples
posterior_predictive_samples = mcmc.get_samples()["proportions"]

In [95]:
# Point estimate per model year: the mean over draws (axis 0).
posterior_predictive_samples_mean = posterior_predictive_samples.mean(axis=0)

# Sensitivity Analysis for Non-sampling Error

**Strategy**: For each model year, think about how many vehicles of the given model year could be found among population units not included in the sample.

We can view the total number of non-responses as the number of households (approximately).  We need to determine reasonable values for the correlation between the response indicator R and binary variable y of whether or not a given car is of the indicated model year.

For each model year, #{y = 1} is probably the unweighted alpha times 0.5 on the low end and times 2 on the high end among population units with non-response.  

We are probably more likely to have non-response for older cars.

In [54]:
# Preview of the Richards curve that the sensitivity analysis below uses as
# the per-model-year upper multiplier: it tends to A = 10 for old model years
# and to K = 2 for recent ones, with the transition located around M = 1990.
params = {
    "t":reg_4.select("model_year").collect().to_series().to_numpy(),
    "A":10, 
    "K":2, 
    "B":0.5, 
    "nu":10, 
    "Q":100, 
    "C":1, 
    "M":1990 
}

y = richards_curve(
    **params
)
px.scatter(
    x=params["t"],
    y=y
)

### Sensitivity with Scipy

In [66]:
# K is the size of the sensitivity analysis.
K = 1000
# n is the sample size.
n = obs.select(pl.col("observed_count").sum()).collect().item()

# N is the target population size.
N = int(np.round(np.abs(stats.norm(loc=450000, scale=50000).rvs(size=1))).item())

# We are probably more likely to have non-response for older cars.
model_years = obs.select("model_year").collect().to_series().to_numpy()
num_model_years = model_years.shape[0]
params = {
    "t":model_years,
    "A":10, 
    "K":2, 
    "B":0.5, 
    "nu":10, 
    "Q":100, 
    "C":1, 
    "M":1990 
}
lower = 0.7
# upper is between 2 and 10.
upper = richards_curve(
    **params
)

multiplier_1 = stats.beta(a=0.5, b=0.5).rvs(size=num_model_years)
# multiplier_2 is between lower and upper.
multiplier_2 = lower + (upper - lower) * multiplier_1
alpha_for_R_0_weight = 1
key, subkey = random.split(key)
alpha_for_R_0 = multiplier_2 * get_alpha(PRNG_key=subkey, reg_weight=alpha_for_R_0_weight)
m = np.ceil(alpha_for_R_0).astype(np.int64)

multivariate_hypergeom_obj = stats.multivariate_hypergeom(
    m=m,
    n=N - n
)
num_1s_for_R_0 = multivariate_hypergeom_obj.rvs(size=1).ravel()

fig = px.line(x=model_years, y=num_1s_for_R_0)
fig.show()

In [75]:
def get_phi(n_11, n_00, n_10, n_01):
    """Phi coefficient of a 2x2 contingency table.

    https://en.wikipedia.org/wiki/Phi_coefficient

    Scalars and arrays are accepted; arrays must all have the same shape and
    are processed element-wise (one table per element).

    Args:
        n_11, n_00, n_10, n_01: Cell counts of the table.

    Returns:
        ``(n_11 * n_00 - n_10 * n_01)`` divided by the square root of the
        product of the four marginal totals.  Where a marginal total is zero
        the coefficient is undefined and 0 is returned instead.
    """
    # Work in float64 throughout so that the products in the numerator cannot
    # overflow or wrap around in an integer dtype.
    n_11, n_00, n_10, n_01 = (
        np.asarray(cell, dtype=np.float64) for cell in (n_11, n_00, n_10, n_01)
    )
    n_1_dot = n_11 + n_10
    n_0_dot = n_01 + n_00
    n_dot_1 = n_01 + n_11
    n_dot_0 = n_00 + n_10

    numerator = n_11 * n_00 - n_10 * n_01
    divisor = np.sqrt(n_1_dot) * np.sqrt(n_0_dot) * np.sqrt(n_dot_0) * np.sqrt(n_dot_1)
    # A zero marginal gives 0/0; that case is handled below, so silence the
    # floating-point warning instead of letting it reach the caller.
    with np.errstate(divide="ignore", invalid="ignore"):
        phi = numerator / divisor
    # Return the cleaned value rather than relying on in-place modification,
    # which has no effect on scalar results.
    return np.nan_to_num(phi)

def get_var_binary_data(num_1s, n):
    """Variance of ``n`` binary values of which ``num_1s`` are equal to 1.

    Computes ``num_1s * (n - num_1s) / (n * (n - 1))``.

    See: https://stats.stackexchange.com/questions/67019/variance-and-covariance-of-binary-data
    """
    num_0s = n - num_1s
    return num_1s * num_0s/(n * (n - 1))

def get_mse_of_sample_mean(mean_squared_correlation, n, N, pop_var):
    """Mean squared error of the sample mean.

    Computes ``mean_squared_correlation * (N - 1) / n * (1 - n / N) * pop_var``
    for a sample of size ``n`` from a population of size ``N``.

    See: Sampling: Design and Analysis by Lohr on p. 529
    """
    scaled_correlation = mean_squared_correlation * (N - 1)/n
    unsampled_fraction = 1 - n/N
    return scaled_correlation * unsampled_fraction * pop_var

def get_error_of_sample_mean(corr, n, N, pop_std):
    """Error of the sample mean.

    Computes ``corr * sqrt((N - 1) / n * (1 - n / N)) * pop_std``, whose
    square is the expression used in ``get_mse_of_sample_mean``.

    See: Sampling: Design and Analysis by Lohr on p. 529

    Args:
        corr: Correlation between the response indicator and the variable of
            interest.
        n: Sample size.
        N: Population size.
        pop_std: Population standard deviation of the variable of interest
            (already a standard deviation, not a variance).

    Returns:
        The error of the sample mean, with the same shape as ``corr`` and
        ``pop_std`` after broadcasting.
    """
    # pop_std is already a standard deviation, so it enters linearly; taking
    # its square root again would scale the error by sigma ** 0.5.
    return corr * np.sqrt((N - 1)/n * (1 - n/N)) * pop_std

In [69]:
# Set-up for the simulation loop in the next cell: pre-allocates the result
# arrays (one row per simulated draw, one column per model year) and fixes
# the quantities that do not change between draws.

# K is the size of the sensitivity analysis.
K = 1000
# We are probably more likely to have non-response for older cars.
model_years = obs.select("model_year").collect().to_series().to_numpy()
num_model_years = model_years.shape[0]
correlations = np.zeros(shape=(K, num_model_years))
population_sizes = np.zeros(shape=(K,))
errors = np.zeros(shape=(K, num_model_years))
# squared_errors = np.zeros(shape=(K, num_model_years))
params = {
    "t":model_years,
    "A":10, 
    "K":2, 
    "B":0.5, 
    "nu":10, 
    "Q":100, 
    "C":1, 
    "M":1990 
}
# Lower bound of the multiplier applied to the concentration vector.
lower = 0.7
# upper is between 2 and 10.
upper = richards_curve(
    **params
)
# n is the sample size.
n = obs.select(pl.col("observed_count").sum()).collect().item()
# Observed count per model year, i.e. the responders (R = 1) with y = 1.
num_1s_for_R_1 = obs.select("observed_count").collect().to_series().to_numpy()
num_1s_for_R_0_all = np.zeros(shape=(K, num_model_years))

In [76]:
# Seeded NumPy generator so that the scipy draws in this cell are reproducible
# (same seed value as the JAX key); the JAX draws are driven by `key`.
sensitivity_rng = np.random.default_rng(7)
# These frozen distributions are identical in every iteration: build them once.
population_size_dist = stats.norm(loc=450000, scale=50000)
multiplier_dist = stats.beta(a=0.5, b=0.5)
alpha_for_R_0_weight = 1
# Responders (R = 1) that are not of the given model year; this does not
# depend on the simulated draw.
num_0s_for_R_1 = n - num_1s_for_R_1

for k in range(K):
    # N is the target population size.
    N = int(np.round(np.abs(
        population_size_dist.rvs(size=1, random_state=sensitivity_rng)
    )).item())
    population_sizes[k] = N
    multiplier_1 = multiplier_dist.rvs(size=num_model_years, random_state=sensitivity_rng)
    # multiplier_2 is between lower and upper.
    multiplier_2 = lower + (upper - lower) * multiplier_1
    key, subkey = random.split(key)
    alpha_for_R_0 = multiplier_2 * get_alpha(PRNG_key=subkey, reg_weight=alpha_for_R_0_weight)
    # Round up to integer category sizes for the hypergeometric draw.
    m = np.ceil(alpha_for_R_0).astype(np.int64)

    # Draw the model years of the N - n non-sampled units without replacement
    # from the category sizes m.
    multivariate_hypergeom_obj = stats.multivariate_hypergeom(
        m=m,
        n=N - n
    )
    num_1s_for_R_0 = multivariate_hypergeom_obj.rvs(
        size=1, random_state=sensitivity_rng
    ).ravel()
    num_1s_for_R_0_all[k, :] = num_1s_for_R_0

    num_0s_for_R_0 = N - n - num_1s_for_R_0

    # Correlation, per model year, between the response indicator R and the
    # indicator y of a vehicle being of that model year.
    corr = get_phi(
        n_11=num_1s_for_R_1,
        n_00=num_0s_for_R_0,
        n_10=num_1s_for_R_0,
        n_01=num_0s_for_R_1
    )

    correlations[k, :] = corr

    # https://stats.stackexchange.com/questions/67019/variance-and-covariance-of-binary-data
    num_1s = num_1s_for_R_1 + num_1s_for_R_0
    pop_var_for_each_yr = get_var_binary_data(num_1s, N)
    pop_std_for_each_yr = np.sqrt(pop_var_for_each_yr)

    error_for_each_yr = get_error_of_sample_mean(
        corr=corr,
        n=n,
        N=N,
        pop_std=pop_std_for_each_yr
    )
    errors[k, :] = error_for_each_yr


invalid value encountered in divide



In [77]:
# Average the simulated correlations over the draws (rows), leaving one
# value per model year.
mean_correlations = correlations.mean(axis=0)

fig = px.line(x=model_years, y=mean_correlations)
fig.update_layout(
    title="Simulated Mean Correlations between Responders and Non-Responders",
    xaxis=dict(title="Model Year"),
    yaxis=dict(title="Mean Correlation"),
)

In [78]:
# Plot a random subset of 100 simulated non-responder count vectors as faint
# overlaid lines.
fig = px.line()

# Seeded so that the same subset of simulations is shown on every run.
rng = np.random.default_rng(7)
num_1s_for_R_0_all_sample = rng.choice(num_1s_for_R_0_all, size=100, axis=0)

for k in range(num_1s_for_R_0_all_sample.shape[0]):
    y = num_1s_for_R_0_all_sample[k, :]
    fig.add_trace(
        go.Scatter(
            x=model_years,
            y=y,
            opacity=0.05,
            mode="lines",
            line = dict(color='black')
        )
    )

fig.update_layout(
    showlegend=False,
    title="Simulation of Possible Vehicle Counts among Non-Responders",
    xaxis={"title": "Model Year"},
    yaxis={"title": "Number of Vehicles"}
)

fig.show()

In [79]:
# Mean absolute simulated error per model year, averaged over the draws (axis 0).
mean_absolute_errors = np.abs(errors).mean(axis=0)

In [80]:
# Mean absolute simulated error for each model year.
fig = px.line(x=model_years, y=mean_absolute_errors)
fig.update_layout(
    title="Simulated MAEs between Responders and Non-Responders",
    xaxis=dict(title="Model Year"),
    yaxis=dict(title="MAE"),
)

In [None]:
# Point estimate of each model year's proportion with a band of +/- the
# simulated mean absolute error, clipped to the valid range [0, 1].
# NOTE(review): this rebinds `lower` and `upper`, names that the sensitivity
# simulation cells also use for the multiplier bounds; re-run those set-up
# cells before re-running the simulation.
y = posterior_predictive_samples_mean
fig = px.line()
lower = y - mean_absolute_errors
# lower[lower < 0] = 0
lower = lower.at[lower < 0].set(0)
upper = y + mean_absolute_errors
# upper[upper > 1] = 1
upper = upper.at[upper > 1].set(1)

band_traces = [
    go.Scatter(
        name="Point Estimate",
        x=model_years,
        y=y,
        mode="lines",
        line=dict(color='blue'),
    ),
    go.Scatter(
        name="Estimate - MAE",
        x=model_years,
        y=lower,
        opacity=0.5,
        mode="lines",
        line=dict(color='red'),
    ),
    go.Scatter(
        name="Estimate + MAE",
        x=model_years,
        y=upper,
        opacity=0.5,
        mode="lines",
        line=dict(color='black'),
    ),
]
for band_trace in band_traces:
    fig.add_trace(band_trace)

fig.update_layout(
    title=f"Estimated Proportion of Vehicles with Simulated MAEs for each Model Year",
    xaxis=dict(title="Model Year"),
    yaxis=dict(title="Proportion of Vehicles"),
)

fig.show()