In [22]:
from pathlib import Path

import polars as pl
import numpy as np
from scipy.stats import (dirichlet, poisson, beta, multinomial)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS
from jax import random
import jax.numpy as jnp


# Research Question: What is the distribution of vehicle model years in the target population?

Note that we include vehicles that are driven in Utah County without being registered in Utah County.  Thus, we provide added information to what is publicly available from government registration records.

## Strategy
Use the registration counts as the concentration parameters of a Dirichlet prior.  Because the Dirichlet distribution is [conjugate to the categorical and multinomial distributions](https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical_or_multinomial), these concentration parameters act as pseudocounts that are added to our observed counts.  The summed counts are then the concentration parameters of the posterior Dirichlet distribution of vehicle model years in Utah County.

## ETL
New vehicles are still being sold for 2024, 2025, and 2026 model years, but not for model year 2023.  The registration data that we have is for vehicles registered in 2024 all of the way up to February 17, 2025.  Thus, there may be additional registrations for the newer model year vehicles between February 2025 and March 2025 which are not in our dataset.  We can modify the registration counts for these new model years using the count for model year 2023. 

Assume that if the registration data were to go all of the way up to February 17, 2026 that there would be the same number of registrations expiring for model year 2024 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/12. 

Assume that if the registration data were to go all of the way up to February 17, 2027 that there would be the same number of registrations expiring for model year 2025 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/24. 

Assume that if the registration data were to go all of the way up to February 17, 2028 that there would be the same number of registrations expiring for model year 2026 as there currently is for model year 2023.  Note that the proportion of this period traversed at the time of data collection is approximately 1/36. 

In [8]:
# Path to the raw registration counts, expressed relative to the notebook.
source = Path("..") / "raw_data" / "registrations" / "registrations.csv"

# `num_registrations` is read as text; drop the "," characters so that the
# values can be cast to integers.  The frame is collected once and then
# wrapped in a LazyFrame again for the downstream cells.
reg = (
    pl.scan_csv(source=source)
    .with_columns(
        pl.col("num_registrations")
        .str.replace_all(",", "")
        .cast(pl.Int64)
    )
    .collect()
    .lazy()
)

# Show the last rows of the cleaned table.
reg.collect().tail()

model_year,num_registrations
i64,i64
2022,30312
2023,31266
2024,27037
2025,5830
2026,8


In [9]:
# Registration count for model year 2023, used as the reference level for the
# newer model years; `.item()` requires the filter to match exactly one row.
reg_2023 = (
    reg
    .filter(pl.col("model_year") == 2023)
    .select("num_registrations")
    .collect()
    .item()
)

# For model years after 2023, move the observed count towards the 2023 count
# by 1/12, 1/24, 1/36, ... of the gap (each additional year => a smaller step).
num_regs = pl.col("num_registrations")
years_after_2023 = pl.col("model_year") - 2023
adjusted_regs = num_regs + (reg_2023 - num_regs) / (12 * years_after_2023)

reg_2 = reg.with_columns(
    pl.when(pl.col("model_year") > 2023)
    .then(adjusted_regs)
    .otherwise(num_regs)
    # The cast drops the fractional part of the adjusted counts.
    .cast(pl.Int64)
    .alias("num_registrations")
)

reg_2.tail().collect()

model_year,num_registrations
i64,i64
2022,30312
2023,31266
2024,27389
2025,6889
2026,876


## Extrapolate the registration counts for pre-1913 model year vehicles
The first steam-powered vehicle dates back to 1672 ([Wikipedia](https://en.wikipedia.org/wiki/History_of_the_automobile#Steam-powered_wheeled_vehicles)).  The Utah registration data start at model year 1913.  We assume that the number of pre-1913 model year vehicle registrations is 0.  We also assume that the model year of a vehicle must be between 1672 and 2026, inclusive.

In [10]:
# Model years before the first year present in the registration data.
pre_yr_range = range(1672, 1913)

# One zero-count row for every pre-1913 model year, with the same two Int64
# columns as the registration table.
reg_3 = pl.DataFrame(
    data={
        "model_year": list(pre_yr_range),
        "num_registrations": [0] * len(pre_yr_range),
    },
    schema={"model_year": pl.Int64, "num_registrations": pl.Int64},
)

# Stack the padding rows under the adjusted registrations and sort by year.
reg_4 = (
    reg_2
    .collect()
    .vstack(reg_3)
    .sort("model_year")
    .lazy()
)

reg_4.collect()

model_year,num_registrations
i64,i64
1672,0
1673,0
1674,0
1675,0
1676,0
…,…
2022,30312
2023,31266
2024,27389
2025,6889


## Non-business registrations

In [25]:
def richards_curve(t, A, K, B, nu, Q, C, M):
    """Evaluate the generalised logistic (Richards) function at ``t``.

    Computes ``A + (K - A) / (C + Q * exp(-B * (t - M))) ** (1 / nu)``, see
    https://en.wikipedia.org/wiki/Generalised_logistic_function

    Args:
        t: Point(s) at which to evaluate the curve (scalar or array).
        A: Additive offset; the curve approaches this value as the
            exponential term grows without bound.
        K: Level that sets the numerator ``K - A``.
        B: Rate multiplying ``t - M`` inside the exponential.
        nu: The denominator is raised to the power ``1 / nu``.
        Q: Coefficient of the exponential term.
        C: Constant added to the exponential term.
        M: Shift subtracted from ``t``.

    Returns:
        The curve value(s), as produced by ``jax.numpy`` arithmetic.
    """
    shifted = t - M
    denominator = (C + Q*jnp.exp(-B*shifted))**(1.0/nu)
    return A + (K - A)/denominator

In [27]:
# For model years earlier than 2005-ish, the registration
# data probably better captures what is going on in the population
# because vehicles registered by commercial entities are
# more likely to be newer.  For the pre-2005 model years,
# most were probably registered by actual individuals
# instead of businesses.

# Fixed Richards-curve parameters; the trailing comments give the ranges
# considered plausible for each value.
params = {
    "t":reg_4.select("model_year").collect().to_series().to_numpy(),
    "A":1, # 1
    "K":0.75, # 0.75 to 0.90
    "B":0.4, # 0.3 to 1
    "nu":1, # 0.05 to 1
    "Q":1, # 1
    "C":1, # 1
    "M":2009 # 2004 to 2014
}

y = richards_curve(
    **params
)

# Label the axes and title the figure so that it can be read on its own.
px.scatter(
    x=params["t"],
    y=y,
    labels={"x": "Model year", "y": "Proportion of non-business registrations"},
    title="Richards curve for the proportion of non-business registrations"
)

In [41]:
def non_business_regs():
    """NumPyro model: Richards curve with uniformly distributed parameters.

    Draws ``K``, ``B``, ``nu`` and ``M`` from uniform distributions, holds
    ``A``, ``Q`` and ``C`` at 1, and evaluates ``richards_curve`` at the model
    years found in ``reg_4``.  The model has no observed sites.

    Returns:
        The curve values, also recorded as the deterministic site ``"regs"``.
    """
    model_years = reg_4.select("model_year").collect().to_series().to_numpy()

    # Sample sites are created in the same order as before: K, B, nu, M.
    K = numpyro.sample("K", dist.Uniform(0.75, 0.9))  # 0.75 to 0.90
    B = numpyro.sample("B", dist.Uniform(0.3, 1))  # 0.3 to 1
    nu = numpyro.sample("nu", dist.Uniform(0.05, 1))  # 0.05 to 1
    M = numpyro.sample("M", dist.Uniform(2004, 2014))  # 2004 to 2014

    curve = richards_curve(t=model_years, A=1, K=K, B=B, nu=nu, Q=1, C=1, M=M)

    return numpyro.deterministic(name="regs", value=curve)

In [42]:
# Number of draws kept after warm-up; the curve-overlay cell reuses this name.
num_samples = 200

# NUTS over the parameters of `non_business_regs`.
kernel = NUTS(non_business_regs)
mcmc = MCMC(sampler=kernel, num_warmup=1000, num_samples=num_samples)

# Fixed PRNG key so that the run is repeatable.
key = random.key(42)
mcmc.run(key)

sample: 100%|██████████| 1200/1200 [00:02<00:00, 486.45it/s, 3 steps of size 7.32e-01. acc. prob=0.89] 


In [43]:
# Collect the MCMC draws; the result is looked up by site name below, and its
# "regs" entry is indexed as [draw, model_year position].
non_business_regs_samples = mcmc.get_samples()

In [61]:
# Overlay every sampled Richards curve as a faint black line.  Iterate over the
# draws themselves rather than over `range(num_samples)`, because `num_samples`
# is reassigned further down the notebook.
regs_draws = np.asarray(non_business_regs_samples["regs"])

fig = px.line()

for curve in regs_draws:
    fig.add_trace(
        go.Scatter(
            x=params["t"],
            y=curve,
            opacity=0.05,
            mode="lines",
            line=dict(color="black")
        )
    )

# One legend entry per draw is noise, so hide the legend and label the axes.
fig.update_layout(
    showlegend=False,
    title="Sampled Richards curves for the proportion of non-business registrations",
    xaxis_title="Model year",
    yaxis_title="Proportion of non-business registrations"
)

fig.show()

## Dirichlet
Assume that the proportion of vehicles in the population for each model year follows a Dirichlet distribution whose parameters are a function of the registration counts.



In [526]:
# Evaluate the Richards curve once at the values stored in `params`.
# NOTE(review): this assumes `params` is still the fixed-value dict built for
# the curve plot, not the K/B/nu/M draws from the MCMC run - confirm that a
# single fixed curve is what is intended here.
prop_non_business_registrations_vec = richards_curve(
    **params
)

In [527]:
# Build `alpha`, the concentration parameters handed to the Dirichlet prior.
#
# reg_weight is the weight given to the registrations.
# Smaller values mean that we trust the usefulness
# of the registration data less.
reg_weight = 1
alpha = (reg_4
    .cast({"num_registrations": pl.Float64})
    # Set 0 registration counts to small real numbers less
    # than 1.  Only pre-1913 rows are replaced: they get
    # exp(model_year - 1913) / reg_weight, and the 1/reg_weight factor
    # cancels against the reg_weight multiplication in the select below.
    # NOTE(review): a zero count for a model year >= 1913 is left at 0, and
    # scipy's Dirichlet requires strictly positive parameters - verify that
    # the data contain no such rows.
    .with_columns(
        pl.when((pl.col("num_registrations") == 0) & (pl.col("model_year") < 1913))
        .then(1.0/reg_weight * np.exp(pl.col("model_year") - 1913))
        .otherwise(pl.col("num_registrations"))
        .alias("num_registrations")
    )
    # Scale each row by the Richards-curve value and by reg_weight.
    # NOTE(review): presumably the curve vector is aligned row-for-row with
    # reg_4 because both come from reg_4's model_year column - confirm.
    .select(prop_non_business_registrations_vec * reg_weight * pl.col("num_registrations"))
    .collect()
    .to_series()
)

# dirichlet_pre_2005 = dirichlet(alpha_pre_2005)

# # reg_weight is the weight given to the registrations.
# # Smaller values mean that we trust the usefulness
# # of the registration data less. 
# reg_weight = 1e-3
# alpha_post_2004 = (reg_4
#     .cast({"num_registrations": pl.Float64})
#     .filter(pl.col("model_year") > 2004)
#     .select(reg_weight * pl.col("num_registrations"))
#     .collect()
#     .to_series()
# )

# dirichlet_post_2004 = dirichlet(alpha_post_2004)

## View Samples from Prior
The prior is for the population totals for each model year.
The 1 July, 2024 [estimate](https://www.census.gov/quickfacts/fact/table/utahcountyutah/PST045223) of the number of residents of Utah County, Utah is 747,234.  The five-year 2019-2023 ACS estimate of the number of households in Utah County is 195,602.  According to [Data USA](https://datausa.io/profile/geo/utah), the number of vehicles per household in Utah is about 2.  Note that vehicles can be registered by businesses and not just households.

In [265]:
# Total of the adjusted registration counts over every model year in reg_4.
total_num_regs = (
    reg_4
    .collect()
    .get_column("num_registrations")
    .sum()
)
total_num_regs

584221

In [266]:
# Beta(30, 5) distribution (mean 30 / 35, about 0.86); the next cell draws the
# proportion of household vehicles that are registered from it.
beta_rv = beta(a=30, b=5)

# Histogram of 300 draws to show the shape of the distribution.
go.Figure(data=go.Histogram(x=beta_rv.rvs(size=300)))

In [267]:
# One draw from the Beta distribution, kept as a length-1 array; `.item()`
# unwraps it into a plain Python float for display.
estimated_prop_household_vehicles_registered = beta_rv.rvs(size=1)
estimated_prop_household_vehicles_registered.item()

0.9159122768729805

In [268]:
# Frozen Poisson(2) distribution for the number of vehicles per household.
pop_mean_vehicles_per_household = poisson(mu=2)

# Number of households (presumably the ACS estimate of 195,602, rounded).
total_num_households = 196_000

# Registrations per household implied by the registration total.
total_num_regs / total_num_households

2.980719387755102

In [269]:
# Registrations in excess of two vehicles per household
# (2 * total_num_households), expressed as a fraction of total_num_regs.
(total_num_regs - 2*total_num_households) / total_num_regs

0.32902103827147605

In [270]:
# What is the population size?
# What are the possible numbers of vehicles per household?
# TODO: consider a Skellam (difference of two Poissons with means mu1 and mu2)
# or a binomial model as an alternative.
#
# Simulate one Poisson(2) vehicle count per household and add them up.  The
# household count comes from `total_num_households` rather than a repeated
# literal, so the two cannot drift apart.
poisson_rv = poisson(mu=2)
np.sum(poisson_rv.rvs(total_num_households))

np.int64(392472)

In [271]:
# Distribution of simulated vehicle counts, one draw per household.
px.histogram(
    x=poisson_rv.rvs(total_num_households),
    labels={"x": "Vehicles per household"},
    title="Simulated Number of Vehicles per Household"
)

In [604]:
# Get samples
num_samples = 3
prior = dirichlet(alpha=alpha)
prior_samples = prior.rvs(size=num_samples)

In [605]:
# Model years (presumably in the same order as the columns of `prior_samples`,
# since both derive from reg_4).
x = reg_4.select("model_year").collect().to_series().to_numpy()

# Plot later years: one subplot row per prior draw.
is_later_year = x > 2000

fig = make_subplots(
    rows=len(prior_samples),
    shared_xaxes=True,
    x_title="Model Year",
    y_title="Proportion of Vehicles"
)

for row, sample in enumerate(prior_samples, start=1):
    fig.add_trace(
        go.Bar(
            x=x[is_later_year],
            y=sample[is_later_year],
            orientation="v"
        ),
        row=row,
        col=1
    )

# https://stackoverflow.com/questions/65910725/plotly-bar-chart-opacity-changes-with-longer-time-range
fig.update_traces(marker_line_width = 0)

# Dirichlet draws are proportions that sum to one, so the title says
# "Proportions", and it names the year range to tell the two figures apart.
# https://stackoverflow.com/questions/56712486/how-to-hide-legend-with-plotly-express-and-plotly
fig.update_layout(
    barmode="overlay",
    bargap=0,
    showlegend=False,
    title="Samples from Prior Distribution for Model Year Proportions (Model Years After 2000)"
)

fig.show()

In [606]:
# Plot earlier years
fig = make_subplots(
    rows=num_samples, 
    shared_xaxes=True,
    x_title="Model Year"
)

for t in range(num_samples):
    row = t + 1
    y = prior_samples[t, :]
    # https://stackoverflow.com/questions/65910725/plotly-bar-chart-opacity-changes-with-longer-time-range
    # Plot earlier years
    fig.add_trace(
        go.Bar(
            x=x[x <= 2000],
            y=y[x <= 2000],
            orientation="v"  
        ),
        row=row,
        col=1
    )

fig.update_traces(marker_line_width = 0)
# https://stackoverflow.com/questions/56712486/how-to-hide-legend-with-plotly-express-and-plotly
fig.update_layout(
    barmode="overlay",
    bargap=0,
    showlegend=False,
    title="Samples from Prior Distribution for Model Year Counts"
)

fig.show()

# Sensitivity Analysis for Non-sampling Error

In [607]:
# Standard error of a sample proportion, sqrt(p * (1 - p) / n), evaluated at
# p = 0.28 and n = 300.
np.sqrt(0.28*0.72/300)

np.float64(0.02592296279363144)

In [608]:
# N and n are reused by the following cells, where the first n of N units are
# flagged as sampled.
N = 1_000_000
n = 300

# NOTE(review): this has the shape rho * sqrt((N - 1)/n * (1 - n/N)) * sigma,
# presumably with rho = 0.007 and sigma = 0.03 - confirm the intended formula.
0.007 * np.sqrt((1 - n/N) * ((N - 1)/n)) * 0.03

np.float64(0.01212253080194478)

In [609]:
# R flags the sampled units: 1 for the first n units, 0 for the other N - n.
# `np.concatenate` is used because the `np.concat` alias only exists in
# NumPy >= 2.0.
R = np.concatenate((np.ones((n,)), np.zeros((N - n,))))

# Binary outcome y with roughly 10% ones among the sampled units and 25% ones
# among the rest.  The zeros are computed as the remainder of each group so
# that len(y) == N even when a proportion does not divide the group evenly.
# Rounding up both shares independently could over-fill a group.
num_sampled_ones = int(np.ceil(0.1 * n))
num_unsampled_ones = int(np.ceil(0.25 * (N - n)))

y = np.concatenate((
    np.zeros((n - num_sampled_ones, )),
    np.ones((num_sampled_ones, )),

    np.zeros(((N - n) - num_unsampled_ones, )),
    np.ones((num_unsampled_ones, )),
))

In [610]:
# Correlation matrix of the sampling indicator R and the outcome y; the
# off-diagonal entry is the correlation between the two.
np.corrcoef(R, y)

array([[ 1.        , -0.00599946],
       [-0.00599946,  1.        ]])

In [611]:
# Scratch check of the concatenation used above: a single 1 followed by 100 zeros.
np.concat((np.ones((1,)), np.zeros((100,))))

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])