In [3]:
from pathlib import Path

import polars as pl

# Research Question: What is the distribution of vehicle model years in Utah County?

Note that we include vehicles that are driven in Utah County even if they are not registered there.  Our estimate therefore captures information beyond what government registration records alone provide.

## Strategy
Because vehicles are still being sold for the 2024, 2025, and 2026 model years, their registration counts are not yet complete.  Update the registration counts for these years using the count for model year 2023 and how long it has been since 2023.  Group model years earlier than 1965 into one bin.  Divide the registration counts by 1000 and use the resulting values as the concentration parameters of a Dirichlet distribution.  Following the technique described [here](https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical_or_multinomial), treat these concentration parameters as pseudocounts to be added to our observed counts.  The summed counts can then be used as the concentration parameters for the posterior Dirichlet distribution of vehicle model years in Utah County.

In [4]:
# Relative path to the raw registration counts.
source = Path("..") / "raw_data" / "registrations" / "registrations.csv"

# The count column is handled as text: strip the "," characters
# (presumably thousands separators -- confirm against the raw file) and
# cast to Int64.  The expression keeps its root column name, so the result
# replaces "num_registrations" in place.
# The collect()/lazy() round trip materialises the parsed table once, so
# later cells query the in-memory frame rather than re-scanning the CSV.
reg = (
    pl.scan_csv(source)
    .with_columns(
        pl.col("num_registrations")
        .str.replace_all(",", "")
        .cast(pl.Int64)
    )
    .collect()
    .lazy()
)

# Preview the last few rows of the cleaned table.
reg.collect().tail()

model_year,num_registrations
i64,i64
2022,30312
2023,31266
2024,27037
2025,5830
2026,8


In [5]:
# Model year whose registration count stands in for every later model year
# (the Strategy section treats 2024-2026 as still being sold).
BASELINE_MODEL_YEAR = 2023

# Scalar registration count for the baseline year.  .item() needs the
# filtered frame to hold exactly one value, so this fails loudly if the
# baseline year is missing or duplicated.
reg_2023 = (
    reg
    .filter(pl.col("model_year") == BASELINE_MODEL_YEAR)
    .select("num_registrations")
    .collect()
    .item()
)

# Rows newer than the baseline take the baseline count; all other rows keep
# their own count.
# NOTE(review): the Strategy text also mentions accounting for "how long it
# has been since 2023"; this cell only copies the 2023 count -- confirm that
# is the intended adjustment.
adjusted_count = (
    pl.when(pl.col("model_year") > BASELINE_MODEL_YEAR)
    .then(pl.lit(reg_2023))
    .otherwise(pl.col("num_registrations"))
)

reg_2 = reg.with_columns(num_registrations=adjusted_count)

In [9]:
# Bin label per row: every model year before 1965 shares the single label
# "< 1965"; later years are kept as their own string label.
model_year_bin = (
    pl.when(pl.col("model_year") < 1965)
    .then(pl.lit("< 1965"))
    .otherwise(pl.col("model_year").cast(pl.Utf8))
)

# Replace model_year with its bin label, then total the registrations per
# bin.  maintain_order=True keeps the bins in their order of first
# appearance in reg_2.
(
    reg_2
    .with_columns(model_year=model_year_bin)
    .group_by("model_year", maintain_order=True)
    .agg(pl.sum("num_registrations"))
    .collect()
)

model_year,num_registrations
str,i64
"""< 1965""",1665
"""1965""",286
"""1966""",321
"""1967""",285
"""1968""",244
…,…
"""2022""",30312
"""2023""",31266
"""2024""",31266
"""2025""",31266
