In [188]:
from pathlib import Path

import numpy as np
import jax.numpy as jnp
from jax import grad
import polars as pl
import plotly.express as px

import sklearn.pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer
from sklearn.linear_model import (
    # Ridge, 
    # ElasticNet, 
    LinearRegression
)

# Exploratory Data Analysis and Feature Engineering

We have data on model_year, calendar_time, and advertised_inventory.

In [173]:
# Locate the raw CSV relative to the notebook's working directory.
mixed_years_path = (
    Path("..")
    / "raw_data"
    / "model_year_vs_calendar_time"
    / "model_year_vs_calendar_time.csv"
)

# Scan the file lazily; rows are only read when .collect() is called.
mixed_years = pl.scan_csv(source=mixed_years_path)

# Preview the first few rows.
mixed_years.collect().head()

model_year,calendar_time,advertised_inventory,model_year_is_previous_calendar_year,model_year_is_calendar_year,model_year_is_next_calendar_year
i64,f64,f64,i64,i64,i64
2022,2020.5,0.0,0,0,0
2022,2020.9,0.0,0,0,0
2022,2022.420572,935.612174,0,1,0
2022,2022.441428,928.768412,0,1,0
2022,2022.462284,920.898087,0,1,0


Create a new variable called yr_diff that measures how far calendar_time is ahead of the midpoint of the model year, i.e. calendar_time - (model_year + 0.5).

In [174]:
# Extra zero-inventory observations for model year 2023, added at both ends
# of the yr_diff range for easier modeling later.
zero_inventory_padding = pl.DataFrame(
    {
        "model_year": [2023]*10,
        "yr_diff": [1.95, 2.0, 2.05, -1.65, -1.70, -1.75, -1.8, -1.85, -1.9, -1.95],
        "advertised_inventory": [0.0]*10,
    }
)

# Keep model_year and advertised_inventory, derive yr_diff with the midpoint
# rule (calendar_time relative to model_year + 0.5), then append the padding
# rows and go back to a lazy frame.
mixed_years = (
    mixed_years
    .select(
        pl.col("model_year"),
        (pl.col("calendar_time") - (pl.col("model_year") + 0.5)).alias("yr_diff"),
        pl.col("advertised_inventory"),
    )
    .collect()
    .vstack(zero_inventory_padding)
    .lazy()
)

Except for some fluctuations in year-to-year amplitudes, the pattern of inventory rise and fall for different model years appears remarkably consistent. Inventories for each model year peak around July of the same calendar year.

In [175]:
# Materialise the three plotted columns in a single pass; model_year is cast
# to a string so that plotly treats it as a discrete colour category.
scatter_data = (
    mixed_years
    .select(
        "yr_diff",
        "advertised_inventory",
        pl.col("model_year").cast(pl.Utf8),
    )
    .collect()
)

# Inventory against yr_diff, one colour per model year.
px.scatter(
    x=scatter_data["yr_diff"],
    y=scatter_data["advertised_inventory"],
    color=scatter_data["model_year"],
    title="The Rise and Fall of Vehicle Inventories",
    labels={
        "x": "calendar_time - (model_year + 6/12) (years)",
        "y": "Advertised Vehicle Inventory (thousands)",
        "color": "Model Year"
    }
)





# Spline Regression

In [None]:
# A separate spline regression is fitted for every model year we have data
# for (2022 through 2025), so that monthly inventory counts can be predicted
# even though we have no direct access to them.  (The only information
# available is the graph from www.spglobal.com/mobility.)

model_yr_range = range(2022, 2026)

# One slot per model year; each is filled with the fitted pipeline later.
inventory_model = {model_yr: None for model_yr in model_yr_range}

# Evenly spaced yr_diff grid on which every model is evaluated.  Sharing one
# grid across model years lets us compare apples to apples.
step = 1.0/12.0
a, b = -3.0, 3.0
yr_diffs = np.linspace(start=a, stop=b, num=int((b - a)/step + 1))

# Accumulator for the predictions of every model.
mixed_years_2 = pl.DataFrame()

# Empty plotly figures that the fitting loop adds traces to.
fig, fig_2 = px.scatter(), px.scatter()

# Spline knot positions (on the yr_diff axis) for each model year.  Every
# entry is sorted ascending and reshaped into a single-column 2-D array,
# i.e. shape (n_knots, 1).
model_year_knots = {
    knot_year: np.sort(np.array(knot_values)).reshape(-1, 1)
    for knot_year, knot_values in {
        2022: [
            2.19, 1.48, 1.2, 1, 0.9, 0.85, 0.8, 0.7, 0.6, 0.4, 0.3,
            0.2, 0.15, 0.087, 0.025, 0, -0.015,
            -0.079428,
        ],
        2023: [
            -2., -1.08, -1.02, -1., -0.95,
            -0.93, -0.91, -0.88, -0.85, -0.82,
            -0.75, -0.73, -0.5,
            -0.3, -0.244, -0.2, -0.18, -0.15, -0.12, -0.09, -0.05,
            -0.03, 0., 0.03, 0.1, 0.5, 0.95, 1., 1.05,
            1.15, 2.1,
        ],
        2024: [
            2.1, 1.15, 1.05, 1, 0.95, 0.5, 0.1, 0.03, 0,
            -0.03, -0.05, -0.09, -0.12, -0.15, -0.18, -0.2, -0.244, -0.3, -0.5,
            -0.73, -0.75, -0.95,
            -1, -1.02, -1.2, -1.22, -1.28, -1.32, -1.4, -1.45, -2,
        ],
        2025: [
            -1.42, -1.3, -1.27, -1.12, -1, -0.9, -0.85, -0.8, -0.716094,
        ],
    }.items()
}

# Fit one spline-regression pipeline per model year, predict on the shared
# yr_diffs grid, and plot (1) predictions against the observed data and
# (2) the step-to-step change of the predictions.
#
# Fixes relative to the previous version of this cell:
#   * The per-step change is now written to "predicted_inventory_change",
#     the column that fig_2 plots.  Previously it went to an extra
#     "predicted_inventory_derivative" column, which left the plotted column
#     all-null and made the second vstack fail (5 columns vs. 4).
#   * Per-year predictions are gathered in a list and concatenated once.
#   * The x-axis title now matches the definition of yr_diff
#     (calendar_time - (model_year + 0.5)).
#   * Loop-invariant work (collecting the data, figure layout) is hoisted.

# Sometimes we only want to work with part of the yr_diff range.  Model years
# without an entry here use every row.  The same restriction is applied to
# the training rows and to the prediction grid.
yr_diff_filters = {
    2022: pl.col("yr_diff") > -0.084,
    2024: pl.col("yr_diff") < 0.29,
    2025: pl.col("yr_diff") < -0.7,
}


def _apply_yr_diff_filter(frame, model_yr):
    """Return `frame` restricted to the yr_diff window configured for `model_yr`.

    `frame` is returned unchanged when no window is configured for that year.
    """
    predicate = yr_diff_filters.get(model_yr)
    return frame if predicate is None else frame.filter(predicate)


# Materialise the observed data once instead of once per use.
mixed_years_observed = mixed_years.collect()

# One prediction frame per model year; concatenated after the loop.
per_year_predictions = []

for yr in model_yr_range:
    # Make pipeline for the current model year.
    current_yr_inventory_model = make_pipeline(
        SplineTransformer(
            degree=3,
            knots=model_year_knots[yr],
            extrapolation="constant",
            include_bias=True
        ),
        LinearRegression(
            fit_intercept=True
        )
    )

    # All observations for this model year (used for the "Actual" trace) and
    # the subset of them that the model is trained on.
    observed_current_yr = mixed_years_observed.filter(pl.col("model_year") == yr)
    mixed_years_filtered = _apply_yr_diff_filter(observed_current_yr, yr)

    # Fit model.
    current_yr_inventory_model.fit(
        X=mixed_years_filtered.select("yr_diff"),
        y=mixed_years_filtered.select("advertised_inventory")
    )

    # Save the fitted model.
    inventory_model[yr] = current_yr_inventory_model

    # Evaluation grid for this model year, restricted to the same window.
    mixed_years_current_preds = _apply_yr_diff_filter(
        pl.DataFrame({"yr_diff": yr_diffs}, schema={"yr_diff": pl.Float64}),
        yr
    )

    # y was passed as a single-column frame, so flatten the predictions to 1-D.
    preds = (
        current_yr_inventory_model
        .predict(X=mixed_years_current_preds.select("yr_diff"))
        .flatten()
    )

    # Replace negative values generated by the spline regression.
    # (The regression itself is unconstrained, so the predictions are
    # simply floored at zero here.)
    preds = np.where(preds < 0, 0, preds)

    mixed_years_current_preds = (
        mixed_years_current_preds
        .with_columns(
            model_year=pl.lit(yr, dtype=pl.Int32),
            predicted_inventory=pl.Series(preds)
        )
        .with_columns(
            # Difference between consecutive grid points; the first row of
            # each model year has no predecessor and is therefore null.
            predicted_inventory_change=pl.col("predicted_inventory").diff(n=1)
        )
        .select(
            "yr_diff",
            "model_year",
            "predicted_inventory",
            "predicted_inventory_change"
        )
    )
    per_year_predictions.append(mixed_years_current_preds)

    # Plot the predictions alongside the original data.
    fig.add_scatter(
        x=mixed_years_current_preds["yr_diff"],
        y=mixed_years_current_preds["predicted_inventory"],
        mode="markers",
        name=f"{yr} Predicted"
    )

    fig.add_scatter(
        x=observed_current_yr["yr_diff"],
        y=observed_current_yr["advertised_inventory"],
        mode="markers",
        name=f"{yr} Actual"
    )

    # Plot the step-to-step change of the predictions.
    fig_2.add_scatter(
        x=mixed_years_current_preds["yr_diff"],
        y=mixed_years_current_preds["predicted_inventory_change"],
        mode="markers",
        name=f"{yr}"
    )

# Predictions for every model year in one frame.
mixed_years_2 = (
    pl.concat(per_year_predictions) if per_year_predictions else pl.DataFrame()
)

# https://plotly.com/python/figure-labels/
fig.update_layout(
    title=dict(text="Curve Fitting for Different Model Years"),
    xaxis=dict(
        title=dict(
            text="calendar_time - (model_year + 0.5) (years)"
        )
    ),
    yaxis=dict(
        title=dict(
            text="Advertised Vehicle Inventory (thousands)"
        )
    )
)

fig_2.update_layout(
    title=dict(text="Step-to-Step Change in Predicted Inventory"),
    xaxis=dict(
        title=dict(
            text="calendar_time - (model_year + 0.5) (years)"
        )
    ),
    yaxis=dict(
        title=dict(
            text="Change in Predicted Inventory per Grid Step (thousands)"
        )
    )
)

# Show the figures.
fig.show()
fig_2.show()