In [179]:
from pathlib import Path
from functools import partial

import numpy as np
from jax import (grad, jit, Array, vjp)
from jax.lax import cond
import jax.numpy as jnp
from scipy.interpolate import (make_lsq_spline, BSpline, Akima1DInterpolator)
from scipy.integrate import cumulative_trapezoid
from math import (gamma, factorial)
from statsmodels.nonparametric.kernel_regression import KernelReg
import statsmodels.api as sm
import polars as pl
import plotly.express as px

# Exploratory Data Analysis and Feature Engineering

We have data on model_year, calendar_time, and advertised_inventory.

In [2]:
# Import data
mixed_years_path = Path("..", "raw_data", "model_year_vs_calendar_time", "model_year_vs_calendar_time.csv")
mixed_years = pl.scan_csv(
    source=mixed_years_path,
)
# View head of data
mixed_years.collect().head()

model_year,calendar_time,advertised_inventory,model_year_is_previous_calendar_year,model_year_is_calendar_year,model_year_is_next_calendar_year
i64,f64,f64,i64,i64,i64
2022,2020.5,0.0,0,0,0
2022,2020.9,0.0,0,0,0
2022,2022.420572,935.612174,0,1,0
2022,2022.441428,928.768412,0,1,0
2022,2022.462284,920.898087,0,1,0


Create a new variable called yr_diff to represent how far ahead the calendar_time is from the model_year.

In [3]:
mixed_years = (
    mixed_years
        .select(
            "model_year",
            # Use the midpoint rule.
            (pl.col("calendar_time") - (pl.col("model_year") + 6/12)).alias("yr_diff"),
            "advertised_inventory"
        )
)

# Also, pad in some more 0s for easier modeling later.
mixed_years = (mixed_years
    .collect()
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2022]*10, 
                "yr_diff": np.linspace(2.05, 2.19, 10),
                "advertised_inventory": [0.0]*10,
            }
        )
    )
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2023]*10, 
                "yr_diff": [1.95, 2.0, 2.05, -1.65, -1.70, -1.75, -1.8, -1.85, -1.9, -1.95],
                "advertised_inventory": [0.0]*10,
            }
        )
    )
    .lazy()
)

Except for some fluctations in year-to-year amplitudes, the pattern of inventory rise and fall for different model years appears remarkably consistent.  Inventories for each model year peak around July of the same calendar year.

In [271]:
x = mixed_years.select("yr_diff").collect().to_series().to_numpy()
y = mixed_years.select("advertised_inventory").collect().to_series().to_numpy()
color = mixed_years.select(pl.col("model_year").cast(pl.Utf8)).collect().to_series().to_numpy()

px.scatter(
    x=x,
    y=y,
    color=color,
    title="The Rise and Fall of Vehicle Inventories",
    labels={
        "x": "calendar_time - (model_year + 6/12) (years)",
        "y": "Advertised Vehicle Inventory (thousands)",
        "color": "Model Year"
    }
)





# Predict the Average Inventory Curve

We will combine our data from the different model years to predict an average inventory curve that can be differentiated and used for prior model years. 

In [273]:
# For each model year that we have data (2022 through 2025),
# fit a separate spline regression.  Do this so that monthly
# inventory counts can be predicted despite not having
# direct access to this information.  (The only information
# that we have to go on is the graph from www.spglobal.com/mobility)
# Use the splines to interpolate values.  

model_yr_range = range(2022, 2026, 1)
inventory_model = dict.fromkeys(model_yr_range)

# Create some evenly space yr_diffs for evaluating each model.
# These same yr_diffs can be used for other model years
# to compare apples to apples.
step = 1.0/1200.0
a = -3.0
b = 3.0

yr_diffs_for_prediction = np.linspace(a, b, num=int((b - a)/step + 1))

# For each year, train and predict on only some of the yr_diffs.
yr_diff_filter = {
    2022: (-0.079428, 2.19),
    2023: (-2.0, 2.1),
    2024: (-2.0, 0.283906),
    2025: (-2.0, -0.716094)
}

# Create a data-frame to store the predictions from each model.
mixed_years_2 = pl.DataFrame()

# Get plotly figures set up before the loop.
fig = px.scatter()
fig_2 = px.scatter()

# Make a model for each model_year and fit the model.
for yr in model_yr_range:
    # For the current yr, get the yr_diffs to train and predict on.
    a, b = yr_diff_filter[yr]

    # Get the yr_diffs for predicting.
    yr_diffs_for_prediction_current = yr_diffs_for_prediction[(yr_diffs_for_prediction >= a) & (yr_diffs_for_prediction <= b)]
    
    # Get the yr_diffs for training.
    yr_diffs_for_training = (mixed_years
        # .filter((pl.col("yr_diff").is_between(a, b)) & (pl.col("model_year") == yr))
        .filter(pl.col("model_year") == yr)
        .sort(by=pl.col("yr_diff"))
        .select("yr_diff")
        .collect()
        .to_series()
        .to_numpy()
    )

    # Get the response values for training.
    y = (mixed_years
        .filter(pl.col("model_year") == yr)
        # .filter((pl.col("yr_diff").is_between(a, b)) & (pl.col("model_year") == yr))
        .sort(by=pl.col("yr_diff"))
        .select("advertised_inventory")
        .collect()
        .to_series()
        .to_numpy()
    )
    
    # Prepare to fit model.
    # Fit model.
    current_yr_inventory_model = Akima1DInterpolator(
        x=yr_diffs_for_training,
        y=y,
        method="makima",
        extrapolate=True
    )

    # Get the predictions.
    preds = current_yr_inventory_model(yr_diffs_for_prediction_current)

    # Replace negative values generated by the spline regression.
    # (I cannot figure out how to put constraints on the spline regression
    # other than to just fix the predictions like I do here.)
    preds = np.where(preds < 0, 0, preds)
    
    # derivatives = current_yr_inventory_model(
    #     x=yr_diffs_for_prediction_current, 
    #     nu=1
    # )

    # Save the fitted model.
    inventory_model[yr] = current_yr_inventory_model

    # Put the predictions from current_yr_inventory_model into a new data-frame.
    mixed_years_current_preds = pl.LazyFrame(
        data={
            "yr_diff": yr_diffs_for_prediction_current,
            "model_year": [yr]*len(yr_diffs_for_prediction_current),
            "predicted_inventory": preds,
            # "predicted_inventory_derivative": derivatives
        },
        schema={
            "yr_diff": pl.Float64,
            "model_year": pl.Int32,
            "predicted_inventory": pl.Float64,
            # "predicted_inventory_derivative": pl.Float64
        }
    )
    
    # Save mixed_years_current_preds for the current model_year.
    mixed_years_2 = (mixed_years_2
        .vstack(mixed_years_current_preds.collect())     
    )

    ########################################################
    # Figures
    ########################################################
    # Make a plot of the original training data alongside the predictions.
    fig.add_scatter(
        x=(mixed_years_2
           .filter(pl.col("model_year") == yr)
           .select("yr_diff")
           .to_series()
        ),
        y=(mixed_years_2
           .filter(pl.col("model_year") == yr)
           .select("predicted_inventory")
           .to_series()
        ),
        mode="markers",
        name=f"{yr} Predicted"
    )

    fig.add_scatter(
        x=(mixed_years
           .filter(pl.col("model_year") == yr)
           .select("yr_diff")
           .collect()
           .to_series()
        ),
        y=(mixed_years
           .filter(pl.col("model_year") == yr)
           .select("advertised_inventory")
           .collect()
           .to_series()
        ),
        mode="markers",
        name=f"{yr} Actual"
    )

    # https://plotly.com/python/figure-labels/
    fig.update_layout(
        title=dict(text="Curve Fitting for Different Model Years"),
        xaxis=dict(
            title=dict(
                text="calendar_time - (model_year + 6/12) (years)"
            )
        ),
        yaxis=dict(
            title=dict(
                text="Advertised Vehicle Inventory (thousands)"
            )
        )
    )

    # fig_2.add_scatter(
    #     x=(mixed_years_2
    #        .filter(pl.col("model_year") == yr)
    #        .select("yr_diff")
    #        .to_series()
    #     ),
    #     y=(mixed_years_2
    #        .filter(pl.col("model_year") == yr)
    #        .select("predicted_inventory_derivative")
    #        .to_series()
    #     ),
    #     mode="markers",
    #     name=f"{yr}"
    # )

    # fig_2.update_layout(
    #     title=dict(text="Predicted Inventory Derivatives for Different Model Years"),
    #     xaxis=dict(
    #         title=dict(
    #             text="calendar_time - (model_year + 6/12) (years)"
    #         )
    #     ),
    #     yaxis=dict(
    #         title=dict(
    #             text="Predicted d[Advertised Vehicle Inventory]/dt (thousands/yr)"
    #         )
    #     ),
    #     legend_title_text="Model Year"
    # )

# Show the figures.
fig.show()
# fig_2.show()

In [None]:
# Determine how to scale the inventory curve for model
# year 2023 to best match the inventory curve
# of model years 2022 and 2024.  
# Use the results to impute for model years 2022 and 2024.
# Determine the best scaling of the imputed curve for model year 2024 to
# match the curve for model year 2025.
# Use the reults to impute for model year 2025.
def raised_cosine_pdf(x: Array|float, mu: float=0.0, s: float=1.0):
    """https://en.m.wikipedia.org/wiki/Raised_cosine_distribution"""
    non_zero_pdf_val = 1.0/(2.0*s)*(1 + jnp.cos(np.pi * (x - mu)/s))
    pdf_vals = jnp.where(jnp.absolute(x - mu) > s, 0, non_zero_pdf_val)
    return pdf_vals

def stirlings_approx(n: Array|float):
    """Compute Stirling's approximation of n! to 2 orders.
    
    See: https://en.m.wikipedia.org/wiki/Stirling%27s_approximation
    """
    return jnp.sqrt(2*jnp.pi*n)*(n/jnp.e)**n * (1.0 + 1.0/(12.0 * n))

def stirlings_approx_gamma(z: Array|float, n: int=3):
    """Compute Stirling's approximation of Γ(z) to 2 orders.
    
    See: https://en.m.wikipedia.org/wiki/Stirling%27s_approximation
    Args:
        n: A non-negative integer.  Higher values should give more accuracy.  
            However, don't choose too high of a value or you will get
            overflow problems.
    """
    # See recurence formula: https://en.wikipedia.org/wiki/Gamma_function#Residues
    return stirlings_approx(z + n) / jnp.prod(z + jnp.arange(n + 1))
    
def symmetric_generalized_normal_pdf(x: Array|float, mu: float=0.0, alpha: float=1.0, beta: float=8.0):
    """https://en.m.wikipedia.org/wiki/Generalized_normal_distribution
    
    beta: positive
    """
    pdf_vals = beta/(2*alpha*stirlings_approx_gamma(1.0/beta))*jnp.exp(-(jnp.abs(x - mu)/alpha)**beta)
    return pdf_vals

def inventory_scaling_func(yr_diffs, params: Array):
    """
    params: Array with entries of `a`, `mu` and `s` (in that order)
    """
    a = params[0]
    mu = params[1]
    s = params[2]
    return 1.0 + a*raised_cosine_pdf(x=yr_diffs, mu=mu, s=s)

def inventory_scaling_func_2(yr_diffs, params: Array):
    """
    params: Array with entries of `a`, `mu`, `alpha`, and `beta` (in that order)
    """
    a = params[0]
    mu = params[1]
    alpha = params[2]
    beta = params[3]
    return 1.0 + a*symmetric_generalized_normal_pdf(x=yr_diffs, mu=mu, alpha=alpha, beta=beta)

def predict(yr_diffs, inventory, params):
    """
    params: Array with entries of `a`, `mu`, and `s` (in that order)
    """
    return inventory * inventory_scaling_func(yr_diffs=yr_diffs, params=params)

def predict_2(yr_diffs, inventory, params):
    """
    params: Array with entries of `a`, `mu`, `alpha`, and `beta` (in that order)
    """
    return inventory * inventory_scaling_func_2(yr_diffs=yr_diffs, params=params)

def mse(predicted: Array, actual: Array):
    n = len(actual)
    error = actual - predicted
    return (1.0/n * jnp.dot(error, error))

def loss(params: Array, yr_diffs, inventory, actual, constrained_index):
    """
    Args:
        params: `a`, `mu`, and `s`.
        constrained_index: either a 0 or a -1.  Used to indicate the
            entries in yr_diffs and inventory which should be predicted
            precisely.
    """
    predicted = predict(
        yr_diffs=yr_diffs,
        inventory=inventory,
        params=params
    )

    # print(f"actual[constrained_index]: {actual[constrained_index]}")
    # print(f"predicted[constrained_index]: {predicted[constrained_index]}")
    constraint_penalty = jnp.absolute(actual[constrained_index] - predicted[constrained_index])**3
    large_param_penalty = params[0]**2 + params[1]**2
    other_loss = mse(predicted=predicted, actual=actual)

    return constraint_penalty + large_param_penalty + other_loss

def loss_2(params: Array, yr_diffs, inventory, actual, constrained_index):
    """
    Args:
        params: Array with entries of `a`, `mu`, `alpha`, and `beta` (in that order)
        constrained_index: either a 0 or a -1.  Used to indicate the
            entries in yr_diffs and inventory which should be predicted
            precisely.
    """
    predicted = predict_2(
        yr_diffs=yr_diffs,
        inventory=inventory,
        params=params
    )

    # print(f"actual[constrained_index]: {actual[constrained_index]}")
    # print(f"predicted[constrained_index]: {predicted[constrained_index]}")
    constraint_penalty = jnp.absolute(actual[constrained_index] - predicted[constrained_index])**3
    large_param_penalty = params[0]**2 + params[1]**2
    other_loss = mse(predicted=predicted, actual=actual)

    return constraint_penalty + large_param_penalty + other_loss

@jit
def update(params, learning_rate, grad_loss, clip):
    # Prevent problems with exploding gradients using clipping.
    # https://arxiv.org/pdf/1211.5063
    grad_loss_mag = jnp.linalg.norm(grad_loss)
    grad_loss = cond(
        pred=(grad_loss_mag >= clip),
        true_fun=lambda x: clip/grad_loss_mag*x,
        false_fun=lambda x: x,
        operand=grad_loss
        
    )
    
    new_params = params - learning_rate * grad_loss
    return new_params

In [16]:
params = jnp.array([-1.6, -0.1, 1.5])

In [17]:
yr_diffs=(mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("yr_diff")
    .to_series()
    .to_numpy()
)

inventory = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("model_year") == 2023))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

actual = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

preds = predict(
    yr_diffs=yr_diffs,
    inventory=inventory,
    params=params
)

for i in range(400):
    grad_loss = grad(loss)( 
        params,
        yr_diffs,
        inventory,
        actual,
        0
    )

    params = update(
        params=params,
        learning_rate=1e-8,
        grad_loss=grad_loss,
        clip=1e6
    )

    if i % 20 == 0:
        print(f"grad_loss {grad_loss}")
        print(f"params: {params}")

fig = px.scatter()

fig.add_scatter(
    x=(mixed_years_2
        .filter(pl.col("model_year") == 2023)
        .select("yr_diff")
        .to_series()
    ),
    y=(mixed_years_2
        .filter(pl.col("model_year") == 2023)
        .select("predicted_inventory")
        .to_series()
    ),
    mode="markers",
    name=f"2023 Predicted"
)

fig.add_scatter(
    x=(mixed_years_2
        .filter(pl.col("model_year") == 2022)
        .select("yr_diff")
        .to_series()
    ),
    y=actual,
    mode="markers",
    name=f"2022 Predicted"
)

fig.add_scatter(
    x=yr_diffs,
    y=preds
)

fig.show()

grad_loss [-3.4066066e+09  2.5418824e+08 -3.6300628e+09]
params: [-1.5931659  -0.10050994  1.5072825 ]
grad_loss [-1.6622984e+09  1.4626270e+08 -1.4745592e+09]
params: [-1.4497596  -0.11226591  1.6460913 ]
grad_loss [-7.2695475e+08  7.0944896e+07 -5.3446915e+08]
params: [-1.2944003  -0.12678711  1.7710887 ]
grad_loss [-2.4677971e+08  2.5153914e+07 -1.4868894e+08]
params: [-1.128388  -0.1434514  1.881244 ]
grad_loss [-38673716.    3863941.2 -18713132. ]
params: [-0.95307285 -0.16123244  1.9756933 ]
grad_loss [-36038.246    -435.6955 -16365.417 ]
params: [-0.83167297 -0.17271382  2.0297387 ]
grad_loss [-2783.495  -3660.5847 -2893.8176]
params: [-0.82958704 -0.17213006  2.030936  ]
grad_loss [ -301.08917 -3889.851   -1885.7783 ]
params: [-0.82936263 -0.1713668   2.0313804 ]
grad_loss [  151.42618 -3918.3416  -1695.611  ]
params: [-0.8293614 -0.1705847  2.0317326]
grad_loss [  249.49602 -3911.8672  -1648.1752 ]
params: [-0.82940435 -0.16980146  2.0320654 ]
grad_loss [  271.64365 -3898.0208

# Symmetric Generalized Normal Distribution

In [84]:
x = np.linspace(-3, 3, 100)
params_2 = np.array([-1.6, -0.1, 1.5, 9.2])    
y=inventory_scaling_func_2(yr_diffs=x, params=params_2)

px.line(
    x=x,
    y=y
)

In [85]:
yr_diffs=(mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("yr_diff")
    .to_series()
    .to_numpy()
)

inventory = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("model_year") == 2023))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

actual = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

preds_2 = predict_2(
    yr_diffs=yr_diffs,
    inventory=inventory,
    params=params_2
)

# for i in range(200):
#     grad_loss_2 = grad(loss_2)( 
#         params_2,
#         yr_diffs,
#         inventory,
#         actual,
#         0
#     )

#     params_2 = update(
#         params=params_2,
#         learning_rate=1e-5,
#         grad_loss=grad_loss_2
#     )

#     if i % 50 == 0:
#         print(f"grad_loss_2 {grad_loss_2}")
#         print(f"params_2: {params_2}")

#         loss_2_val = loss_2(
#             params=params_2,
#             yr_diffs=yr_diffs,
#             inventory=inventory,
#             actual=actual,
#             constrained_index=0
#         )

#         print(f"loss_2: {loss_2_val}\n")

fig = px.scatter()

fig.add_scatter(
    x=(mixed_years_2
        .filter(pl.col("model_year") == 2023)
        .select("yr_diff")
        .to_series()
    ),
    y=(mixed_years_2
        .filter(pl.col("model_year") == 2023)
        .select("predicted_inventory")
        .to_series()
    ),
    mode="markers",
    name=f"2023 Predicted"
)

fig.add_scatter(
    x=(mixed_years_2
        .filter(pl.col("model_year") == 2022)
        .select("yr_diff")
        .to_series()
    ),
    y=actual,
    mode="markers",
    name=f"2022 Predicted"
)

fig.add_scatter(
    x=yr_diffs,
    y=preds_2
)

fig.show()

In [388]:
yr_diffs=(mixed_years_2
    .filter(pl.col("yr_diff").is_between(*yr_diff_filter[2023]))
    .select("yr_diff")
    .to_series()
    .to_numpy()
)

inventory = (mixed_years_2
    .filter(pl.col("yr_diff").is_between(*yr_diff_filter[2023]))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

preds_2b = predict_2(
    yr_diffs=yr_diffs,
    inventory=inventory,
    params=params_2
)

fig.add_scatter(
    x=yr_diffs,
    y=preds_2b,
    mode="markers"
)

fig.show()

In [390]:
len(preds_2b)

11817

# Metalog

Fit metalog PDF curves for each model year.  There can be a penalty for parameters being far from the parameters for 2023.  Do Vincent averaging to get one metalog PDF curve representative of all the model years.

In [264]:
inv_integral_2023 = cumulative_trapezoid(
    y=mixed_years_2
        .filter(pl.col("model_year") == 2023)
        .select("predicted_inventory")
        .to_series(),
    dx=step
)

inv_integral_2023_2 = inv_integral_2023 / inv_integral_2023[-1]

x=(mixed_years_2
    .filter(pl.col("model_year") == 2023)
    .select("yr_diff")
    .to_series()
    .to_numpy()[1:]
)

px.scatter(
    x=x,
    y=inv_integral_2023_2
)

In [265]:
p = inv_integral_2023_2[(inv_integral_2023_2 > 1e-5) & (inv_integral_2023_2 < (1 - 1e-5))]
num_rows = len(p)
x = x[(inv_integral_2023_2 > 1e-5) & (inv_integral_2023_2 < (1 - 1e-5))]

px.scatter(
    x=x,
    y=p
)

In [266]:
inv_integral_2023_2_filter = [(inv_integral_2023_2 > 1e-5) & (inv_integral_2023_2 < (1 - 1e-5))][::80]
p = inv_integral_2023_2[*inv_integral_2023_2_filter]
num_rows = len(p)
x=(mixed_years_2
    .filter(pl.col("model_year") == 2023)
    .select("yr_diff")
    .to_series()
    .to_numpy()[1:]
)
x = x[*inv_integral_2023_2_filter]

px.scatter(
    x=x,
    y=p
)

In [267]:
# https://blogs.sas.com/content/iml/2023/02/22/metalog-distribution.html

# Construct basis functions
# https://en.wikipedia.org/wiki/Metalog_distribution#Fitting_to_data
def g_i(i):
    if i == 1:
        g_i_func = lambda p: 1.0
    elif i == 2:
        g_i_func = lambda p: jnp.log(p/(1-p))
    elif i == 3:
        g_i_func = lambda p: (p - 0.5) * jnp.log(p/(1-p))
    elif i == 4:
        g_i_func = lambda p: p - 0.5
    elif (i % 2 == 1) and (i >= 5):
        g_i_func = lambda p: (p - 0.5)**((i - 1)/2)
    elif (i % 2 == 0) and (i >= 6):
        g_i_func = lambda p: (p - 0.5)**(i/2 - 1)
    
    return g_i_func


k = 100
@partial(jit, static_argnames=["k"])
def m(p, metalog_params, k):
    derivs_of_bases = jnp.array(
        list(grad(g_i(i))(p) for i in range(1, k + 1, 1))
    ).T
    return (metalog_params @ derivs_of_bases)**(-1)

M = np.array([
    [g_i(i=1)(p)]*num_rows,
    *list(g_i(i)(p) for i in range(2, k + 1, 1))
]).T
a = M.T @ M
b = M.T @ x
metalog_params_lstsq_result = np.linalg.lstsq(
    a=a,
    b=b
)
print(metalog_params_lstsq_result)
metalog_params = metalog_params_lstsq_result[0]

(array([-5.78131941e-02,  9.65578014e-02, -3.10571482e-02,  9.17505830e-01,
        1.26232463e+00,  1.26231090e+00, -5.27795257e+00, -5.27795132e+00,
       -4.00758890e+01, -4.00758891e+01,  8.85774554e+01,  8.85774554e+01,
        4.93686648e+02,  4.93686648e+02, -4.90894284e+02, -4.90894284e+02,
       -2.32896352e+03, -2.32896352e+03,  7.26630438e+02,  7.26630438e+02,
        3.05640858e+03,  3.05640858e+03,  7.26214613e+02,  7.26214613e+02,
        2.83950045e+03,  2.83950045e+03,  3.88073853e+02,  3.88073853e+02,
        1.44348701e+03,  1.44348701e+03,  1.60545197e+02,  1.60545197e+02,
        5.76277706e+02,  5.76277706e+02,  5.76369818e+01,  5.76369818e+01,
        2.01576273e+02,  2.01576273e+02,  1.89210987e+01,  1.89210987e+01,
        6.49178745e+01,  6.49178745e+01,  5.84401511e+00,  5.84401511e+00,
        1.97714529e+01,  1.97714529e+01,  1.72768610e+00,  1.72768610e+00,
        5.78676329e+00,  5.78676329e+00,  4.94349967e-01,  4.94349967e-01,
        1.64452155e+00, 

In [268]:
inventory = np.empty(shape = (len(p),))
i = 0
for cum_prob in p:
    inventory[i] = m(p=cum_prob, metalog_params=metalog_params, k=k)
    i += 1

In [269]:
def loss(scaling_factor: float, params: Array, yr_diffs, inventory, actual):
    """
    Args:
        params: 
    """
    other_loss = mse(predicted=scaling_factor * inventory, actual=actual)

    return other_loss

In [270]:
scaling_factor = 1678.3

actual = (mixed_years_2
    .filter(pl.col("model_year") == 2023)
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)[1: ][(inv_integral_2023_2 > 1e-5) & (inv_integral_2023_2 < (1 - 1e-5))][::100]

for i in range(150):
    grad_loss = grad(loss)( 
        scaling_factor,
        params,
        yr_diffs,
        inventory,
        actual
    )

    scaling_factor = update(
        params=scaling_factor,
        learning_rate=1,
        grad_loss=grad_loss,
        clip=1e6
    )

    if i % 20 == 0:
        print(f"grad_loss {grad_loss}")
        print(f"scaling_factor: {scaling_factor}")
print(scaling_factor)

fig = px.scatter()

fig.add_scatter(
    x=x,
    y=scaling_factor*inventory,
    mode="markers"
)

fig.add_scatter(
    x=x,
    y=actual,
    mode="markers"
)

fig.show()

TypeError: sub got incompatible shapes for broadcasting: (40,), (3996,).

## Metalog 2022

In [None]:
0.4015 # yr_diff for 2022 median area-under-the-curve
yr_diffs=(mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("yr_diff")
    .to_series()
    .to_numpy()
)

inventory = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("model_year") == 2023))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

actual = (mixed_years_2
    .filter((pl.col("yr_diff").is_between(*yr_diff_filter[2022])) & (pl.col("yr_diff").is_between(*yr_diff_filter[2023])) & (pl.col("model_year") == 2022))
    .select("predicted_inventory")
    .to_series()
    .to_numpy()
)

preds = predict(
    yr_diffs=yr_diffs,
    inventory=inventory,
    params=params
)

for i in range(400):
    grad_loss = grad(loss)( 
        params,
        yr_diffs,
        inventory,
        actual,
        0
    )

    params = update(
        params=params,
        learning_rate=1e-8,
        grad_loss=grad_loss,
        clip=1e6
    )

    if i % 20 == 0:
        print(f"grad_loss {grad_loss}")
        print(f"params: {params}")

# Kernel

In [15]:
fig = px.scatter()
yr_diffs_to_use = dict()
rel_dist_from_2023_pred_inventory = dict()

for yr in range(2022, 2026, 1):
    # We can only compare to 2023 at common times.
    yr_diffs_to_use[yr] = {
        "for_comparison": (mixed_years_2
            .filter((pl.col("model_year") == 2023) & pl.col("yr_diff").is_between(
                max(yr_diff_filter[yr][0], yr_diff_filter[2023][0]), 
                min(yr_diff_filter[yr][1], yr_diff_filter[2023][1])))
            .select("yr_diff")
            .to_series()
            .to_numpy()
        ),
        # 2022 uses 2024
        # 
        "for_prediction": (mixed_years_2
            .filter((pl.col("model_year") == 2023) & pl.col("yr_diff").is_between(
                max(yr_diff_filter[yr][0], yr_diff_filter[2023][0]), 
                min(yr_diff_filter[yr][1], yr_diff_filter[2023][1])))
            .select("yr_diff")
            .to_series()
            .to_numpy()
        )
    }
    
    
    
    
    # Calculate the relative distance from model year 2023's predicted inventory
    # at each time point.
    preds_for_2023_to_use = (mixed_years_2
        .filter((pl.col("model_year") == 2023) & pl.col("yr_diff").is_in(yr_diffs_to_use[yr]))
        .select("predicted_inventory")
        .to_series()
    )

    rel_dist_from_2023_pred_inventory[yr] = np.absolute(
        np.subtract(
            preds_for_2023_to_use, 
            mixed_years_2
                .filter((pl.col("model_year") == yr) & pl.col("yr_diff").is_in(yr_diffs_to_use[yr]))
                .select("predicted_inventory")
                .to_series()
        )
    ) / (preds_for_2023_to_use + 200)
    
    fig.add_scatter(
        x=yr_diffs_to_use[yr],
        y=rel_dist_from_2023_pred_inventory[yr],
        mode="markers",
        name=f"{yr} Predicted"
    )
fig.show()

InvalidOperationError: 'is_in' cannot check for String values in Float64 data

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'filter' failed <---
FILTER [([(col("model_year")) == (2023)]) & (col("yr_diff").is_in([Series]))] FROM
  DF ["yr_diff", "model_year", "predicted_inventory", "predicted_inventory_derivative"]; PROJECT */4 COLUMNS

In [None]:
rel_dist_from_2023_pred_inventory_2024_sub = rel_dist_from_2023_pred_inventory[2024].filter(yr_diffs_to_use[2024] >= yr_diffs_to_use[2022][0]).to_numpy()
rel_dist_from_2023_pred_inventory_2022_sub = rel_dist_from_2023_pred_inventory[2022].filter(yr_diffs_to_use[2022] <= yr_diffs_to_use[2024][-1]).to_numpy()
yr_diffs_to_use_2024_sub = yr_diffs_to_use[2024].filter(yr_diffs_to_use[2024] >= yr_diffs_to_use[2022][0]).to_numpy()

rel_dist_from_2023_pred_inventory_2024_2022_avg = np.mean(
    rel_dist_from_2023_pred_inventory_2024_sub - rel_dist_from_2023_pred_inventory_2022_sub
)

np.float64(0.1838212070033685)

In [79]:
def normal_kernel(x: np.ndarray | float, mu: float=0.0, sigma: float=1.0) -> np.ndarray | float:
    """
    Parameters:
        
    """
    return np.exp(-((x - mu)**2)/(2 * sigma**2))

def weighted_avg(a:float, b:float, distances_from_a: np.ndarray | float, mu=0.0, sigma=1.0):
    a_weights = normal_kernel(x=distances_from_a, mu=mu, sigma=sigma)
    b_weights = 1 - a_weights

    return a * a_weights + b * b_weights

In [89]:
rel_dist_from_2023_pred_inventory_2024_sub_last = rel_dist_from_2023_pred_inventory_2024_sub[-1]
yr_diffs = yr_diffs_to_use[2022].filter(yr_diffs_to_use[2022] > yr_diffs_to_use[2024][-1]).to_numpy()
distances_from_a = yr_diffs - min(yr_diffs)
avgs = weighted_avg(
    a=rel_dist_from_2023_pred_inventory_2024_sub_last,
    b=rel_dist_from_2023_pred_inventory_2024_2022_avg,
    distances_from_a=distances_from_a
)
    
a = preds_for_2023_to_use.filter(yr_diffs_to_use[2023] > yr_diffs_to_use[2024][-1]).to_numpy()
b = a - avgs*(a + 200)
b

ShapeError: filter's length: 4920 differs from that of the series: 1541

In [58]:
rel_dist_from_2023_pred_inventory_2024_sub = rel_dist_from_2023_pred_inventory[2024].filter(yr_diffs_to_use[2024] >= yr_diffs_to_use[2022][0]).to_numpy()
rel_dist_from_2023_pred_inventory_2022_sub = rel_dist_from_2023_pred_inventory[2022].filter(yr_diffs_to_use[2022] <= yr_diffs_to_use[2024][-1]).to_numpy()
yr_diffs_to_use_2024_sub = yr_diffs_to_use[2024].filter(yr_diffs_to_use[2024] >= yr_diffs_to_use[2022][0]).to_numpy()

X = np.column_stack((
    np.ones(shape=len(yr_diffs_to_use_2024_sub)),
    yr_diffs_to_use_2024_sub,
    rel_dist_from_2023_pred_inventory_2024_sub,
    # yr_diffs_to_use_2024_sub * rel_dist_from_2023_pred_inventory_2024_sub
))

res = sm.OLS(
    endog=rel_dist_from_2023_pred_inventory_2022_sub,
    exog=X
).fit()
res.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.842
Model:,OLS,Adj. R-squared:,0.841
Method:,Least Squares,F-statistic:,1155.0
Date:,"Wed, 02 Apr 2025",Prob (F-statistic):,2.86e-174
Time:,10:09:49,Log-Likelihood:,1475.2
No. Observations:,436,AIC:,-2944.0
Df Residuals:,433,BIC:,-2932.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4539,0.051,8.878,0.000,0.353,0.554
x1,-0.1918,0.008,-25.309,0.000,-0.207,-0.177
x2,-0.1612,0.095,-1.701,0.090,-0.347,0.025

0,1,2,3
Omnibus:,42.662,Durbin-Watson:,0.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.202
Skew:,-0.848,Prob(JB):,2.8e-12
Kurtosis:,2.767,Cond. No.,311.0


In [59]:
yr_diffs_to_use_2024_sub_2 = yr_diffs_to_use[2024].filter(yr_diffs_to_use[2024] < yr_diffs_to_use[2022][0]).to_numpy()
rel_dist_from_2023_pred_inventory_2024_sub_2 = rel_dist_from_2023_pred_inventory[2024].filter(yr_diffs_to_use[2024] < yr_diffs_to_use[2022][0]).to_numpy()

X = np.column_stack((
    np.ones(shape=len(yr_diffs_to_use_2024_sub_2)),
    yr_diffs_to_use_2024_sub_2,
    rel_dist_from_2023_pred_inventory_2024_sub_2,
    # yr_diffs_to_use_2024_sub_2 * rel_dist_from_2023_pred_inventory_2024_sub_2
))

rel_dist_from_2023_pred_inventory_2022_pred = res.predict(exog=X)

In [60]:
px.scatter(
    x=np.concat((yr_diffs_to_use_2024_sub_2, yr_diffs_to_use_2024_sub)),
    y=np.concat((rel_dist_from_2023_pred_inventory_2022_pred, rel_dist_from_2023_pred_inventory_2022_sub))
)

In [None]:
# Do kernel regression to get an average across all model years.
derivative_model_2 = KernelReg(
    endog=mixed_years_2.select("predicted_inventory_derivative").to_series().to_numpy(),
    exog=mixed_years_2.select("yr_diff").to_series().to_numpy(),
    var_type="c",
    reg_type="ll",
    bw=[0.04]
)
predicted_inventory_derivative, _ = derivative_model_2.fit()

In [20]:
fig = px.scatter()
fig.add_scatter(
    x=(mixed_years_2
        .select("yr_diff")
        .to_series()
    ),
    y=predicted_inventory_derivative,
    mode="markers",
    name=f"{yr}"
)

fig.update_layout(
    title=dict(text="Predicted Average Inventory Derivative"),
    xaxis=dict(
        title=dict(
            text="calendar_time - (model_year + 6/12) (years)"
        )
    ),
    yaxis=dict(
        title=dict(
            text="Predicted d[Advertised Vehicle Inventory]/dt (thousands/yr)"
        )
    )
)

# Show the figure
fig.show()

In [None]:
np.trapezoid(y=mixed_years_2
           .filter(pl.col("model_year") == 2023)
           .select("predicted_inventory_derivative")
           .to_series().to_numpy(), dx=step)

np.float64(0.0018605435097924783)

In [None]:
# Make sure that the predicted inventory derivative integrates to 0.


Now, adjust for the fact that in recent years, model years have been released earlier and earlier.  See [link](https://en.wikipedia.org/wiki/Model_year#:~:text=Model%20year%20followed%20with%20calendar,that%20the%20vehicle%20was%20manufactured.).

In 1935, the derivative of inventory can become positive at `calendar_time - (model_year + 6/12) = -0.417`.  In 1965, the derivative of inventory can become positive at `calendar_time - (model_year + 6/12) = -1.167`.

In [28]:
fig = px.scatter()
fig.add_scatter(
    x=(mixed_years_2
        .select("yr_diff")
        .to_series()
        + 0.5
    ),
    y=predicted_inventory_derivative,
    mode="markers",
    name=f"{yr}"
)

fig.update_layout(
    title=dict(text="Predicted Average Inventory Derivative (Adjusted)"),
    xaxis=dict(
        title=dict(
            text="calendar_time - (model_year + 6/12) (years)"
        )
    ),
    yaxis=dict(
        title=dict(
            text="Predicted d[Advertised Vehicle Inventory]/dt (thousands/yr)"
        )
    )
)

# Show the figure
fig.show()