In [19]:
from pathlib import Path
from functools import partial

import numpy as np
from skfda.representation.grid import FDataGrid
from skfda.representation.basis import BSplineBasis
from skfda.preprocessing.dim_reduction import FPCA
from skfda.misc.regularization import L2Regularization
from skfda.misc.operators import LinearDifferentialOperator
from skfda.exploratory.stats import (std, gmean, fisher_rao_karcher_mean)
from skfda.preprocessing.smoothing import (KernelSmoother, BasisSmoother)
from skfda.preprocessing.registration import FisherRaoElasticRegistration
from skfda.misc.kernels import normal
from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix
from skfda.ml.regression import LinearRegression

from jax import (grad, jit, Array, vjp)
from jax.lax import cond
import jax.numpy as jnp
from scipy.interpolate import (make_lsq_spline, BSpline, Akima1DInterpolator)
from scipy.integrate import cumulative_trapezoid
from statsmodels.nonparametric.kernel_regression import KernelReg
import statsmodels.api as sm
import polars as pl
import plotly.express as px

# Exploratory Data Analysis and Feature Engineering

We have data on model_year, calendar_time, and advertised_inventory.

In [2]:
# Import data
mixed_years_path = Path("..", "raw_data", "model_year_vs_calendar_time", "model_year_vs_calendar_time.csv")
mixed_years = pl.scan_csv(
    source=mixed_years_path,
)
# View head of data
mixed_years.collect().head()

model_year,calendar_time,advertised_inventory,model_year_is_previous_calendar_year,model_year_is_calendar_year,model_year_is_next_calendar_year
i64,f64,f64,i64,i64,i64
2022,2020.5,0.0,0,0,0
2022,2020.9,0.0,0,0,0
2022,2022.420572,935.612174,0,1,0
2022,2022.441428,928.768412,0,1,0
2022,2022.462284,920.898087,0,1,0


Create a new variable called yr_diff to represent how far ahead the calendar_time is from the model_year.

In [3]:
mixed_years = (
    mixed_years
        .select(
            "model_year",
            # Use the midpoint rule.
            (pl.col("calendar_time") - (pl.col("model_year") + 6/12)).alias("yr_diff"),
            "advertised_inventory"
        )
)

# Also, pad in some more 0s for easier modeling later.
mixed_years = (mixed_years
    .collect()
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2022]*10, 
                "yr_diff": np.linspace(2.05, 2.19, 10),
                "advertised_inventory": [0.0]*10,
            }
        )
    )
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2023]*11, 
                "yr_diff": [1.95, 2.0, 2.05, -1.65, -1.70, -1.75, -1.8, -1.85, -1.9, -1.95, 2.19],
                "advertised_inventory": [0.0]*11,
            }
        )
    )
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2024], 
                "yr_diff": [2.19],
                "advertised_inventory": [0.0],
            }
        )
    )
    .vstack(
        pl.DataFrame(
            {
                "model_year": [2025], 
                "yr_diff": [2.19],
                "advertised_inventory": [0.0],
            }
        )
    )
    .lazy()
)

## Plot Data

In [4]:
x = mixed_years.select("yr_diff").collect().to_series().to_numpy()
y = mixed_years.select("advertised_inventory").collect().to_series().to_numpy()
color = mixed_years.select(pl.col("model_year").cast(pl.Utf8)).collect().to_series().to_numpy()

px.scatter(
    x=x,
    y=y,
    color=color,
    title="The Rise and Fall of Vehicle Inventories",
    labels={
        "x": "calendar_time - (model_year + 6/12) (years)",
        "y": "Advertised Vehicle Inventory (thousands)",
        "color": "Model Year"
    }
)

  sf: grouped.get_group(s if len(s) > 1 else s[0])


Except for some fluctations in year-to-year amplitudes, the pattern of inventory rise and fall for different model years appears remarkably consistent.  Inventories for each model year peak around July 1st of the same calendar year.

# Predict the Average Inventory Curve
We will combine our data from the different model years to predict an average inventory curve that can be differentiated and used for prior model years. 
## Interpolate using modified Akima piecewise cubic Hermite interpolation


In [5]:
# For each model year that we have data (2022 through 2025),
# fit a separate spline regression.  Do this so that monthly
# inventory counts can be predicted despite not having
# direct access to this information.  (The only information
# that we have to go on is the graph from www.spglobal.com/mobility)
# Use the splines to interpolate values.  

model_yr_range = range(2022, 2026, 1)
inventory_model = dict.fromkeys(model_yr_range)

# Create some evenly space yr_diffs for evaluating each model.
# These same yr_diffs can be used for other model years
# to compare apples to apples.
step = 1.0/1200.0
a = -2.0
b = 2.19

yr_diffs_for_prediction = np.linspace(a, b, num=int((b - a)/step + 1))

# For each year, train and predict on only some of the yr_diffs.
# Actually, predict for all of the yr_diffs first and then
# go back and change predictions to NAs for predictions
# based on yr_diffs outside of the yr_diff_filter.
yr_diff_filter = {
    2022: (-0.079428, 2.19),
    2023: (-2.0, 2.19),
    2024: (-2.0, 0.283906),
    2025: (-2.0, -0.716094)
}

# Create a data-frame to store the predictions from each model.
mixed_years_2 = pl.DataFrame()

# Make a model for each model_year and fit the model.
for yr in model_yr_range:
    # For the current yr, get the yr_diffs to train and predict on.
    a, b = yr_diff_filter[yr]

    # Get the yr_diffs for training.
    yr_diffs_for_training = (mixed_years
        .filter(pl.col("model_year") == yr)
        .sort(by=pl.col("yr_diff"))
        .select("yr_diff")
        .collect()
        .to_series()
        .to_numpy()
    )

    # Get the response values for training.
    y = (mixed_years
        .filter(pl.col("model_year") == yr)
        .sort(by=pl.col("yr_diff"))
        .select("advertised_inventory")
        .collect()
        .to_series()
        .to_numpy()
    )
    
    # Prepare to fit model.
    # Fit model.
    current_yr_inventory_model = Akima1DInterpolator(
        x=yr_diffs_for_training,
        y=y,
        method="makima",
        extrapolate=True
    )

    # Get the predictions.
    preds = current_yr_inventory_model(yr_diffs_for_prediction)

    # Change some of the predictions to missing values 
    # if they are from extrapolating yr_diffs.
    preds = np.where(
        (yr_diffs_for_prediction < a) | (yr_diffs_for_prediction > b),
        np.nan,
        preds
    )

    # Replace negative values generated by the spline regression.
    # (I cannot figure out how to put constraints on the spline regression
    # other than to just fix the predictions like I do here.)
    preds = np.where(preds < 0, 0, preds)

    # Save the fitted model.
    inventory_model[yr] = current_yr_inventory_model

    # Put the predictions from current_yr_inventory_model into a new data-frame.
    mixed_years_current_preds = pl.LazyFrame(
        data={
            "yr_diff": yr_diffs_for_prediction,
            "model_year": [yr]*len(yr_diffs_for_prediction),
            "predicted_inventory": preds
        },
        schema={
            "yr_diff": pl.Float64,
            "model_year": pl.Int32,
            "predicted_inventory": pl.Float64
        }
    )
    
    # Save mixed_years_current_preds for the current model_year.
    mixed_years_2 = (mixed_years_2
        .vstack(mixed_years_current_preds.collect())     
    )

# Make it lazy.
mixed_years_2 = (mixed_years_2
    .sort(by=["model_year", "yr_diff"])
    .lazy()
)

## Plot of Interpolations

In [6]:
fig = px.scatter()
for yr in model_yr_range:
    # Make a plot of the original training data alongside the predictions.
    x_even = (mixed_years_2
        .filter(pl.col("model_year") == yr)
        .select("yr_diff")
        .collect()
        .to_series()
    )

    y_pred = (mixed_years_2
        .filter(pl.col("model_year") == yr)
        .select("predicted_inventory")
        .collect()
        .to_series()
    )

    fig.add_scatter(
        x=x_even,
        y=y_pred,
        mode="markers",
        name=f"{yr} Predicted"
    )

    x_original = (mixed_years
        .filter(pl.col("model_year") == yr)
        .select("yr_diff")
        .collect()
        .to_series()
    )

    y_act = (mixed_years
        .filter(pl.col("model_year") == yr)
        .select("advertised_inventory")
        .collect()
        .to_series()
    )

    fig.add_scatter(
        x=x_original,
        y=y_act,
        mode="markers",
        name=f"{yr} Actual"
    )

    # https://plotly.com/python/figure-labels/
    fig.update_layout(
        title=dict(text="Curve Fitting for Different Model Years"),
        xaxis=dict(
            title=dict(
                text="calendar_time - (model_year + 6/12) (years)"
            )
        ),
        yaxis=dict(
            title=dict(
                text="Advertised Vehicle Inventory (thousands)"
            )
        )
    )

fig.show()

## Extrapolate using Functional Data Analysis

In [7]:
# For each model year, figure out where
# our data ends.
ends = (mixed_years_2
    .filter(pl.col("predicted_inventory").is_not_nan())
    .filter(
        (
            (pl.col("model_year") == 2022) & (pl.col("yr_diff") == pl.col("yr_diff").min().over("model_year"))
        )
        |
        (
            (pl.col("model_year") == 2024) & (pl.col("yr_diff") == pl.col("yr_diff").max().over("model_year"))
        )
        |
        (
            (pl.col("model_year") == 2025) & (pl.col("yr_diff") == pl.col("yr_diff").max().over("model_year"))
        )
    )
)

ends.collect()

yr_diff,model_year,predicted_inventory
f64,i32,f64
-0.078785,2022,935.574723
0.283788,2024,1926.95002
-0.716411,2025,1169.16697


In [8]:
ratios = (ends
    .join(
        other=(mixed_years_2
        ),
        on=["yr_diff"]
    )
    .filter((pl.col("predicted_inventory_right").is_not_nan()))
    .with_columns(
        (pl.col("predicted_inventory") / pl.col("predicted_inventory_right")).alias("predicted_inventory_right_ratio")
    )
    .filter(pl.col("predicted_inventory_right_ratio") != 1)
    .select("model_year", "model_year_right", "predicted_inventory_right_ratio")
    .sort(by="model_year")
)

ratios.collect()

model_year,model_year_right,predicted_inventory_right_ratio
i32,i32,f64
2022,2023,0.594405
2022,2024,0.36869
2024,2022,2.502214
2024,2023,1.618999
2025,2023,2.287845
2025,2024,1.150884


### 2022

In [9]:
data_matrix = np.array(
    list(
        (mixed_years_2
            .filter(
                (pl.col("model_year") == yr) & 
                (pl.col("yr_diff") <= ends.filter(pl.col("model_year") == 2024).select("yr_diff").collect().item())
            )
            .sort(by="yr_diff")
            .select("predicted_inventory")
            .collect()
            .to_series()
            .to_numpy()
            .T
        ) for yr in [2023, 2024]
    )
)

grid_points = (mixed_years_2
    .filter(
        (pl.col("model_year") == 2024) & 
        (pl.col("yr_diff") <= ends.filter(pl.col("model_year") == 2024).select("yr_diff").collect().item())
    )
    .sort(by="yr_diff")
    .select("yr_diff")
    .collect()
    .to_series()
    .to_numpy()
)

# Create a FDataGrid object for use in functional data analysis.
fd_grid_obj_1 = FDataGrid(
    data_matrix=data_matrix,
    grid_points=grid_points
)

mean_inventory = fisher_rao_karcher_mean(
    fdatagrid=fd_grid_obj_1
)

In [10]:
yr_diff_cutoff = (ends
    .filter(pl.col("model_year") == 2022)
    .select("yr_diff")
    .collect()
    .item()
)

# Use the appropriately scaled 
# mean inventory curve for 2023 and 2024 to impute for 2022.
mean_inventories = mean_inventory.data_matrix.ravel()
subset_1 = (mean_inventory.grid_points[0] == yr_diff_cutoff)
mean_for_scaling = mean_inventories[subset_1]

subset_2 = mean_inventory.grid_points[0] < yr_diff_cutoff
extrapolations_1 = mean_inventories[subset_2]

extrapolations_2 = (mixed_years_2
    .filter((pl.col("model_year") == 2023) & (pl.col("yr_diff") < yr_diff_cutoff))
    .sort(by="yr_diff")
    .select("predicted_inventory")  
    .collect()
    .to_series()              
)

# Determine the proper scaling factor.
scaling_1 = ends.filter(pl.col("model_year") == 2022).select("predicted_inventory").collect().item() \
/ mean_for_scaling

scaling_2 = (ratios
    .filter((pl.col("model_year") == 2022) & (pl.col("model_year_right") == 2023))
    .select("predicted_inventory_right_ratio")
    .collect()
    .item()
)

# Do a weighted average of the Karcher mean with 
# the 2023 curve to extrapolate the 2022 curve.
w1 = 1
w2 = 1

extrapolations_3 = (w1 * scaling_1 * extrapolations_1 + w2 * scaling_2 * extrapolations_2) / (w1 + w2)

mixed_years_3 = (mixed_years_2
    .with_columns(
        pl.lit(None).alias("extrapolation_method")
    )
)

mixed_years_3 = (mixed_years_3
    .filter((pl.col("model_year") == 2022) & (pl.col("predicted_inventory").is_nan()))
    .select("yr_diff", "model_year")
    .with_columns(
        pl.lit(pl.Series(extrapolations_3)).alias("predicted_inventory"),
        pl.lit("scaled_karcher_mean").alias("extrapolation_method")
    )
    .collect()
    .vstack(
        other=(mixed_years_3
            .filter((pl.col("model_year") != 2022) | (pl.col("predicted_inventory").is_not_nan()))
            .collect()
        )
    )
    .lazy()
)

mixed_years_3.collect()

yr_diff,model_year,predicted_inventory,extrapolation_method
f64,i32,f64,str
-2.0,2022,0.0,"""scaled_karcher_mean"""
-1.999167,2022,0.0,"""scaled_karcher_mean"""
-1.998333,2022,0.0,"""scaled_karcher_mean"""
-1.9975,2022,0.0,"""scaled_karcher_mean"""
-1.996666,2022,0.0,"""scaled_karcher_mean"""
…,…,…,…
2.186666,2025,,
2.1875,2025,,
2.188333,2025,,
2.189167,2025,,


### 2024

In [11]:
data_matrix = np.array(
    list(
        (mixed_years_2
            .filter(
                (pl.col("model_year") == yr) & 
                (pl.col("yr_diff") >= ends.filter(pl.col("model_year") == 2022).select("yr_diff").collect().item())
            )
            .sort(by="yr_diff")
            .select("predicted_inventory")
            .collect()
            .to_series()
            .to_numpy()
            .T
        ) for yr in [2022, 2023]
    )
)

grid_points = (mixed_years_2
    .filter(
        (pl.col("model_year") == 2022) & 
        (pl.col("yr_diff") >= ends.filter(pl.col("model_year") == 2022).select("yr_diff").collect().item())
    )
    .sort(by="yr_diff")
    .select("yr_diff")
    .collect()
    .to_series()
    .to_numpy()
)

# Create a FDataGrid object for use in functional data analysis.
fd_grid_obj = FDataGrid(
    data_matrix=data_matrix,
    grid_points=grid_points
)

mean_inventory = fisher_rao_karcher_mean(
    fdatagrid=fd_grid_obj
)

In [12]:
yr_diff_cutoff = (ends
    .filter(pl.col("model_year") == 2024)
    .select("yr_diff")
    .collect()
    .item()
)

# Use the appropriately scaled 
# mean inventory curves for 2022 and 2023 to impute for 2024.
mean_inventories = mean_inventory.data_matrix.ravel()
subset_1 = (mean_inventory.grid_points[0] == yr_diff_cutoff)
mean_for_scaling = mean_inventories[subset_1]

subset_2 = mean_inventory.grid_points[0] > yr_diff_cutoff
extrapolations_1 = mean_inventories[subset_2]

extrapolations_2 = (mixed_years_2
    .filter((pl.col("model_year") == 2023) & (pl.col("yr_diff") > yr_diff_cutoff))
    .sort(by="yr_diff")
    .select("predicted_inventory")  
    .collect()
    .to_series()              
)

# Determine the proper scaling factor.
scaling_1 = ends.filter(pl.col("model_year") == 2024).select("predicted_inventory").collect().item() \
/ mean_for_scaling

scaling_2 = (ratios
    .filter((pl.col("model_year") == 2024) & (pl.col("model_year_right") == 2023))
    .select("predicted_inventory_right_ratio")
    .collect()
    .item()
)

# Do a weighted average of the Karcher mean with 
# the 2023 curve to extrapolate the 2024 curve.
w1 = 1
w2 = 1

extrapolations_3 = (w1 * scaling_1 * extrapolations_1 + w2 * scaling_2 * extrapolations_2) / (w1 + w2)

mixed_years_3 = (mixed_years_3
    .filter((pl.col("model_year") == 2024) & (pl.col("predicted_inventory").is_nan()))
    .select("yr_diff", "model_year")
    .with_columns(
        pl.lit(pl.Series(extrapolations_3)).alias("predicted_inventory"),
        pl.lit("scaled_karcher_mean").alias("extrapolation_method")
    )
    .collect()
    .vstack(
        other=(mixed_years_3
            .filter((pl.col("model_year") != 2024) | (pl.col("predicted_inventory").is_not_nan()))
            .collect()
        )
    )
    .lazy()
)

mixed_years_3.collect()

yr_diff,model_year,predicted_inventory,extrapolation_method
f64,i32,f64,str
0.284621,2024,1924.400244,"""scaled_karcher_mean"""
0.285455,2024,1921.823531,"""scaled_karcher_mean"""
0.286288,2024,1919.237741,"""scaled_karcher_mean"""
0.287122,2024,1916.642469,"""scaled_karcher_mean"""
0.287955,2024,1914.037311,"""scaled_karcher_mean"""
…,…,…,…
2.186666,2025,,
2.1875,2025,,
2.188333,2025,,
2.189167,2025,,


### 2025

In [13]:
data_matrix = np.array(
    list(
        (mixed_years_3
            .filter(
                (pl.col("model_year") == yr) & 
                (pl.col("yr_diff") >= ends.filter(pl.col("model_year") == 2025).select("yr_diff").collect().item())
            )
            .sort(by="yr_diff")
            .select("predicted_inventory")
            .collect()
            .to_series()
            .to_numpy()
            .T
        ) for yr in [2022, 2023, 2024]
    )
)

grid_points = (mixed_years_3
    .filter(
        (pl.col("model_year") == 2023) & 
        (pl.col("yr_diff") >= ends.filter(pl.col("model_year") == 2025).select("yr_diff").collect().item())
    )
    .sort(by="yr_diff")
    .select("yr_diff")
    .collect()
    .to_series()
    .to_numpy()
)

# Create a FDataGrid object for use in functional data analysis.
fd_grid_obj = FDataGrid(
    data_matrix=data_matrix,
    grid_points=grid_points
)


In [14]:
mean_inventory = fisher_rao_karcher_mean(
    fdatagrid=fd_grid_obj
)

In [15]:
yr_diff_cutoff = (ends
    .filter(pl.col("model_year") == 2025)
    .select("yr_diff")
    .collect()
    .item()
)

# Use the appropriately scaled mean_inventories as well as the
# inventory curves for 2022, 2023, and 2024 to impute for 2025.
mean_inventories = mean_inventory.data_matrix.ravel()
subset_1 = (mean_inventory.grid_points[0] == yr_diff_cutoff)
mean_for_scaling = mean_inventories[subset_1]

subset_2 = mean_inventory.grid_points[0] > yr_diff_cutoff
extrapolations_1 = mean_inventories[subset_2]

extrapolations_2 = (mixed_years_3
    .filter((pl.col("model_year") == 2024) & (pl.col("yr_diff") > yr_diff_cutoff))
    .sort(by="yr_diff")
    .select("predicted_inventory")  
    .collect()
    .to_series()              
)

# Determine the proper scaling factor.
scaling_1 = ends.filter(pl.col("model_year") == 2025).select("predicted_inventory").collect().item() \
/ mean_for_scaling

scaling_2 = (ratios
    .filter((pl.col("model_year") == 2025) & (pl.col("model_year_right") == 2024))
    .select("predicted_inventory_right_ratio")
    .collect()
    .item()
)

# Do a weighted average of the Karcher mean with 
# the 2024 curve to extrapolate the 2025 curve.
w1 = 1
w2 = 1

extrapolations_3 = (w1 * scaling_1 * extrapolations_1 + w2 * scaling_2 * extrapolations_2) / (w1 + w2)

mixed_years_3 = (mixed_years_3
    .filter((pl.col("model_year") == 2025) & (pl.col("predicted_inventory").is_nan()))
    .select("yr_diff", "model_year")
    .with_columns(
        pl.lit(pl.Series(extrapolations_3)).alias("predicted_inventory"),
        pl.lit("scaled_karcher_mean").alias("extrapolation_method")
    )
    .collect()
    .vstack(
        other=(mixed_years_3
            .filter((pl.col("model_year") != 2025) | (pl.col("predicted_inventory").is_not_nan()))
            .collect()
        )
    )
    .lazy()
)

mixed_years_3.collect()

yr_diff,model_year,predicted_inventory,extrapolation_method
f64,i32,f64,str
-0.715578,2025,1173.538475,"""scaled_karcher_mean"""
-0.714744,2025,1177.901279,"""scaled_karcher_mean"""
-0.713911,2025,1182.256297,"""scaled_karcher_mean"""
-0.713077,2025,1186.604658,"""scaled_karcher_mean"""
-0.712244,2025,1190.947491,"""scaled_karcher_mean"""
…,…,…,…
-0.719745,2025,1162.967831,
-0.718912,2025,1165.337221,
-0.718078,2025,1167.202966,
-0.717245,2025,1168.500928,


### Average over all years

In [16]:
data_matrix = np.array(
    list(
        (mixed_years_3
            .sort(by="yr_diff")
            .select("predicted_inventory")
            .collect()
            .to_series()
            .to_numpy()
            .T
        ) for yr in [2022, 2023, 2024, 2025]
    )
)

grid_points = (mixed_years_3
    .sort(by="yr_diff")
    .select("yr_diff")
    .collect()
    .to_series()
    .to_numpy()
)

# Create a FDataGrid object for use in functional data analysis.
fd_grid_obj = FDataGrid(
    data_matrix=data_matrix,
    grid_points=grid_points
)


In [None]:
# basis = BSplineBasis(n_basis=100)

# regularization = L2Regularization(
#     LinearDifferentialOperator(2),
# )

In [None]:
# smoother = BasisSmoother(
#     basis=basis,
#     regularization=regularization
# )
kernel_estimator = NadarayaWatsonHatMatrix(
    bandwidth=0.05,
    kernel=normal
)
smoother = KernelSmoother(
    kernel_estimator=kernel_estimator
)

fd_grid_obj_2 = smoother.fit_transform(X=fd_grid_obj)

In [36]:
px.scatter(y = fd_grid_obj_2.data_matrix[0, :, :].ravel())

In [20]:
mean_inventory = gmean(fd_grid_obj)

In [24]:
px.scatter(y=mean_inventory.data_matrix.ravel())

In [18]:
mean_inventory = fisher_rao_karcher_mean(
    fdatagrid=fd_grid_obj,
    penalty=100,
    grid_dim=1
)

LinAlgError: Singular matrix

In [None]:
yr_diff_cutoff = (ends
    .filter(pl.col("model_year") == 2025)
    .select("yr_diff")
    .collect()
    .item()
)

# Use the appropriately scaled mean_inventories as well as the
# inventory curves for 2022, 2023, and 2024 to impute for 2025.
mean_inventories = mean_inventory.data_matrix.ravel()
subset_1 = (mean_inventory.grid_points[0] == yr_diff_cutoff)
mean_for_scaling = mean_inventories[subset_1]

subset_2 = mean_inventory.grid_points[0] > yr_diff_cutoff
extrapolations_1 = mean_inventories[subset_2]

extrapolations_2 = (mixed_years_3
    .filter((pl.col("model_year") == 2024) & (pl.col("yr_diff") > yr_diff_cutoff))
    .sort(by="yr_diff")
    .select("predicted_inventory")  
    .collect()
    .to_series()              
)

# Determine the proper scaling factor.
scaling_1 = ends.filter(pl.col("model_year") == 2025).select("predicted_inventory").collect().item() \
/ mean_for_scaling

scaling_2 = (ratios
    .filter((pl.col("model_year") == 2025) & (pl.col("model_year_right") == 2024))
    .select("predicted_inventory_right_ratio")
    .collect()
    .item()
)

# Do a weighted average of the Karcher mean with 
# the 2024 curve to extrapolate the 2025 curve.
w1 = 1
w2 = 1

extrapolations_3 = (w1 * scaling_1 * extrapolations_1 + w2 * scaling_2 * extrapolations_2) / (w1 + w2)

mixed_years_3 = (mixed_years_3
    .filter((pl.col("model_year") == 2025) & (pl.col("predicted_inventory").is_nan()))
    .select("yr_diff", "model_year")
    .with_columns(
        pl.lit(pl.Series(extrapolations_3)).alias("predicted_inventory"),
        pl.lit("scaled_karcher_mean").alias("extrapolation_method")
    )
    .collect()
    .vstack(
        other=(mixed_years_3
            .filter((pl.col("model_year") != 2025) | (pl.col("predicted_inventory").is_not_nan()))
            .collect()
        )
    )
    .lazy()
)

mixed_years_3.collect()

yr_diff,model_year,predicted_inventory,extrapolation_method
f64,i32,f64,str
-0.715578,2025,1173.538475,"""scaled_karcher_mean"""
-0.714744,2025,1177.901279,"""scaled_karcher_mean"""
-0.713911,2025,1182.256297,"""scaled_karcher_mean"""
-0.713077,2025,1186.604658,"""scaled_karcher_mean"""
-0.712244,2025,1190.947491,"""scaled_karcher_mean"""
…,…,…,…
-0.719745,2025,1162.967831,
-0.718912,2025,1165.337221,
-0.718078,2025,1167.202966,
-0.717245,2025,1168.500928,


### Plot

In [17]:
fig = px.scatter()
for yr in model_yr_range:
    # Make a plot of the original training data alongside the predictions.
    x = (mixed_years_3
        .filter((pl.col("model_year") == yr) & (pl.col("extrapolation_method").is_null()))
        .select("yr_diff")
        .collect()
        .to_series()
    )

    y = (mixed_years_3
        .filter((pl.col("model_year") == yr) & (pl.col("extrapolation_method").is_null()))
        .select("predicted_inventory")
        .collect()
        .to_series()
    )

    fig.add_scatter(
        x=x,
        y=y,
        mode="markers",
        name=f"{yr} Interpolated"
    )

    x = (mixed_years_3
        .filter((pl.col("model_year") == yr) & (pl.col("extrapolation_method") == "scaled_karcher_mean"))
        .select("yr_diff")
        .collect()
        .to_series()
    )

    y = (mixed_years_3
        .filter((pl.col("model_year") == yr) & (pl.col("extrapolation_method") == "scaled_karcher_mean"))
        .select("predicted_inventory")
        .collect()
        .to_series()
    )

    fig.add_scatter(
        x=x,
        y=y,
        mode="markers",
        name=f"{yr} Extrapolated using scaled_karcher_mean"
    )

    # https://plotly.com/python/figure-labels/
    fig.update_layout(
        title=dict(text="Curve Fitting for Different Model Years"),
        xaxis=dict(
            title=dict(
                text="calendar_time - (model_year + 6/12) (years)"
            )
        ),
        yaxis=dict(
            title=dict(
                text="Advertised Vehicle Inventory (thousands)"
            )
        )
    )

fig.show()

# Adjust Average Inventory Derivative

Now, adjust for the fact that in recent years, model years have been released earlier and earlier.  See [link](https://en.wikipedia.org/wiki/Model_year#:~:text=Model%20year%20followed%20with%20calendar,that%20the%20vehicle%20was%20manufactured.).

In 1935, the derivative of inventory can become positive at `calendar_time - (model_year + 6/12) = -0.417`.  In 1965, the derivative of inventory can become positive at `calendar_time - (model_year + 6/12) = -1.167`.