In [1]:
from pathlib import Path

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer
from sklearn.linear_model import Ridge

# Exploratory Data Analysis and Feature Engineering

We have data on model_year, calendar_year, and advertised_inventory.

In [2]:
# Locate the raw CSV relative to the notebook and register a lazy scan of it.
mixed_years_path = (
    Path("..")
    / "raw_data"
    / "model_year_vs_calendar_year"
    / "model_year_vs_calendar_year.csv"
)
mixed_years = pl.scan_csv(mixed_years_path)

# Materialise the scan and preview the first rows.
mixed_years.collect().head()

model_year,calendar_year,advertised_inventory,model_year_is_previous_calendar_year,model_year_is_calendar_year,model_year_is_next_calendar_year
i64,f64,f64,i64,i64,i64
2022,2020.5,0.0,0,0,0
2022,2020.9,0.0,0,0,0
2022,2022.420572,935.612174,0,1,0
2022,2022.441428,928.768412,0,1,0
2022,2022.462284,920.898087,0,1,0


Create a new variable called `yr_diff` to represent how far ahead the midpoint of the model year (`model_year + 0.5`) is of the `calendar_year`, in years.

In [29]:
# NOTE(review): scratch arithmetic left over from exploration; no visible cell
# uses its result. Presumably a sanity check of the relation
# yr_diff = model_year + 0.5 - calendar_year for model year 2023 -- TODO
# confirm the intent or remove this cell.
-1.9-0.5-2023


-2025.4

In [None]:
# Derive yr_diff: how far the midpoint of the model year (model_year + 6/12)
# lies ahead of the observation's calendar_year, in years.
#
# The frame is rebuilt from the CSV scan instead of from `mixed_years` itself.
# This keeps the cell idempotent: the select below drops `calendar_year`, so
# re-running a version that reads its own output fails on the missing column.
mixed_years = (
    pl.scan_csv(mixed_years_path)
        .select(
            "model_year",
            (pl.col("model_year") + 6/12 - pl.col("calendar_year")).alias("yr_diff"),
            "advertised_inventory"
        )
)

# Also, pad in some more 0s for easier modeling later: ten zero-inventory rows
# for model year 2023 placed at both extremes of the yr_diff range.
zero_padding = pl.DataFrame(
    {
        "model_year": [2023]*10,
        "yr_diff": [-1.95, -2.0, -2.05, 1.65, 1.70, 1.75, 1.8, 1.85, 1.9, 1.95],
        "advertised_inventory": [0.0]*10,
    }
)

# vstack needs an eager frame, so collect, append the padding, and go back to
# a lazy frame so later cells can keep calling .collect() on `mixed_years`.
mixed_years = (
    mixed_years
    .collect()
    .vstack(zero_padding)
    .lazy()
)

Except for some fluctuations in year-to-year amplitudes, the pattern of inventory rise and fall for different model years appears remarkably consistent. Inventories for each model year peak around July of the same calendar year.

In [5]:
# Materialise the lazy frame once; all three plotted columns come from it.
inventory_points = mixed_years.collect()

# Axis and legend captions, keyed by the px.scatter argument they describe.
axis_labels = {
    "x": "model_year + 0.5 - calendar_year (years)",
    "y": "Advertised Vehicle Inventory (thousands)",
    "color": "Model Year"
}

# Model year is cast to a string so each year is coloured as its own category.
px.scatter(
    x=inventory_points.select("yr_diff").to_series(),
    y=inventory_points.select("advertised_inventory").to_series(),
    color=inventory_points.select(pl.col("model_year").cast(pl.Utf8)).to_series(),
    title="The Rise and Fall of Vehicle Inventories",
    labels=axis_labels
)

  sf: grouped.get_group(s if len(s) > 1 else s[0])


# Spline Regression

In [89]:
# Hand-picked knot positions along yr_diff, sorted and shaped as a single
# column, i.e. (n_knots, 1), for the one input feature.
# Alternatives tried earlier and set aside: n_knots=50, knots="quantile",
# knots taken from the 2023 yr_diff values outside (-1.25, -1.18), and an
# np.linspace(-2, 2, num=116) grid with extra values inserted.
spline_knots = np.sort(
    np.array([
        -2.1, -1.15, -1.05, -1, -0.95, -0.5, -0.1, -0.03, 0,
        0.03, 0.05, 0.09, 0.12, 0.15, 0.18, 0.2, 0.244, 0.3, 0.5,
        0.73, 0.75, 0.95, 1, 1.02, 1.08, 2
    ])
).reshape(-1, 1)

# Cubic spline basis over those knots; constant extrapolation beyond them.
spline_basis = SplineTransformer(
    degree=3,
    knots=spline_knots,
    extrapolation="constant",
    include_bias=True
)

# Ridge regression on the spline features (alpha=1e-5 was also tried).
ridge_regressor = Ridge(
    alpha=1e-4,
    fit_intercept=True
)

# Create a pipeline incorporating splines and ridge regression.
model_1 = make_pipeline(spline_basis, ridge_regressor)

# Training rows: model year 2023 only (an extra yr_diff > -1.3 filter was
# tried and dropped). Collected once so X and y come from the same snapshot.
training_rows = (
    mixed_years
    .filter(pl.col("model_year") == 2023)
    .collect()
)

# Fit the model.
model_1.fit(
    X=training_rows.select("yr_diff"),
    y=training_rows.select("advertised_inventory")
)

In [90]:
# Monthly grid of yr_diff values, from -3 to +3 years, on which the fitted
# model is evaluated.
a = 36 * (-1.0/12.0)
b = 36 * (1.0/12.0)
step = 1.0/12.0
# Round the interval count before adding the endpoint. int() truncates, so a
# quotient that lands a hair below a whole number because of floating-point
# round-off would silently drop one grid point.
yr_diffs = np.linspace(a, b, num=round((b - a)/step) + 1)

mixed_years_2 = pl.DataFrame(
    data={
        "yr_diff": yr_diffs
    }
)

# flatten() collapses the prediction array to 1-D so it can be stored as a
# single column next to yr_diff.
mixed_years_2 = mixed_years_2.with_columns(
    pl.lit(model_1.predict(X=mixed_years_2.select("yr_diff")).flatten()).alias("predicted_inventory")
)

# Observed 2023 rows, collected once and reused for both axes.
actual_2023 = mixed_years.filter(pl.col("model_year") == 2023).collect()

fig = px.scatter()

fig.add_scatter(
    x=mixed_years_2.select("yr_diff").to_series(),
    y=mixed_years_2.select("predicted_inventory").to_series(),
    mode="markers",
    name="Predicted Advertised Vehicle Inventory"
)

fig.add_scatter(
    x=actual_2023.select("yr_diff").to_series(),
    y=actual_2023.select("advertised_inventory").to_series(),
    mode="markers",
    name="Actual Advertised Vehicle Inventory"
)

# Title and axis captions so the figure stands on its own; the axis wording
# matches the exploratory scatter plot earlier in the notebook.
fig.update_layout(
    title="Predicted vs. Actual Advertised Inventory (Model Year 2023)",
    xaxis_title="model_year + 0.5 - calendar_year (years)",
    yaxis_title="Advertised Vehicle Inventory (thousands)"
)

fig.show()