In [160]:
from pathlib import Path

import numpy as np
import polars as pl
import plotly.express as px
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (ConstantKernel, RBF, WhiteKernel, ExpSineSquared)


In [118]:
# Location of the raw CSV, relative to the notebook's working directory.
mixed_years_path = (
    Path("..")
    / "raw_data"
    / "model_year_vs_calendar_year"
    / "model_year_vs_calendar_year.csv"
)

# Lazy scan: nothing is read until .collect() is called on a query.
mixed_years = pl.scan_csv(source=mixed_years_path)

# Preview the first rows to check column names and dtypes.
mixed_years.collect().head()

model_year,calendar_year,advertised_inventory,model_year_is_previous_calendar_year,model_year_is_calendar_year,model_year_is_next_calendar_year
i64,f64,f64,i64,i64,i64
2022,2022.420572,935.612174,0,1,0
2022,2022.441428,928.768412,0,1,0
2022,2022.462284,920.898087,0,1,0
2022,2022.483139,913.027761,0,1,0
2022,2022.503995,908.579316,0,1,0


In [137]:
# Rows flagged model_year_is_previous_calendar_year == 1.
# Materialise the filtered rows once and reuse them for both axes, rather
# than executing the same lazy query (and CSV scan) twice.
previous_calendar_year_rows = (
    mixed_years
    .filter(pl.col("model_year_is_previous_calendar_year") == 1)
    .select("calendar_year", "advertised_inventory")
    .collect()
)
px.scatter(
    x=previous_calendar_year_rows.get_column("calendar_year"),
    y=previous_calendar_year_rows.get_column("advertised_inventory"),
).update_layout(
    # Explicit titles so this figure can be told apart from its siblings.
    title="advertised_inventory where model_year_is_previous_calendar_year == 1",
    xaxis_title="calendar_year",
    yaxis_title="advertised_inventory",
)

In [138]:
# Rows flagged model_year_is_calendar_year == 1.
# Materialise the filtered rows once and reuse them for both axes, rather
# than executing the same lazy query (and CSV scan) twice.
same_calendar_year_rows = (
    mixed_years
    .filter(pl.col("model_year_is_calendar_year") == 1)
    .select("calendar_year", "advertised_inventory")
    .collect()
)
px.scatter(
    x=same_calendar_year_rows.get_column("calendar_year"),
    y=same_calendar_year_rows.get_column("advertised_inventory"),
).update_layout(
    # Explicit titles so this figure can be told apart from its siblings.
    title="advertised_inventory where model_year_is_calendar_year == 1",
    xaxis_title="calendar_year",
    yaxis_title="advertised_inventory",
)

In [139]:
# Rows flagged model_year_is_next_calendar_year == 1.
# Materialise the filtered rows once and reuse them for both axes, rather
# than executing the same lazy query (and CSV scan) twice.
next_calendar_year_rows = (
    mixed_years
    .filter(pl.col("model_year_is_next_calendar_year") == 1)
    .select("calendar_year", "advertised_inventory")
    .collect()
)
px.scatter(
    x=next_calendar_year_rows.get_column("calendar_year"),
    y=next_calendar_year_rows.get_column("advertised_inventory"),
).update_layout(
    # Explicit titles so this figure can be told apart from its siblings.
    title="advertised_inventory where model_year_is_next_calendar_year == 1",
    xaxis_title="calendar_year",
    yaxis_title="advertised_inventory",
)

In [176]:
# Kernel design references:
# https://www.cs.toronto.edu/~duvenaud/cookbook/
# https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py
#
# Amplitudes are written as explicit ConstantKernel factors. The shorthand
# `c * RBF(...)` wraps `c` in a ConstantKernel with scikit-learn's default
# bounds (1e-5, 1e5); 500**2 and 2500**2 both exceed that upper bound, so the
# optimiser could never start from (or return to) the intended amplitude.
# The bounds below are widened just enough to contain the initial values.
# The composite kernel keeps the same tree shape as the shorthand form, so
# hyperparameter names (k1__k1__..., etc.) are unchanged.

# Smooth long-term trend.
long_term_trend_kernel = ConstantKernel(
    constant_value=500.0**2, constant_value_bounds=(1e-2, 1e7)
) * RBF(length_scale=500.0, length_scale_bounds=(1e1, 1e4))

# Periodic component whose shape may drift (RBF envelope); the periodicity is
# constrained to stay close to 1.0.
seasonal_kernel = (
    ConstantKernel(constant_value=2500.0**2, constant_value_bounds=(1e-2, 1e8))
    * RBF(length_scale=15.0, length_scale_bounds=(1e-1, 1e3))
    * ExpSineSquared(
        length_scale=1e2,
        periodicity=1.0,
        length_scale_bounds=(1e-1, 1e3),
        periodicity_bounds=(0.9, 1.1),
    )
)

# Short-range correlated irregularities plus independent white noise.
# NOTE(review): the recorded fit output warns that a constant_value reached the
# default 1e5 upper bound; this amplitude still uses the default bounds --
# confirm whether it should be widened as well.
noise_kernel = 1.0**2 * RBF(length_scale=1.0, length_scale_bounds=(1e-5, 2.0)) + WhiteKernel(
    noise_level=0.09**2, noise_level_bounds=(1e-7, 1e-1)
)

kernel = long_term_trend_kernel + seasonal_kernel + noise_kernel

In [185]:
# Mean of the target over all rows of the file; it is subtracted from the
# target before fitting and added back to the predictions afterwards.
mean_inventory_query = mixed_years.select(pl.col("advertised_inventory").mean())
advertised_inventory_mean = mean_inventory_query.collect().item()

In [None]:
# Materialise the training columns once instead of running two separate
# collect() calls (each of which re-scans the CSV).
training_frame = mixed_years.select(
    "model_year", "calendar_year", "advertised_inventory"
).collect()

gaussian_process_obj = GaussianProcessRegressor(
    kernel=kernel,
    alpha=50,
    n_restarts_optimizer=50,
    # The target is centred manually with advertised_inventory_mean below.
    normalize_y=False,
    # The optimiser restarts begin from randomly drawn hyperparameters; fixing
    # the seed makes the fitted kernel reproducible across runs.
    random_state=0,
)
gaussian_process_obj.fit(
    # Kept as a DataFrame so the column layout matches the frame used at
    # predict time.
    X=training_frame.select("model_year", "calendar_year"),
    # 1-D, mean-centred target (single output).
    y=(
        training_frame.get_column("advertised_inventory") - advertised_inventory_mean
    ).to_numpy(),
)


The optimal value found for dimension 0 of parameter k1__k2__k2__periodicity is close to the specified lower bound 0.9. Decreasing the bound and calling fit again may find a better value.


The optimal value found for dimension 0 of parameter k2__k1__k1__constant_value is close to the specified upper bound 100000.0. Increasing the bound and calling fit again may find a better value.



In [178]:
# Display the fitted kernel (the estimator's kernel_ attribute) so the
# hyperparameter values after fit() can be inspected.
gaussian_process_obj.kernel_

93.8**2 * RBF(length_scale=21.9) + 42.8**2 * RBF(length_scale=0.271) * ExpSineSquared(length_scale=0.827, periodicity=0.9) + 316**2 * RBF(length_scale=0.389) + WhiteKernel(noise_level=1.82e-07)

In [216]:
# Evenly spaced grid of fractional calendar years in steps of 1/12, used as
# prediction inputs. (An earlier version used 29 points starting at
# 2022 + 5/12.)
grid_start = 2019 + 5/12
grid_point_count = 29 + 112
calendar_years = [
    grid_start + month_index / 12 for month_index in range(grid_point_count)
]

# Sanity check: last point of the grid.
max(calendar_years)

2031.0833333333335

In [217]:
# Prediction grid: every model year paired with every calendar-year point.
# The pairs are built as an explicit cartesian product. Repeating the two
# lists against each other (model_years * len(calendar_years) zipped with
# calendar_years * 4) only covers the full grid when the two lengths are
# coprime, and silently yields duplicated/missing pairs otherwise.
prediction_model_years = list(range(2022, 2026, 1))
X = pl.DataFrame(
    data={
        "model_year": [
            model_year
            for model_year in prediction_model_years
            for _ in calendar_years
        ],
        "calendar_year": [
            calendar_year
            for _ in prediction_model_years
            for calendar_year in calendar_years
        ],
    }
)

In [218]:
# The model was fitted on the mean-centred target, so predict first and then
# shift the result back by the training mean.
centered_predictions = gaussian_process_obj.predict(X=X)
gaussian_process_obj_preds = centered_predictions + advertised_inventory_mean

In [219]:
# Attach the predictions to the grid as an extra column.
prediction_column = pl.Series(
    "advertised_inventory_prediction", gaussian_process_obj_preds
)
X_with_preds = X.hstack([prediction_column])

In [220]:
# Predicted inventory against calendar year, coloured by model year.
px.scatter(
    x=X_with_preds.get_column("calendar_year"),
    y=X_with_preds.get_column("advertised_inventory_prediction"),
    color=X_with_preds.get_column("model_year"),
)

In [221]:
# Same predictions as a 3-D view: calendar year (x), model year (y),
# predicted inventory (z).
px.scatter_3d(
    x=X_with_preds.get_column("calendar_year"),
    y=X_with_preds.get_column("model_year"),
    z=X_with_preds.get_column("advertised_inventory_prediction"),
)