# Preprocessing of time series features

# Polars

In [None]:
import numpy as np
import xarray as xr


In [None]:


import polars as pl
import pandas as pd
from sktime.datatypes import get_examples


In [None]:
import polars as pl
import pandas as pd
from sktime.datatypes import get_examples

# Read the reduced table with polars, order it by (id, date), convert it to
# pandas, cast the listed columns to the pandas "category" dtype and index
# the frame by (id, date).
df = (
    pl.read_parquet("../data/reduced_table.parquet")
    .sort(["id", "date"])
    .to_pandas()
    # Optional row filters, currently disabled:
    # [lambda df: df["store_id"]=="CA_1"]
    # [lambda df: df["cat_id"]=="FOODS"]
    .astype(
        {
            column: "category"
            for column in (
                "id",
                "item_id",
                "dept_id",
                "cat_id",
                "store_id",
                "state_id",
                "event",
            )
        }
    )
    .set_index(["id", "date"])
)

In [None]:
# Display the prepared frame (indexed by id and date).
df

In [None]:
# Convert the (id, date)-indexed pandas frame to xarray.
# NOTE(review): for a DataFrame, pandas documents `to_xarray()` as returning an
# xarray Dataset (one variable per column), so despite its name `da` is
# presumably a Dataset rather than a DataArray — confirm.
da = df.to_xarray()

In [None]:
# 7-step rolling mean of `sales` along `date` for the first series only.
# The series is selected *before* rolling: the window runs along `date` only,
# so the result for id=0 is unchanged, but the rolling mean is no longer
# computed for every id just to discard all but one.
da["sales"].isel(id=0).rolling(date=7).mean()

In [None]:
# Round-trip the dataset through its plain-dict form.
# `to_dict()` emits a nested description with the top-level keys
# coords / attrs / dims / data_vars; that layout has to be read back with
# `Dataset.from_dict`. Passing it to the `Dataset` constructor would treat
# those four keys as data variables instead.
xr.Dataset.from_dict(da.to_dict())

In [None]:
# Attempt to expand `da` into a dataset along the "date" dimension.
# NOTE(review): `to_unstacked_dataset` is documented on xarray.DataArray; if
# `da` (the result of `df.to_xarray()`) is a Dataset, this call is presumably
# not available — confirm the intended object before relying on this cell.
da.to_unstacked_dataset("date", level=-2)

In [None]:
# Toy stack/unstack round-trip: a 2x3 array ("a") and its y=0 slice ("b") are
# packed into one array along a new "z" dimension (keeping "x" as the sample
# dimension) and then unpacked again.
arr = xr.DataArray(
    data=np.arange(6).reshape(2, 3),
    dims=("x", "y"),
    coords={"x": ["a", "b"], "y": [0, 1, 2]},
)
data = xr.Dataset(data_vars={"a": arr, "b": arr.isel(y=0)})
data
stacked = data.to_stacked_array(new_dim="z", sample_dims=["x"])
stacked.indexes["z"]
roundtripped = stacked.to_unstacked_dataset(dim="z")

In [None]:
# Display the stacked array built from `data` along the "z" dimension.
stacked

In [None]:
# Attempt to stack the plain array along a new "z" dimension.
# NOTE(review): `to_stacked_array` is documented on xarray.Dataset, while `arr`
# is built with `xr.DataArray(...)`; this call presumably needs a Dataset
# (e.g. `data`) — confirm the intent.
arr.to_stacked_array("z", ["x"])

Casting all categorical columns to pandas categories is important to be able to define the sktime mtype and scitype of the dataset, which the next cell checks.

In [None]:
from sktime.datatypes import mtype, scitype, check_is_scitype, check_is_mtype

# Report the machine type and scientific type sktime infers for the first
# 10000 rows, with "Hierarchical" as the scitype under consideration.
print("mtype of the dataset: %s" % (mtype(df.head(10000), as_scitype="Hierarchical"),))
print("scitype of the dataset: %s" % (scitype(df.head(10000), candidate_scitypes="Hierarchical"),))
# scitype(df["cat_id"])
# mtype(df)


In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
# define what are endogenous and exogenous ts variables
# Split the frame into the endogenous series (y) and the exogenous columns (X).
target = ["sales"]
y = df.loc[:, target]

# Exogenous columns come in two named groups; X holds the lagged group
# followed by the future group.
exog_lag_features = ["sell_price", "snap"]
future_features = ["event"]
exog_features = [*exog_lag_features, *future_features]
X = df.loc[:, exog_features]



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding of the `event` column, materialised as a dense array.
# NOTE(review): `event` is cast to a pandas category when `df` is built;
# CountVectorizer presumably expects plain strings without missing values —
# confirm the column has no NaNs before relying on this.
CountVectorizer().fit_transform(df["event"]).toarray()

# 2. define features

In [None]:
from sklearn.preprocessing import OneHotEncoder, SplineTransformer

from sklego.preprocessing import RepeatingBasisFunction

from sktime.transformations.compose import TransformerPipeline
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.time_since import TimeSince
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.pipeline import sklearn_to_sktime

In [None]:
# Day of year, extracted from the daily index and expanded with 12 repeating
# basis functions over the input range 1..365.
day_of_year_feature = TransformerPipeline(
    steps=[
        DateTimeFeatures(
            ts_freq="D",
            manual_selection=["day_of_year"],
            keep_original_columns=False,
        ),
        sklearn_to_sktime(
            RepeatingBasisFunction(n_periods=12, input_range=(1, 365))
        ),
    ]
)

# Day of week, extracted from the daily index and one-hot encoded (dense).
day_of_week_feature = TransformerPipeline(
    steps=[
        DateTimeFeatures(
            ts_freq="D",
            manual_selection=["day_of_week"],
            keep_original_columns=False,
        ),
        sklearn_to_sktime(OneHotEncoder(sparse_output=False)),
    ]
)

# Trend: TimeSince output passed through a cubic spline basis with 4 knots
# and constant extrapolation.
trend_feature = TransformerPipeline(
    steps=[
        TimeSince(),
        sklearn_to_sktime(
            SplineTransformer(n_knots=4, degree=3, extrapolation="constant")
        ),
    ]
)

In [None]:
# Fit and apply the trend pipeline on the first 10000 rows of X and y.
trend_feature.fit_transform(X.iloc[:10000], y.iloc[:10000])

In [None]:
from sktime.transformations.compose import YtoX
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.impute import Imputer

# Window specification handed to WindowSummarizer: which lags to take and
# which windows to average / take the standard deviation over.
autoregressive_lags_base = dict(
    # lag=[1],
    lag=[1, 7],  # previous day and -7 days
    mean=[[1, 7], [1, 3]],  # long and short averages
    std=[[1, 7], [1, 3]],
)


# Autoregressive feature generator: YtoX -> window summaries -> imputation.
autoregressive_feature = TransformerPipeline(
    steps=[
        YtoX(),
        WindowSummarizer(lag_feature=autoregressive_lags_base),
        # LogTransformer(offset=1),
        Imputer(),
    ]
)


In [None]:
from itertools import product


In [None]:
def _lagged_rolling_mean(lag, window):
    """Return a polars expression: per-id rolling mean of lagged sales.

    `sales` is shifted by `lag`, averaged over `window` rows with the native
    `rolling_mean` (instead of a per-window Python lambda), and remaining
    nulls are filled with the median of that same feature. The whole
    expression is evaluated within each `id` group.
    """
    rolled = pl.col("sales").shift(lag).rolling_mean(window_size=window)
    # Assumption: the median of the feature itself is the intended fill
    # value; a wildcard (`pl.all()`) cannot be used inside a single named
    # expression. A backward fill is the obvious alternative.
    return rolled.fill_null(rolled.median()).over("id")


(
    # `df` is a pandas frame indexed by (id, date) at this point, so it has to
    # be converted back to polars, with `id` restored as a regular column for
    # `.over("id")`, before polars expressions can be applied.
    pl.from_pandas(df.head(10000).reset_index()[["id", "sales"]])
    .select(
        **{
            f"{i}_{w}": _lagged_rolling_mean(i, w)
            for i, w in product([1], [3, 7, 14])
        }
    )
)

In [None]:
# Fit and apply the autoregressive pipeline on the full X and y.
autoregressive_feature.fit_transform(X, y)

In [None]:

# ColumnSelect and FeatureUnion are used below but are not imported by any of
# the earlier cells, so they are imported here next to their first use.
from sktime.transformations.compose import FeatureUnion
from sktime.transformations.series.subset import ColumnSelect

# Lags applied to the `future_features` columns; the list contains negative,
# zero and positive offsets.
future_lags = {
    "lag": [-2, -1, 0, 1]
}
# Select the future-known columns, lag them, and impute the resulting gaps.
indicator_autoregressive_feature_generator = TransformerPipeline([
    ColumnSelect(future_features),
    WindowSummarizer(lag_feature=future_lags,
                     target_cols=future_features),
    Imputer()
])

# define feature union for all the Xs of the TS regression
# NOTE(review): calendar_feature_generator, endog_autoregressive_feature_generator
# and exog_lagged_autoregressive_feature_generator are not defined in the
# cells shown here — confirm they exist (or map them to the pipelines defined
# earlier) before running this cell.
feature_generator = FeatureUnion([
    ('calendar', calendar_feature_generator),
    ('endog_autoregressive', endog_autoregressive_feature_generator),
    ('exog_lagged', exog_lagged_autoregressive_feature_generator),
    ('exog_futures', indicator_autoregressive_feature_generator)
])



# 3. time series regression