# Preprocessing of time series features

# Polars

In [1]:

import polars as pl
import pandas as pd
from sktime.datatypes import get_examples

df = (
    pl.read_parquet("../data/reduced_table.parquet")
    .sort(["id", "date"])
)

In [2]:
import polars as pl
import pandas as pd
from sktime.datatypes import get_examples

df = (
    pl.read_parquet("../data/reduced_table.parquet")
    .sort(["id", "date"])
    .to_pandas()
    .assign(
        date=lambda df: pd.PeriodIndex(df["date"].dt.to_period("1D")),
        id=lambda df: df["id"].astype("category"),
        item_id=lambda df: df["item_id"].astype("category"),
        dept_id=lambda df: df["dept_id"].astype("category"),
        cat_id=lambda df: df["cat_id"].astype("category"),
        store_id=lambda df: df["store_id"].astype("category"),
        state_id=lambda df: df["state_id"].astype("category"),
        event=lambda df: df["event"].astype("category"),
        # snap=lambda df: df["snap"].astype("category")
        
    )
    .set_index(["state_id", "store_id", "cat_id", "dept_id", "item_id", "id", "date"])
)

casting all categories to pandas categories is important to be able to define 

In [3]:
from sktime.datatypes import mtype, scitype, check_is_scitype, check_is_mtype

print(
    "mtype of the dataset: %s" %
    mtype(df.iloc[:10000], as_scitype="Hierarchical")
)
print(
    "scitype of the dataset: %s" % 
    scitype(df.iloc[:10000], candidate_scitypes="Hierarchical")
)
# scitype(df["cat_id"])
# mtype(df)


mtype of the dataset: pd_multiindex_hier
scitype of the dataset: Hierarchical


In [4]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,sales,event,snap,sell_price
state_id,store_id,cat_id,dept_id,item_id,id,date,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-29,3,,0,2.0
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-30,0,,0,2.0
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-31,0,,0,2.0
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-02-01,1,,1,2.0
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-02-02,4,,1,2.0


In [5]:
# define what are endogenous and exogenous ts variables
target = ["sales"]
y = df[target]


exog_lag_features = [
    "sell_price", "snap"
]
future_features = [
    "event"
]
exog_features = exog_lag_features + future_features
X = df[exog_features]



In [6]:
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer().fit_transform(df["event"]).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 2. define features

In [7]:
from sklearn.preprocessing import OneHotEncoder, SplineTransformer

from sklego.preprocessing import RepeatingBasisFunction

from sktime.transformations.compose import TransformerPipeline
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.time_since import TimeSince
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.pipeline import sklearn_to_sktime

In [8]:
day_of_year_feature = TransformerPipeline([
        DateTimeFeatures(ts_freq='D', manual_selection=['day_of_year'], 
                                              keep_original_columns=False),
        sklearn_to_sktime(
            RepeatingBasisFunction(n_periods=12, input_range=(1, 365))),
    ])

day_of_week_feature = TransformerPipeline([
        DateTimeFeatures(ts_freq='D', manual_selection=['day_of_week'], 
                                              keep_original_columns=False),
        sklearn_to_sktime(OneHotEncoder(sparse_output=False)),
    ])

trend_feature = TransformerPipeline([
        TimeSince(),
        sklearn_to_sktime(SplineTransformer(n_knots=4, degree=3, extrapolation='constant'))
    ])

In [9]:
trend_feature.fit_transform(X.iloc[:10000], y.iloc[:10000])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,0,1,2,3,4,5
state_id,store_id,cat_id,dept_id,item_id,id,date,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-29,0.166667,0.666667,1.666667e-01,0.000000e+00,0.000000,0.000000
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-30,0.165895,0.666664,1.674411e-01,6.163215e-10,0.000000,0.000000
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-01-31,0.165125,0.666657,1.682178e-01,4.930572e-09,0.000000,0.000000
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-02-01,0.164358,0.666645,1.689970e-01,1.664068e-08,0.000000,0.000000
CA,CA_1,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_CA_1_evaluation,2011-02-02,0.163593,0.666629,1.697785e-01,3.944458e-08,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
TX,TX_2,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_TX_2_evaluation,2011-11-15,0.000000,0.000000,1.133315e-05,1.878738e-01,0.665035,0.147080
TX,TX_2,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_TX_2_evaluation,2011-11-16,0.000000,0.000000,4.781171e-06,1.824270e-01,0.665744,0.151824
TX,TX_2,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_TX_2_evaluation,2011-11-17,0.000000,0.000000,1.416643e-06,1.770747e-01,0.666254,0.156669
TX,TX_2,FOODS,FOODS_1,FOODS_1_001,FOODS_1_001_TX_2_evaluation,2011-11-18,0.000000,0.000000,1.770804e-07,1.718202e-01,0.666563,0.161617


In [10]:
from sktime.transformations.compose import YtoX
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.impute import Imputer

# define autoregressive feature parameters
autoregressive_lags_base = {
    # "lag": [1],
    "lag": [1, 7], # previous day and -7 days
    "mean": [[1, 7], [1, 3]], # long and short averages
    "std": [[1, 7], [1, 3]]
}


# define autoregressive feature generator
autoregressive_feature = TransformerPipeline([
    YtoX(),
    WindowSummarizer(lag_feature=autoregressive_lags_base),
    # LogTransformer(offset=1),
    Imputer()
])


In [12]:
from itertools import product


In [23]:
(
    df
    # .lazy()
    .head(10000)
    .select(
        # pl.col("sales").shift(1).rolling_apply(lambda s: s.mean(), window_size=3).fill_null(strategy="backward").over("id")
        **{
            f"{i}_{w}": pl.col("sales").shift(i).rolling_apply(lambda s: s.mean(), window_size=w).fill_null(pl.all().median()).over("id")
            for i, w in product([1], [3, 7, 14])
        }
    )# .collect()
    
)

DuplicateError: column with name '1_3' has more than one occurrences

In [12]:
autoregressive_feature.fit_transform(X, y)

MemoryError: Unable to allocate 54.5 GiB for an array with shape (58567326300,) and data type int8

In [None]:

future_lags = {
    "lag": [-2, -1, 0, 1]
}
indicator_autoregressive_feature_generator = TransformerPipeline([
    ColumnSelect(future_features),
    WindowSummarizer(lag_feature=future_lags,
                     target_cols=future_features),
    Imputer()
])

# define feature union for all the Xs of the TS regression
feature_generator = FeatureUnion([
    ('calendar', calendar_feature_generator),
    ('endog_autoregressive', endog_autoregressive_feature_generator),
    ('exog_lagged', exog_lagged_autoregressive_feature_generator),
    ('exog_futures', indicator_autoregressive_feature_generator)
])



# 3. time series regression