In [2]:
import polars as pl
import pickle 
from data_functions import create_calendric_features, add_lag_features, add_trend_feature
import typing as t

In [4]:
# Input locations, relative to the notebook's working directory.
mapping_path = '../../data/feature_mapping_train.pkl'
features_path = '../../data/processed/train_data_features.feather'
target_path = '../../data/train_data_target.feather'

# Feather files are read with Polars' Arrow IPC reader (`read_ipc`).
train_features = pl.read_ipc(features_path)
train_target = pl.read_ipc(target_path)

# Feature mapping is stored as a pickle.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(mapping_path, 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

# Wrap the unpickled object (presumably a dict or list -- confirm) in a Polars DataFrame.
feature_mapping = pl.DataFrame(mapping)

In [4]:
# Build the engineered feature table from `train_features` in one pipeline.
# Step order matters: the lags of `feature_0038` are added before that column
# is dropped and before the `not_for_sale` rows are filtered out, so they are
# computed on the unfiltered rows; the trend feature is added last.
# NOTE(review): 'lag_target_1' does not follow the `feature_0038_lag_N` naming
# of the lag columns in the output below -- confirm it really exists in the
# input file, otherwise this drop may raise on recent Polars versions.
df = (
    create_calendric_features(train_features, 'date')
    # One-hot encode the listed columns (presumably created by the helper above).
    .to_dummies(
        columns=["day_of_week", "month", "quarter", "week_of_year", "year", "is_weekend"]
    )
    # Lags 1..7 of feature_0038, grouped by SKU and frequency.
    .pipe(
        add_lag_features,
        lags=range(1, 8),
        group_by_cols=["skuID", "frequency"],
        value_col="feature_0038",
        date_col="date",
    )
    .drop('lag_target_1', 'feature_0038')
    # Keep only rows whose `not_for_sale` flag is not 1.
    .filter(pl.col("not_for_sale") != 1)
    .pipe(add_trend_feature, date_col="date")
)

In [5]:
# Dimensions (rows, columns) of the engineered feature table.
df.shape

(46881677, 143)

In [6]:
# Mean of the one-step lag of feature_0038 per product, sorted ascending.
# Rows with a null in either selected column are excluded from the average.
# The rows are not pre-sorted before aggregating: a mean does not depend on
# row order (beyond floating-point rounding), and the result is sorted by
# `mean` afterwards anyway, so sorting ~47M rows first was wasted work.
(
    df.lazy()
    .select(["feature_0038_lag_1", "productID"])
    .drop_nulls()
    .group_by("productID")
    .agg(pl.col("feature_0038_lag_1").mean().alias("mean"))
    .sort("mean")
    .collect()
)

productID,mean
i64,f64
78778,0.036712
79731,0.053758
79427,0.055439
79382,0.056939
79617,0.05807
…,…
81055,20.714691
81023,25.647474
80720,29.551907
81054,48.023351


In [7]:
# Draw 10 rows with a fixed seed so the same sample is shown on every run.
df.sample(n=10, seed=42)

frequency,idx,bdID,base_date,date,dateID,skuID,productID,storeID,companyID,missing_value,not_for_sale,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,feature_0010,feature_0011,feature_0012,feature_0013,feature_0014,feature_0015,feature_0016,feature_0017,feature_0018,feature_0019,feature_0020,feature_0021,feature_0022,feature_0023,feature_0024,…,week_of_year_42,week_of_year_43,week_of_year_44,week_of_year_45,week_of_year_46,week_of_year_47,week_of_year_48,week_of_year_49,week_of_year_5,week_of_year_50,week_of_year_51,week_of_year_52,week_of_year_53,week_of_year_6,week_of_year_7,week_of_year_8,week_of_year_9,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,is_weekend_false,is_weekend_true,quarter_1,quarter_2,quarter_3,quarter_4,feature_0038_lag_1,feature_0038_lag_2,feature_0038_lag_3,feature_0038_lag_4,feature_0038_lag_5,feature_0038_lag_6,feature_0038_lag_7,trend
str,i64,i64,date,date,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,f64,f64,f64,f64,f64,f64,f64,i64
"""daily""",19143763,247356610,2015-07-07,2015-07-07,5667,263378,79955,1330,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1621
"""daily""",7922152,226323137,2013-08-16,2013-08-16,4977,268005,78484,1332,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,931
"""daily""",55483932,211565706,2012-04-19,2012-04-19,4493,267734,81262,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0.0,0.0,1.0,1.0,3.0,1.0,0.0,447
"""daily""",53837436,254661111,2016-03-02,2016-03-02,5906,280769,79052,1336,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0.0,2.0,3.0,0.0,2.0,1.0,1.0,1860
"""daily""",30370574,249411164,2015-09-12,2015-09-12,5734,275102,79483,1334,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1688
"""daily""",52680817,202084033,2011-06-13,2011-06-13,4182,268451,78930,1332,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136
"""daily""",51443341,228966804,2013-11-11,2013-11-11,5064,259042,78668,1329,22,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1.0,2.0,0.0,0.0,1.0,1.0,0.0,1018
"""daily""",16199108,242541649,2015-01-30,2015-01-30,5509,265837,79365,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1463
"""daily""",52909709,256811276,2016-05-12,2016-05-12,5977,266144,79672,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1931
"""daily""",43947197,228908256,2013-11-09,2013-11-09,5062,261474,81100,1329,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1016


In [8]:
# Persist the engineered features as a Feather (Arrow IPC) file and read them
# back to check the round-trip.
# NOTE(review): this writes under '../data/' while the inputs were read from
# '../../data/' -- confirm the differing directory is intentional.
processed_features_path = "../data/processed/train_data_features.feather"

# Written with write_ipc's default settings (no compression argument passed).
df.write_ipc(processed_features_path)

# memory_map=False loads the data into RAM instead of mapping the file.
# With the default (memory-mapped) read, `df` would stay backed by the file
# just written, and re-running this cell would then write to a file that is
# still mapped by `df` itself, which Polars warns will fail.
df = pl.read_ipc(processed_features_path, memory_map=False)

In [9]:
# Re-draw the seeded 10-row sample from the frame reloaded from disk, to
# eyeball that the data survived the write/read round-trip.
df.sample(n=10, seed=42)

frequency,idx,bdID,base_date,date,dateID,skuID,productID,storeID,companyID,missing_value,not_for_sale,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,feature_0010,feature_0011,feature_0012,feature_0013,feature_0014,feature_0015,feature_0016,feature_0017,feature_0018,feature_0019,feature_0020,feature_0021,feature_0022,feature_0023,feature_0024,…,week_of_year_42,week_of_year_43,week_of_year_44,week_of_year_45,week_of_year_46,week_of_year_47,week_of_year_48,week_of_year_49,week_of_year_5,week_of_year_50,week_of_year_51,week_of_year_52,week_of_year_53,week_of_year_6,week_of_year_7,week_of_year_8,week_of_year_9,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,is_weekend_false,is_weekend_true,quarter_1,quarter_2,quarter_3,quarter_4,feature_0038_lag_1,feature_0038_lag_2,feature_0038_lag_3,feature_0038_lag_4,feature_0038_lag_5,feature_0038_lag_6,feature_0038_lag_7,trend
str,i64,i64,date,date,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,f64,f64,f64,f64,f64,f64,f64,i64
"""daily""",19143763,247356610,2015-07-07,2015-07-07,5667,263378,79955,1330,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1621
"""daily""",7922152,226323137,2013-08-16,2013-08-16,4977,268005,78484,1332,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,931
"""daily""",55483932,211565706,2012-04-19,2012-04-19,4493,267734,81262,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0.0,0.0,1.0,1.0,3.0,1.0,0.0,447
"""daily""",53837436,254661111,2016-03-02,2016-03-02,5906,280769,79052,1336,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0.0,2.0,3.0,0.0,2.0,1.0,1.0,1860
"""daily""",30370574,249411164,2015-09-12,2015-09-12,5734,275102,79483,1334,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1688
"""daily""",52680817,202084033,2011-06-13,2011-06-13,4182,268451,78930,1332,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136
"""daily""",51443341,228966804,2013-11-11,2013-11-11,5064,259042,78668,1329,22,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1.0,2.0,0.0,0.0,1.0,1.0,0.0,1018
"""daily""",16199108,242541649,2015-01-30,2015-01-30,5509,265837,79365,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1463
"""daily""",52909709,256811276,2016-05-12,2016-05-12,5977,266144,79672,1331,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1931
"""daily""",43947197,228908256,2013-11-09,2013-11-09,5062,261474,81100,1329,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1016
