# Store Sales

In [144]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, InvertibleMapper, StaticCovariatesTransformer
from darts.dataprocessing.transformers.missing_values_filler import MissingValuesFiller
from darts.metrics import rmsle
from darts.models import LinearRegressionModel, LightGBMModel, XGBModel, CatBoostModel
from darts.models.filtering.moving_average_filter import MovingAverageFilter
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tqdm.notebook import tqdm_notebook
import itertools

plt.style.use("ggplot")
plt.rcParams["font.size"] = 15  
COLORS = list(sns.color_palette())

import warnings
warnings.filterwarnings("ignore")

# 1. Data Preprocessing

In [145]:
# helper function to print messages
def cprint(title, *args):
    print(
        "="*len(title), title, "="*len(title),
        *args,
        sep="\n",
    )

### Loading the datasets.

In [146]:
train = pd.read_csv("train.csv", parse_dates=["date"]).drop(columns='onpromotion')
test = pd.read_csv("test.csv", parse_dates=["date"]).drop(columns='onpromotion')

oil = pd.read_csv('oil.csv', parse_dates=["date"]).rename(columns={"price": "oil"})
store = pd.read_csv("stores.csv")
holiday = pd.read_csv("holidays_events.csv", parse_dates=["date"])
holiday_indonesia = pd.read_csv("Holiday Indonesian.csv", parse_dates=["date"])

In [147]:
test_range = pd.date_range(start='2017-08-16', end='2025-08-16')
stores = test['store_nbr'].unique()
families = test['family'].unique()

all_combinations = pd.DataFrame(
    list(itertools.product(test_range, stores, families)),
    columns=['date', 'store_nbr', 'family']
)
all_combinations['id'] = range(1, len(all_combinations)+1)
test = all_combinations[['id', 'date', 'store_nbr', 'family']]
test


Unnamed: 0,id,date,store_nbr,family
0,1,2017-08-16,1,AUTOMOTIVE
1,2,2017-08-16,1,BABY CARE
2,3,2017-08-16,1,BEAUTY
3,4,2017-08-16,1,BEVERAGES
4,5,2017-08-16,1,BOOKS
...,...,...,...,...
5208781,5208782,2025-08-16,9,POULTRY
5208782,5208783,2025-08-16,9,PREPARED FOODS
5208783,5208784,2025-08-16,9,PRODUCE
5208784,5208785,2025-08-16,9,SCHOOL AND OFFICE SUPPLIES


In [148]:
oil['date'] = pd.to_datetime(oil['date'], dayfirst=True)
oil = oil.sort_values('date', ascending=True).reset_index().drop(columns=['index'])
oil

Unnamed: 0,date,oil
0,2013-01-02,93.12
1,2013-01-03,92.92
2,2013-01-04,93.09
3,2013-01-07,93.19
4,2013-01-08,93.15
...,...,...
3307,2025-08-11,63.96
3308,2025-08-12,63.17
3309,2025-08-13,62.65
3310,2025-08-14,63.96


### Check: There are missing gaps in the training data.

In [149]:
num_family = train.family.nunique()
num_store = train.store_nbr.nunique()
num_ts = train.groupby(["store_nbr", "family"]).ngroups
train_start = train.date.min().date()
train_end = train.date.max().date()
num_train_date = train.date.nunique()
train_len = (train_end - train_start).days + 1
test_start = test.date.min().date()
test_end = test.date.max().date()
num_test_date = test.date.nunique()
test_len = (test_end - test_start).days + 1

In [150]:
cprint(
    "Basic information of data",
    f"Number of family types      : {num_family}",
    f"Number of stores            : {num_store}",
    f"Number of store-family pairs: {num_family * num_store}",
    f"Number of target series     : {num_ts}",
    "",
    f"Number of unique train dates: {num_train_date}",
    f"Train date range            : {train_len} days from {train_start} to {train_end}",
    f"Number of unique test dates : {num_test_date}",
    f"Test date range             : {test_len} days from {test_start} to {test_end}",
)

Basic information of data
Number of family types      : 33
Number of stores            : 54
Number of store-family pairs: 1782
Number of target series     : 1782

Number of unique train dates: 1684
Train date range            : 1688 days from 2013-01-01 to 2017-08-15
Number of unique test dates : 2923
Test date range             : 2923 days from 2017-08-16 to 2025-08-16


###  Check: The 4 missing dates fall on Christmas across the years.

In [151]:
missing_dates = pd.date_range(train_start, train_end).difference(train.date.unique())
missing_dates = missing_dates.strftime("%Y-%m-%d").tolist()

unique_dp_count = train.groupby(["store_nbr", "family"]).date.count().unique().tolist()

In [152]:
cprint(
    "Missing gaps in time series",
    f"List incl. unique counts of data points: {unique_dp_count}",
    f"Missing dates                          : {missing_dates}",
)

Missing gaps in time series
List incl. unique counts of data points: [1684]
Missing dates                          : ['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25']


In [153]:
# reindex training data
multi_idx = pd.MultiIndex.from_product(
    [pd.date_range(train_start, train_end), train.store_nbr.unique(), train.family.unique()],
    names=["date", "store_nbr", "family"],
)
train = train.set_index(["date", "store_nbr", "family"]).reindex(multi_idx).reset_index()

# fill missing values with 0s
train["sales"] = train["sales"].fillna(0.)
train.id = train.id.interpolate(method="linear") # interpolate linearly as a filler for the 'id'

### Check: There are no oil prices on weekends.

In [154]:
missing_oil_dates = pd.date_range(train_start, test_end).difference(oil.date)
num_missing_oil_dates = len(missing_oil_dates)
num_wknd_missing = (missing_oil_dates.weekday >= 5).sum()
total_num_wknd = (pd.date_range(train_start, test_end).weekday >= 5).sum()

In [155]:
cprint(
    "Missing oil dates",
    f"Number of missing oil dates: {num_missing_oil_dates}",
    f"Number of weekends missing : {num_wknd_missing}",
    f"Total number of weekends   : {total_num_wknd}",
)

Missing oil dates
Number of missing oil dates: 1299
Number of weekends missing : 1273
Total number of weekends   : 1317


In [156]:
# reindex oil data
oil = oil.merge(
    pd.DataFrame({"date": pd.date_range(train_start, test_end)}),
    on="date",
    how="outer",
).sort_values("date", ignore_index=True)

# fill missing values using linear interpolation
oil.oil = oil.oil.interpolate(method="linear", limit_direction="both")

### Preprocessing On Holiday, the corresponden one is National only

In [157]:
holiday['type'] = holiday['type'].replace({'Bridge': 'Additional', 'Event': 'Holiday'})
holiday = holiday[~holiday['type'].isin(['Transfer', 'Work Day'])]
holiday = holiday.reset_index(drop=True)
holiday = pd.get_dummies(holiday, columns=['type'])

In [158]:
holiday = holiday[~holiday['locale'].isin(['Local', 'Regional'])]
holiday = holiday.drop(['locale', 'locale_name', 'description', 'transferred'], axis=1)
holiday = holiday[holiday['date'] <= '2017-08-15']
holiday = holiday.reset_index(drop=True)

In [159]:
holiday_col = ['type_Additional', 'type_Holiday']
holiday[holiday_col] = holiday[holiday_col].astype(int)

In [160]:
holiday_indonesia
holiday_indonesia.columns = ['date', 'type']
holiday_indonesia = pd.get_dummies(holiday_indonesia, columns=['type'])

In [161]:
holiday_indonesia[holiday_col] = holiday_indonesia[holiday_col].astype(int)

In [162]:
holiday

Unnamed: 0,date,type_Additional,type_Holiday
0,2012-08-10,0,1
1,2012-10-09,0,1
2,2012-11-02,0,1
3,2012-11-03,0,1
4,2012-12-21,1,0
...,...,...,...
147,2017-05-01,0,1
148,2017-05-13,1,0
149,2017-05-14,0,1
150,2017-05-24,0,1


In [163]:
holiday_indonesia

Unnamed: 0,date,type_Additional,type_Holiday
0,2017-08-17,0,1
1,2017-09-01,0,1
2,2017-09-21,0,1
3,2017-12-01,0,1
4,2017-12-25,0,1
...,...,...,...
182,2025-06-27,0,1
183,2025-08-17,0,1
184,2025-09-05,0,1
185,2025-12-25,0,1


In [164]:
holiday = pd.concat([holiday, holiday_indonesia], ignore_index=True)
holiday = holiday.sort_values(['date','type_Additional', 'type_Holiday']).drop_duplicates(subset=['date'], keep='last')
holiday = holiday.sort_values('date').reset_index(drop=True)
holiday

Unnamed: 0,date,type_Additional,type_Holiday
0,2012-08-10,0,1
1,2012-10-09,0,1
2,2012-11-02,0,1
3,2012-11-03,0,1
4,2012-12-21,1,0
...,...,...,...
328,2025-06-27,0,1
329,2025-08-17,0,1
330,2025-09-05,0,1
331,2025-12-25,0,1


In [165]:
print('=======================')
print('List of Holiday Count')
print('=======================')
for col in holiday_col:
    print(f'{col} : {len(holiday[holiday[col] == 1])}')

List of Holiday Count
type_Additional : 90
type_Holiday : 243


---

### This is Ecuador's Dataset that doesn't relate to Indonesian Dataset, thats why we have to remove the states and cities

In [166]:
store = store.drop(['city', 'state'], axis=1)

---

# 3. Model Training

### Combining the datasets.

In [167]:
# combine all the datasets
data = pd.concat(
    [train, test], axis=0, ignore_index=True,
).merge(
    oil, on="date", how="left",
).merge(
    store, on="store_nbr", how="left",
).merge(
    holiday, on="date", how="left",
).sort_values(["date", "store_nbr", "family"], ignore_index=True)

# fill columns with 0s to indicate absence of holidays/events
data[holiday.columns] = data[holiday.columns].fillna(0)

# include date-related future covariates
data["day"] = data.date.dt.day
data["month"] = data.date.dt.month
data["year"] = data.date.dt.year
data["day_of_week"] = data.date.dt.dayofweek
data["day_of_year"] = data.date.dt.dayofyear
data["week_of_year"] = data.date.dt.isocalendar().week.astype(int)
data["date_index"] = data.date.factorize()[0] # sort by date above before computing this

# to impute days with zero sales using linear interpolation later
zero_sales_dates = missing_dates + [f"{j}-01-01" for j in range(2013, 2018)]
data.loc[(data.date.isin(zero_sales_dates))&(data.sales.eq(0)), ["sales"]] = np.nan

# add prefixes for clarity
data.store_nbr = data.store_nbr.apply(lambda x: (f"store_nbr_{x}"))
data.cluster = data.cluster.apply(lambda x: (f"cluster_{x}"))
data.type = data.type.apply(lambda x: (f"type_{x}"))

data

Unnamed: 0,date,store_nbr,family,id,sales,oil,type,cluster,type_Additional,type_Holiday,day,month,year,day_of_week,day_of_year,week_of_year,date_index
0,2013-01-01,store_nbr_1,AUTOMOTIVE,0.0,,93.12,type_D,cluster_13,0.0,1.0,1,1,2013,1,1,1,0
1,2013-01-01,store_nbr_1,BABY CARE,1.0,,93.12,type_D,cluster_13,0.0,1.0,1,1,2013,1,1,1,0
2,2013-01-01,store_nbr_1,BEAUTY,2.0,,93.12,type_D,cluster_13,0.0,1.0,1,1,2013,1,1,1,0
3,2013-01-01,store_nbr_1,BEVERAGES,3.0,,93.12,type_D,cluster_13,0.0,1.0,1,1,2013,1,1,1,0
4,2013-01-01,store_nbr_1,BOOKS,4.0,,93.12,type_D,cluster_13,0.0,1.0,1,1,2013,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8216797,2025-08-16,store_nbr_54,POULTRY,5208650.0,,62.80,type_C,cluster_3,0.0,0.0,16,8,2025,5,228,33,4610
8216798,2025-08-16,store_nbr_54,PREPARED FOODS,5208651.0,,62.80,type_C,cluster_3,0.0,0.0,16,8,2025,5,228,33,4610
8216799,2025-08-16,store_nbr_54,PRODUCE,5208652.0,,62.80,type_C,cluster_3,0.0,0.0,16,8,2025,5,228,33,4610
8216800,2025-08-16,store_nbr_54,SCHOOL AND OFFICE SUPPLIES,5208653.0,,62.80,type_C,cluster_3,0.0,0.0,16,8,2025,5,228,33,4610


In [168]:
# buat aturan agregasi
agg_rules = {
    "sales": "mean",   # rata-rata untuk sales
    "oil": "first",    # misalnya ambil nilai pertama (karena biasanya sama per tanggal)
    "day": "first",
    "month": "first",
    "year": "first",
    "day_of_week": "first",
    "day_of_year": "first",
    "week_of_year": "first",
    "date_index": "first",
    "type_Additional": "first",
    "type_Holiday": "first"
}

# lakukan groupby dan simpan ke data baru
data = (
    data.groupby(["date", "family", "type"], as_index=False)
        .agg(agg_rules)
)

data

Unnamed: 0,date,family,type,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday
0,2013-01-01,AUTOMOTIVE,type_A,,93.12,1,1,2013,1,1,1,0,0.0,1.0
1,2013-01-01,AUTOMOTIVE,type_B,,93.12,1,1,2013,1,1,1,0,0.0,1.0
2,2013-01-01,AUTOMOTIVE,type_C,,93.12,1,1,2013,1,1,1,0,0.0,1.0
3,2013-01-01,AUTOMOTIVE,type_D,,93.12,1,1,2013,1,1,1,0,0.0,1.0
4,2013-01-01,AUTOMOTIVE,type_E,,93.12,1,1,2013,1,1,1,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760810,2025-08-16,SEAFOOD,type_A,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760811,2025-08-16,SEAFOOD,type_B,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760812,2025-08-16,SEAFOOD,type_C,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760813,2025-08-16,SEAFOOD,type_D,,62.80,16,8,2025,5,228,33,4610,0.0,0.0


### Defining the transformation pipelines.

From here on, we begin exploring the functionalities of the Darts library for forecasting. We define the transformation pipelines here, which includes:
* `MissingValuesFiller` to fill missing values like the NaN values we temporarily set previously. The default behavior is linear interpolation.
* `StaticCovariatesTransformer` to perform encoding/scaling of our static covariates. Our static covariates are all categorical, so we specify to perform one-hot encoding using `OneHotEncoder` from sklearn.
* `InvertibleMapper` to define a custom log transformer for our target series. Log transforming may help to stabilize our target series by reducing the magnitude of large values.
* `Scaler` to perform scaling for all our target series and covariates. The default behavior is min-max scaling.

In [169]:
def get_pipeline(static_covs_transform=False, log_transform=False):
    lst = []
    
    # fill missing values
    filler = MissingValuesFiller(n_jobs=-1)
    lst.append(filler)
    
    # specify transformation for static covariates
    if static_covs_transform:
        static_covs_transformer = StaticCovariatesTransformer(
            transformer_cat=OneHotEncoder(),
            n_jobs=-1,
        )
        lst.append(static_covs_transformer)

    # perform log transformation on sales
    if log_transform:
        log_transformer = InvertibleMapper(
            fn=np.log1p,
            inverse_fn=np.expm1,
            n_jobs=-1,
        )
        lst.append(log_transformer)

    # rescale time series
    scaler = Scaler()
    lst.append(scaler)

    # chain all transformations
    pipeline = Pipeline(lst)
    return pipeline

### Extracting the target series together with the static covariates.

In [170]:
data

Unnamed: 0,date,family,type,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday
0,2013-01-01,AUTOMOTIVE,type_A,,93.12,1,1,2013,1,1,1,0,0.0,1.0
1,2013-01-01,AUTOMOTIVE,type_B,,93.12,1,1,2013,1,1,1,0,0.0,1.0
2,2013-01-01,AUTOMOTIVE,type_C,,93.12,1,1,2013,1,1,1,0,0.0,1.0
3,2013-01-01,AUTOMOTIVE,type_D,,93.12,1,1,2013,1,1,1,0,0.0,1.0
4,2013-01-01,AUTOMOTIVE,type_E,,93.12,1,1,2013,1,1,1,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760810,2025-08-16,SEAFOOD,type_A,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760811,2025-08-16,SEAFOOD,type_B,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760812,2025-08-16,SEAFOOD,type_C,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760813,2025-08-16,SEAFOOD,type_D,,62.80,16,8,2025,5,228,33,4610,0.0,0.0


In [None]:
data.drop(['store_nbr', 'cluster'], axis=1, inplace=True)

In [174]:
def get_target_series(static_cols, log_transform=True):    
    target_dict = {}
    pipe_dict = {}
    id_dict = {}

    # Aturan agregasi: gunakan rata-rata untuk sales
    agg_rules = {"sales": "mean"}
    for col in static_cols:
        if col not in ["type"]:   # cegah duplikat
            agg_rules[col] = "first"

    for fam in tqdm_notebook(data.family.unique(), desc="Extracting target series"):
        df_fam = data[(data.family.eq(fam)) & (data.date.le(train_end.strftime("%Y-%m-%d")))]
        
        # Agregasi per type dan date
        df_agg = df_fam.groupby(["type", "date"]).agg(agg_rules).reset_index()
        
        pipe = get_pipeline(True, log_transform=log_transform)
        
        target = TimeSeries.from_group_dataframe(
            df=df_agg,
            time_col="date",
            value_cols="sales",
            group_cols="type",
            static_cols=static_cols,
            freq="D",
            fill_missing_dates=True,
            fillna_value=0
        )

        target_id = [{"type": t.static_covariates.type[0], "family": fam} 
                     for t in target]
        id_dict[fam] = target_id
        
        target = pipe.fit_transform(target)
        target_dict[fam] = [t.astype(np.float32) for t in target]
        pipe_dict[fam] = pipe[2:]
        
    return target_dict, pipe_dict, id_dict


In [175]:
static_cols = []

target_dict, pipe_dict, id_dict = get_target_series(static_cols)

Extracting target series:   0%|          | 0/33 [00:00<?, ?it/s]

### Extracting the past and future covariates.

In [176]:
data

Unnamed: 0,date,family,type,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday
0,2013-01-01,AUTOMOTIVE,type_A,,93.12,1,1,2013,1,1,1,0,0.0,1.0
1,2013-01-01,AUTOMOTIVE,type_B,,93.12,1,1,2013,1,1,1,0,0.0,1.0
2,2013-01-01,AUTOMOTIVE,type_C,,93.12,1,1,2013,1,1,1,0,0.0,1.0
3,2013-01-01,AUTOMOTIVE,type_D,,93.12,1,1,2013,1,1,1,0,0.0,1.0
4,2013-01-01,AUTOMOTIVE,type_E,,93.12,1,1,2013,1,1,1,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760810,2025-08-16,SEAFOOD,type_A,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760811,2025-08-16,SEAFOOD,type_B,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760812,2025-08-16,SEAFOOD,type_C,,62.80,16,8,2025,5,228,33,4610,0.0,0.0
760813,2025-08-16,SEAFOOD,type_D,,62.80,16,8,2025,5,228,33,4610,0.0,0.0


In [177]:
def get_covariates(
    df_input, # <-- Menerima dataframe sebagai input
    future_cols,
    past_ma_cols=None,
    future_ma_cols=None,
    past_window_sizes=[7, 28],
    future_window_sizes=[7, 28],
):
    past_dict = {}
    future_dict = {}
    
    covs_pipe = get_pipeline() # Asumsi fungsi get_pipeline() sudah ada

    for fam in tqdm_notebook(df_input.family.unique(), desc="Extracting covariates"):
        df = df_input[df_input.family.eq(fam)]
        
        # Ekstrak past covariates
        past_covs = TimeSeries.from_group_dataframe(
            df=df[df.date.le(train_end.strftime("%Y-%m-%d"))],
            time_col="date",
            value_cols=[],  # kosong
            group_cols="type",
            freq='D',
            fill_missing_dates=True,
            fillna_value=0
        )
        past_covs = [p.with_static_covariates(None) for p in past_covs]
        # past_covs = covs_pipe.fit_transform(past_covs)
        
        # if past_ma_cols is not None:
        #     for size in past_window_sizes:
        #         ma_filter = MovingAverageFilter(window=size)
        #         old_names = [f"rolling_mean_{size}_{col}" for col in past_ma_cols]
        #         new_names = [f"{col}_ma{size}" for col in past_ma_cols]
        #         past_ma_covs = [
        #             ma_filter.filter(p[past_ma_cols]).with_columns_renamed(old_names, new_names) 
        #             for p in past_covs
        #         ]
        #         past_covs = [p.stack(p_ma) for p, p_ma in zip(past_covs, past_ma_covs)]

        if past_ma_cols is not None and len(past_ma_cols) > 0:
            past_covs = TimeSeries.from_group_dataframe(
                df=df[df.date.le(train_end.strftime("%Y-%m-%d"))],
                time_col="date",
                value_cols=past_ma_cols,
                group_cols="type",
                freq='D',
                fill_missing_dates=True,
                fillna_value=0 
            )
            past_covs = [p.with_static_covariates(None) for p in past_covs]
            past_covs = covs_pipe.fit_transform(past_covs)
        else:
            # Tidak ada past covariates → tetap buat TimeSeries kosong tapi tidak pakai pipeline
            past_covs = TimeSeries.from_group_dataframe(
                df=df[df.date.le(train_end.strftime("%Y-%m-%d"))],
                time_col="date",
                value_cols=[],  # buat dummy kolom
                group_cols="type",
                freq='D',
                fill_missing_dates=True,
                fillna_value=0 
            )
            past_covs = [p.with_static_covariates(None) for p in past_covs]
        
        past_dict[fam] = [p.astype(np.float32) for p in past_covs]
        if all([p.n_components == 0 for p in past_covs]):
            past_dict[fam] = None

        # Ekstrak future covariates
        future_covs = TimeSeries.from_group_dataframe(
            df=df,
            time_col="date",
            value_cols=future_cols,
            group_cols="type",
            freq='D',
            fill_missing_dates=True,
            fillna_value=0 
        )
        future_covs = [f.with_static_covariates(None) for f in future_covs]
        future_covs = covs_pipe.fit_transform(future_covs)

        if future_ma_cols is not None:
            if future_ma_cols is not None:
                for size in future_window_sizes:
                    ma_filter = MovingAverageFilter(window=size)
                    old_names = [f"rolling_mean_{size}_{col}" for col in future_ma_cols]
                    new_names = [f"{col}_ma{size}" for col in future_ma_cols]
                    future_ma_covs = [
                        ma_filter.filter(f[future_ma_cols]).with_columns_renamed(old_names, new_names) 
                        for f in future_covs
                    ]
                    future_covs = [f.stack(f_ma) for f, f_ma in zip(future_covs, future_ma_covs)]
        
        future_dict[fam] = [f.astype(np.float32) for f in future_covs]
            
    return past_dict, future_dict

future_cols = ["oil", "day", "month", "year", "day_of_week", "day_of_year", "week_of_year", "date_index", "type_Holiday", "type_Additional"]
past_ma_cols = None
future_ma_cols = ["oil"]

print("\nMengekstrak covariates dari data yang sudah diagregasi...")
agg_dict = {
    'sales': 'sum',
    'oil': 'mean',
    'day': 'first',
    'month': 'first',
    'year': 'first',
    'day_of_week': 'first',
    'day_of_year': 'first',
    'week_of_year': 'first',
    'date_index': 'first',
    'type_Additional': 'max',
    'type_Holiday': 'max',
    'type': 'first'
}

non_agg_cols = ["date", "family", "type"]
data = data.groupby(non_agg_cols, as_index=False).agg(agg_dict)


past_dict, future_dict = get_covariates(
    data,
    future_cols, 
    past_ma_cols, 
    future_ma_cols
)

print("\nEkstraksi covariates selesai tanpa error.")


Mengekstrak covariates dari data yang sudah diagregasi...


Extracting covariates:   0%|          | 0/33 [00:00<?, ?it/s]




Ekstraksi covariates selesai tanpa error.


### Setting up the model trainer.

In [178]:
cprint(
    "List of all covariates:",
    "Static covariates:",
    static_cols,
    "",
    # "Past covariates:",
    # past_dict["AUTOMOTIVE"][0].components.tolist(),
    "",
    "Future covariates:",
    future_dict["AUTOMOTIVE"][0].components.tolist(),
)

List of all covariates:
Static covariates:
[]


Future covariates:
['oil', 'day', 'month', 'year', 'day_of_week', 'day_of_year', 'week_of_year', 'date_index', 'type_Holiday', 'type_Additional', 'oil_ma7', 'oil_ma28']


In [179]:
mask = (data['date'] >= "2017-08-16") & (data['date'] <= "2025-08-16")
subset = data.loc[mask]

forecast_horizon = len(subset.drop_duplicates(subset='date', keep='first'))

In [180]:
TRAINER_CONFIG = {
    # the time series data previously extracted
    "target_dict": target_dict,
    "pipe_dict": pipe_dict,
    "id_dict": id_dict,
    "past_dict": past_dict,
    "future_dict": future_dict,
    
    # time series cross-validation using a rolling forecasting origin
    "forecast_horizon": forecast_horizon, # the length of the validation set
    "folds": 1, # the number of training sets (setting to 1 means the standard train-validation split)
    
    # the number of previous days to check for zero sales; if all are zero, generate zero forecasts
    "zero_fc_window": 21,
    
    # specify the covariates in a list to include in the model
    # set to None to not use any, and set to 'keep_all' to include everything
    "static_covs": "keep_all", # specify from ['city', 'state', 'cluster', 'type', 'store_nbr'], will extract all one-hot encoded columns
    "past_covs": "keep_all",
    "future_covs": "keep_all",
}

In [181]:
class Trainer:
    def __init__(
        self,
        target_dict,
        pipe_dict,
        id_dict,
        past_dict,
        future_dict,
        forecast_horizon,
        folds,
        zero_fc_window,
        static_covs=None,
        past_covs=None,
        future_covs=None,
    ):
        self.target_dict = target_dict.copy()
        self.pipe_dict = pipe_dict.copy()
        self.id_dict = id_dict.copy()
        self.past_dict = past_dict.copy()
        self.future_dict = future_dict.copy()
        self.forecast_horizon = forecast_horizon
        self.folds = folds
        self.zero_fc_window = zero_fc_window
        self.static_covs = static_covs
        self.past_covs = past_covs
        self.future_covs = future_covs
        self.last_fitted_models_ = {}
        
        # set up time series data
        self.setup()
    
    def setup(self):
        for fam in tqdm_notebook(self.target_dict.keys(), desc="Setting up"):
            # keep the specified static covariates
            if self.static_covs != "keep_all":
                if self.static_covs is not None:
                    target = self.target_dict[fam]
                    keep_static = [col for col in target[0].static_covariates.columns if col.startswith(tuple(self.static_covs))]
                    static_covs_df = [t.static_covariates[keep_static] for t in target]
                    self.target_dict[fam] = [t.with_static_covariates(d) for t, d in zip(target, static_covs_df)]
                else:
                    self.target_dict[fam] = [t.with_static_covariates(None) for t in target]
            
            # keep the specified past covariates
            if self.past_covs != "keep_all":
                if self.past_covs is not None:
                    self.past_dict[fam] = [p[self.past_covs] for p in self.past_dict[fam]]
                else:
                    self.past_dict[fam] = None
                
            # keep the specified future covariates
            if self.future_covs != "keep_all":
                if self.future_covs is not None:
                    self.future_dict[fam] = [p[self.future_covs] for p in self.future_dict[fam]]
                else:
                    self.future_dict[fam] = None
    
    def clip(self, array):
        return np.clip(array, a_min=0., a_max=None)
    
    def train_valid_split(self, target, length):
        train = [t[:-length] for t in target]
        valid_end_idx = -length + self.forecast_horizon
        if valid_end_idx >= 0:
            valid_end_idx = None
        valid = [t[-length:valid_end_idx] for t in target]
        
        return train, valid
    
    def get_models(self, model_names, model_configs):
        models = {
            "lr": LinearRegressionModel,
            "lgbm": LightGBMModel,
            "cat": CatBoostModel,
            "xgb": XGBModel,
        }
        assert isinstance(model_names, list) and isinstance(model_configs, list),\
        "Both the model names and model configurations must be specified in lists."
        assert all(name in models for name in model_names),\
        f"Model names '{model_names}' not recognized."
        assert len(model_names) == len(model_configs),\
        "The number of model names and the number of model configurations do not match."
        
        if "xgb" in model_names:
            xgb_idx = np.where(np.array(model_names)=="xgb")[0]
            for idx in xgb_idx:
                # change to histogram-based method for XGBoost to get faster training time
                model_configs[idx] = {"tree_method": "hist", **model_configs[idx]}
        
        return [models[name](**model_configs[j]) for j, name in enumerate(model_names)]
    
    def generate_forecasts(self, models, train, pipe, past_covs, future_covs, drop_before):
        if drop_before is not None:
            date = pd.Timestamp(drop_before) - pd.Timedelta(days=1)
            train = [t.drop_before(date) for t in train]
        inputs = {
            "series": train,
            "past_covariates": past_covs,
            "future_covariates": future_covs,
        }
        zero_pred = pd.DataFrame({
            "date": pd.date_range(train[0].end_time(), periods=self.forecast_horizon+1)[1:],
            "sales": np.zeros(self.forecast_horizon),
        })
        zero_pred = TimeSeries.from_dataframe(
            df=zero_pred,
            time_col="date",
            value_cols="sales",
        )
        
        pred_list = []
        ens_pred = [0 for _ in range(len(train))]
        
        for m in models:
            # fit training data to model
            m.fit(**inputs)

            # generate forecasts and apply inverse transformations
            pred = m.predict(n=self.forecast_horizon, **inputs)
            pred = pipe.inverse_transform(pred)

            # set zero forecasts for target series where the recent observations are 0s
            for j in range(len(train)):
                if train[j][-self.zero_fc_window:].values().sum() == 0:
                    pred[j] = zero_pred
            
            # clip negative forecasts to 0s
            pred = [p.map(self.clip) for p in pred]
            pred_list.append(pred)
            
            # ensemble averaging
            for j in range(len(ens_pred)):
                ens_pred[j] += pred[j] / len(models)

        return pred_list, ens_pred, models
    
    def metric(self, valid, pred):
        return float(np.mean(rmsle(valid, pred)))
    
    def validate(self, model_names, model_configs, drop_before=None):
        # helper value to align printed text below
        longest_len = len(max(self.target_dict.keys(), key=len))
        
        # store metric values for each model
        model_metrics_history = []
        ens_metric_history = []
        
        for fam in tqdm_notebook(self.target_dict, desc="Performing validation"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            past_covs = self.past_dict[fam]
            future_covs = self.future_dict[fam]
            
            # record average metric value over all folds
            model_metrics = []
            ens_metric = 0
            
            for j in range(self.folds):    
                # perform train-validation split and apply transformations
                length = (self.folds - j) * self.forecast_horizon
                train, valid = self.train_valid_split(target, length)
                valid = pipe.inverse_transform(valid)

                # generate forecasts and compute metric
                models = self.get_models(model_names, model_configs)
                pred_list, ens_pred = self.generate_forecasts(models, train, pipe, past_covs, future_covs, drop_before)
                metric_list = [self.metric(valid, pred) / self.folds for pred in pred_list]
                model_metrics.append(metric_list)
                if len(models) > 1:
                    ens_metric_fold = self.metric(valid, ens_pred) / self.folds
                    ens_metric += ens_metric_fold
                
            # store final metric value for each model
            model_metrics = np.sum(model_metrics, axis=0)
            model_metrics_history.append(model_metrics)
            ens_metric_history.append(ens_metric)
            
            # print metric value for each family
            print(
                fam,
                " " * (longest_len - len(fam)),
                " | ",
                " - ".join([f"{model}: {metric:.5f}" for model, metric in zip(model_names, model_metrics)]),
                f" - ens: {ens_metric:.5f}" if len(models) > 1 else "",
                sep="",
            )
            
        # print overall metric value
        cprint(
            "Average RMSLE | "
            + " - ".join([f"{model}: {metric:.5f}" 
                          for model, metric in zip(model_names, np.mean(model_metrics_history, axis=0))])
            + (f" - ens: {np.mean(ens_metric_history):.5f}" if len(models) > 1 else ""),
        )
        
    def ensemble_predict(self, model_names, model_configs, drop_before=None):
        forecasts = []
        self.last_fitted_models_ = {} 

        for fam in tqdm_notebook(self.target_dict.keys(), desc="Generating forecasts"):
            target = self.target_dict[fam]
            pipe = self.pipe_dict[fam]
            target_id = self.id_dict[fam]
            past_covs = self.past_dict.get(fam)
            future_covs = self.future_dict.get(fam)

            models = self.get_models(model_names, model_configs)
            
            # Unpacking sudah benar
            # Di dalam fungsi ensemble_predict, sebelum baris yang error

            pred_list, ens_pred, trained_models = self.generate_forecasts(models, target, pipe, past_covs, future_covs, drop_before)
            self.last_fitted_models_[fam] = trained_models
            
            # --- TAMBAHKAN KODE DEBUG INI ---
            if ens_pred: # Jika ens_pred tidak kosong
                print("Melihat semua atribut dan metode yang tersedia untuk objek pertama di ens_pred:")
                print(dir(ens_pred[0]))
            # -------------------------------

            # Baris asli Anda yang menyebabkan error
            ens_pred = [p.to_dataframe().reset_index().assign(**i) for p, i in zip(ens_pred, target_id)]
            # ------------------------------------

            ens_pred = pd.concat(ens_pred, axis=0)
            forecasts.append(ens_pred)

        forecasts = pd.concat(forecasts, axis=0)
        forecasts = forecasts.rename_axis(None, axis=1)

        if "date" in forecasts.columns:
            forecasts = forecasts.reset_index(drop=True)  # jangan bikin kolom date lagi
        else:
            forecasts = forecasts.reset_index(names="date")

        return forecasts

In [182]:
BASE_CONFIG = {
    "random_state": 0,
    "lags": 63,
    "lags_past_covariates": None,  
    "lags_future_covariates": list(range(1, 15)) if TRAINER_CONFIG["future_covs"] is not None else None,
    "output_chunk_length": 1,
}

max_future_lag = max(BASE_CONFIG["lags_future_covariates"])  # misal 14
some_fam = list(future_dict.keys())[0]
last_date_future_covs = future_dict[some_fam][0].end_time()
print("Tanggal terakhir future covariates:", last_date_future_covs)
last_date_target = target_dict[some_fam][0].end_time()
forecast_horizon_safe = (last_date_future_covs - last_date_target).days - max_future_lag
print("Forecast horizon aman:", forecast_horizon_safe)
TRAINER_CONFIG["forecast_horizon"] = forecast_horizon_safe

Tanggal terakhir future covariates: 2025-08-16 00:00:00
Forecast horizon aman: 2909


In [183]:
trainer = Trainer(**TRAINER_CONFIG)

Setting up:   0%|          | 0/33 [00:00<?, ?it/s]

In [184]:
GBDT_CONFIG_LAG7   = {**BASE_CONFIG, "lags": list(range(-7, -1))}
GBDT_CONFIG_LAG365 = {**BASE_CONFIG, "lags": list(range(-365, -1))}
GBDT_CONFIG_LAG730 = {**BASE_CONFIG, "lags": list(range(-730, -1))}

FINAL_MODELS = ["lgbm", "lgbm", "lgbm"]
FINAL_CONFIGS = [GBDT_CONFIG_LAG7, GBDT_CONFIG_LAG365, GBDT_CONFIG_LAG730]

# forecast_horizon = (pd.to_datetime("2025-08-16") - pd.to_datetime("2017-08-16")).days

In [185]:
# # generate forecasts for model trained on the entire data
# predictions1 = trainer.ensemble_predict(
#     model_names=FINAL_MODELS, 
#     model_configs=FINAL_CONFIGS,
# )

In [186]:
# =============================================================================
# KODE UNTUK DIJALANKAN DI NOTEBOOK ANDA SETELAH CLASS TRAINER DIPERBARUI
# =============================================================================
import pickle
import os

print("Menjalankan ensemble_predict untuk melatih dan menangkap model final...")
predictions1 = trainer.ensemble_predict(
    model_names=FINAL_MODELS, 
    model_configs=FINAL_CONFIGS,
)
print("Proses training dan prediksi selesai. Model final sudah tersimpan di dalam objek trainer.")

ARTIFACTS_DIR = "artifacts_nadi_pasar"
if not os.path.exists(ARTIFACTS_DIR):
    os.makedirs(ARTIFACTS_DIR)

with open(os.path.join(ARTIFACTS_DIR, 'trainer_object.pkl'), 'wb') as f:
    pickle.dump(trainer, f)
print(f"✅ Objek 'trainer' berhasil disimpan.")

with open(os.path.join(ARTIFACTS_DIR, 'final_models_dict.pkl'), 'wb') as f:
    pickle.dump(trainer.last_fitted_models_, f)
print(f"✅ Kamus model final ('final_models_dict.pkl') berhasil disimpan.")

# 3. (Jangan lupa) Simpan pemetaan produk ke family Anda
# product_to_family_map = {...}
# with open(os.path.join(ARTIFACTS_DIR, 'product_to_family_map.pkl'), 'wb') as f:
#     pickle.dump(product_to_family_map, f)
# print(f"Pemetaan produk ('product_to_family_map.pkl') berhasil disimpan.")

Menjalankan ensemble_predict untuk melatih dan menangkap model final...


Generating forecasts:   0%|          | 0/33 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010593 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20958
[LightGBM] [Info] Number of data points in the train set: 8405, number of used features: 179
[LightGBM] [Info] Start training from score 0.633610
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112248
[LightGBM] [Info] Number of data points in the train set: 6615, number of used features: 537
[LightGBM] [Info] Start training from score 0.658206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205301
[LightGBM] [Info] Number of data points in the train set: 4790, number of used features: 902
[LightGBM] [Info] Star

In [187]:
data

Unnamed: 0,date,family,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday,type
0,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_A
1,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_B
2,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_C
3,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_D
4,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760810,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_A
760811,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_B
760812,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_C
760813,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_D


In [188]:
predictions1

Unnamed: 0,date,sales,type,family
0,2017-08-16,10.085077,type_A,AUTOMOTIVE
1,2017-08-17,10.246009,type_A,AUTOMOTIVE
2,2017-08-18,11.022766,type_A,AUTOMOTIVE
3,2017-08-19,16.914660,type_A,AUTOMOTIVE
4,2017-08-20,18.052866,type_A,AUTOMOTIVE
...,...,...,...,...
479980,2025-07-29,4.170287,type_E,SEAFOOD
479981,2025-07-30,3.947154,type_E,SEAFOOD
479982,2025-07-31,4.257635,type_E,SEAFOOD
479983,2025-08-01,4.655717,type_E,SEAFOOD


In [189]:
data = data.merge(
    predictions1[['date','family','sales']],
    on=['date','family'],
    how='left',
    suffixes=('', '_pred')
)

# Hanya replace sales jika ada prediksi, biarkan yang lain tetap
data['sales'] = data['sales_pred'].combine_first(data['sales'])
data = data.drop(columns=['sales_pred'])
data

Unnamed: 0,date,family,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday,type
0,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_A
1,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_B
2,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_C
3,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_D
4,2013-01-01,AUTOMOTIVE,0.0,93.12,1,1,2013,1,1,1,0,0.0,1.0,type_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680750,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_A
2680751,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_B
2680752,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_C
2680753,2025-08-16,SEAFOOD,0.0,62.80,16,8,2025,5,228,33,4610,0.0,0.0,type_D


In [190]:
start_date = '2025-08-03'
end_date = '2025-08-16'

data = data[~((data['date'] >= start_date) & (data['date'] <= end_date))]

In [191]:
data

Unnamed: 0,date,family,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday,type
0,2013-01-01,AUTOMOTIVE,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0,type_A
1,2013-01-01,AUTOMOTIVE,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0,type_B
2,2013-01-01,AUTOMOTIVE,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0,type_C
3,2013-01-01,AUTOMOTIVE,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0,type_D
4,2013-01-01,AUTOMOTIVE,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0,type_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2678440,2025-08-02,SEAFOOD,52.562431,66.983333,2,8,2025,5,214,31,4596,0.0,0.0,type_E
2678441,2025-08-02,SEAFOOD,15.001511,66.983333,2,8,2025,5,214,31,4596,0.0,0.0,type_E
2678442,2025-08-02,SEAFOOD,7.053787,66.983333,2,8,2025,5,214,31,4596,0.0,0.0,type_E
2678443,2025-08-02,SEAFOOD,29.794912,66.983333,2,8,2025,5,214,31,4596,0.0,0.0,type_E


In [193]:
# buat aturan agregasi
agg_rules = {
    "sales": "mean",   # rata-rata untuk sales
    "oil": "first",    # misalnya ambil nilai pertama (karena biasanya sama per tanggal)
    "day": "first",
    "month": "first",
    "year": "first",
    "day_of_week": "first",
    "day_of_year": "first",
    "week_of_year": "first",
    "date_index": "first",
    "type_Additional": "first",
    "type_Holiday": "first"
}

# lakukan groupby dan simpan ke data baru
data = (
    data.groupby(["date", "family", "type"], as_index=False)
        .agg(agg_rules)
)

data

Unnamed: 0,date,family,type,sales,oil,day,month,year,day_of_week,day_of_year,week_of_year,date_index,type_Additional,type_Holiday
0,2013-01-01,AUTOMOTIVE,type_A,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0
1,2013-01-01,AUTOMOTIVE,type_B,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0
2,2013-01-01,AUTOMOTIVE,type_C,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0
3,2013-01-01,AUTOMOTIVE,type_D,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0
4,2013-01-01,AUTOMOTIVE,type_E,0.000000,93.120000,1,1,2013,1,1,1,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758500,2025-08-02,SEAFOOD,type_A,22.116532,66.983333,2,8,2025,5,214,31,4596,0.0,0.0
758501,2025-08-02,SEAFOOD,type_B,22.116532,66.983333,2,8,2025,5,214,31,4596,0.0,0.0
758502,2025-08-02,SEAFOOD,type_C,22.116532,66.983333,2,8,2025,5,214,31,4596,0.0,0.0
758503,2025-08-02,SEAFOOD,type_D,22.116532,66.983333,2,8,2025,5,214,31,4596,0.0,0.0


In [196]:
data.to_csv('dataset_fix.csv', index=False)