In [1]:
import os
import numpy as np
np.bool = np.bool_
import pandas as pd
import pickle
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.common import ListDataset
from utils import highlight_print, reduce_memory

In [2]:
def prepare_datasets(save_dir='../dataset/else'):
    """Build and pickle GluonTS train/valid/test datasets for each aggregation level.

    For every level (1..12) this reads
    ``../data/preprocessed/agg_df_level_{level_idx}.csv``, joins it with
    ``../data/preprocessed/calendar_df.csv`` on the day column ``d``, and
    builds three ``ListDataset`` objects, one entry per ``id`` group:

    * ``train`` - every series without its last 56 rows,
    * ``valid`` - every series without its last 28 rows,
    * ``test``  - every series in full.

    The result is written as a dict ``{'train', 'valid', 'test'}`` to
    ``{save_dir}/dataset_level_{level_idx}.pkl``.

    Args:
        save_dir: Output directory for the pickles. Created if missing.

    Returns:
        None. The function is run for its side effect of writing the pickles.
    """
    # The sibling preparation functions create their output directory; without
    # this the first `open(..., 'wb')` fails on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)

    levels = [
        [],                        # Level 1: Total
        ['state_id'],              # Level 2: State
        ['store_id'],              # Level 3: Store
        ['cat_id'],                # Level 4: Category
        ['dept_id'],               # Level 5: Department
        ['state_id', 'cat_id'],    # Level 6: State-Category
        ['state_id', 'dept_id'],   # Level 7: State-Department
        ['store_id', 'cat_id'],    # Level 8: Store-Category
        ['store_id', 'dept_id'],   # Level 9: Store-Department
        ['item_id'],               # Level 10: Item
        ['item_id', 'state_id'],   # Level 11: Item-State
        ['item_id', 'store_id']    # Level 12: Individual
    ]

    # Columns stacked (one row per feature) into FEAT_DYNAMIC_REAL.
    dynamic_real_cols = [
        'sales_mean', 'sales_std', 'sales_max', 'sales_min', 'sales_diff_mean',
        'sales_lag1_mean', 'sales_lag7_mean', 'sales_lag28_mean',
        'sales_rolling7_mean', 'sales_rolling28_mean', 'sales_rolling7_diff_mean', 'sales_rolling28_diff_mean',
        'release_mean', 'out_of_stock_mean',
        'sell_price_mean', 'sell_price_std', 'sell_price_max', 'sell_price_min', 'sell_price_diff_mean',
        'sell_price_lag_mean', 'sell_price_rolling_mean', 'sell_price_rolling_diff_mean',
        'sell_price_in_store_mean',
        "year_delta", "quarter_sin", "quarter_cos", "month_sin", "month_cos",
        "day_sin", "day_cos", "weekday_sin", "weekday_cos",
        'event_count'
    ]
    # Columns stacked (one row per feature) into FEAT_DYNAMIC_CAT.
    dynamic_cat_cols = [
        'snap_CA', 'snap_TX', 'snap_WI',
        'event_name_1_enc', 'event_name_2_enc',
        'event_type_1_enc', 'event_type_2_enc'
    ]

    # Calendar date assigned to day label 'd_1'.
    start_date = pd.to_datetime('2011-01-29')

    def day_labels_to_dates(day_labels):
        """Map labels of the form 'd_<k>' to start_date + (k - 1) days."""
        offsets = day_labels.apply(lambda x: int(x.split('_')[1]) - 1)
        return start_date + pd.to_timedelta(offsets, unit='D')

    def build_split(group_df, group_encoder, holdout):
        """Return a ListDataset with the last `holdout` rows of each group removed."""
        # A stop of None keeps the whole array (the `[:]` case used for test).
        stop = -holdout if holdout else None
        return ListDataset(
            [
                {
                    FieldName.ITEM_ID: group_id,
                    FieldName.TARGET: group["sales_sum"].values[:stop],
                    # Every split is sliced from the first row of the group, so
                    # every split starts on start_date. The validation split
                    # used to be stamped 2016-03-28, which misdated each of its
                    # observations.
                    # NOTE(review): assumes each group's rows are chronological
                    # and begin at d_1 - confirm against the preprocessing step.
                    FieldName.START: pd.Period(start_date, freq="1D"),
                    FieldName.FEAT_STATIC_CAT: [group_encoder[group_id]],
                    FieldName.FEAT_DYNAMIC_REAL: group[dynamic_real_cols].values[:stop].T,
                    FieldName.FEAT_DYNAMIC_CAT: group[dynamic_cat_cols].values[:stop].T,
                }
                for group_id, group in group_df.groupby("id")
            ],
            freq="D",
        )

    # The calendar does not depend on the level: load and convert it once.
    calendar_df = reduce_memory(pd.read_csv('../data/preprocessed/calendar_df.csv'))
    calendar_df['d'] = day_labels_to_dates(calendar_df['d'])

    for level_idx, level in enumerate(levels, start=1):
        highlight_print(f"Preparing dataset for level {level_idx}")

        # load data
        agg_df = reduce_memory(pd.read_csv(f'../data/preprocessed/agg_df_level_{level_idx}.csv'))
        agg_df['d'] = day_labels_to_dates(agg_df['d'])

        # create id (group): constant for the total level, otherwise the level
        # column(s), joined with '_' when there are two of them
        if len(level) == 0:
            agg_df.insert(1, 'id', 'total')
        elif len(level) == 1:
            agg_df.insert(1, 'id', agg_df[level[0]])
            del agg_df[level[0]]
        elif len(level) > 1:
            agg_df.insert(1, 'id', agg_df[level[0]] + '_' + agg_df[level[1]])
            del agg_df[level[0]]
            del agg_df[level[1]]

        # id (group) encoding: integer code in order of first appearance
        group_encoder = {group: group_idx for group_idx, group in enumerate(agg_df['id'].unique())}

        # merge calendar features onto every row by date
        group_df = agg_df.merge(calendar_df, on="d", how="left")

        datasets = {
            'train': build_split(group_df, group_encoder, 56),
            'valid': build_split(group_df, group_encoder, 28),
            'test': build_split(group_df, group_encoder, 0),
        }

        # save
        with open(os.path.join(save_dir, f'dataset_level_{level_idx}.pkl'), 'wb') as f:
            pickle.dump(datasets, f)

        # release the per-level frames before loading the next level
        del agg_df, group_df, datasets

# Build and pickle the datasets for all levels using the default save_dir.
prepare_datasets()

[93mPreparing dataset for level 1[0m
[93mPreparing dataset for level 2[0m
[93mPreparing dataset for level 3[0m
[93mPreparing dataset for level 4[0m
[93mPreparing dataset for level 5[0m
[93mPreparing dataset for level 6[0m
[93mPreparing dataset for level 7[0m
[93mPreparing dataset for level 8[0m
[93mPreparing dataset for level 9[0m
[93mPreparing dataset for level 10[0m
[93mPreparing dataset for level 11[0m
[93mPreparing dataset for level 12[0m


In [3]:
def prepare_tft_datasets(save_dir='../dataset/tft'):
    """Build and pickle per-feature GluonTS datasets for each aggregation level.

    Same inputs and splits as the stacked-feature variant, but every feature
    column is stored under its own key in each entry instead of being stacked
    into FEAT_DYNAMIC_REAL / FEAT_DYNAMIC_CAT:

    * real-valued columns are stored with shape ``(1, T)``,
    * SNAP / encoded-event columns are stored as 1-D arrays of length ``T``,
    * the group's integer code is stored under ``'id'``.

    Splits: ``train`` drops the last 56 rows of each group, ``valid`` drops the
    last 28, ``test`` keeps everything. The dict ``{'train', 'valid', 'test'}``
    is pickled to ``{save_dir}/dataset_level_{level_idx}.pkl``.

    Args:
        save_dir: Output directory for the pickles. Created if missing.

    Returns:
        None. The function is run for its side effect of writing the pickles.
    """
    os.makedirs(save_dir, exist_ok=True)

    levels = [
        [],                        # Level 1: Total
        ['state_id'],              # Level 2: State
        ['store_id'],              # Level 3: Store
        ['cat_id'],                # Level 4: Category
        ['dept_id'],               # Level 5: Department
        ['state_id', 'cat_id'],    # Level 6: State-Category
        ['state_id', 'dept_id'],   # Level 7: State-Department
        ['store_id', 'cat_id'],    # Level 8: Store-Category
        ['store_id', 'dept_id'],   # Level 9: Store-Department
        ['item_id'],               # Level 10: Item
        ['item_id', 'state_id'],   # Level 11: Item-State
        ['item_id', 'store_id']    # Level 12: Individual
    ]

    # Columns stored as (1, T) arrays, in the order they appear in each entry.
    real_cols = [
        "sales_mean", "sales_std", "sales_max", "sales_min", "sales_diff_mean",
        "sales_lag1_mean", "sales_lag7_mean", "sales_lag28_mean",
        "sales_rolling7_mean", "sales_rolling28_mean",
        "sales_rolling7_diff_mean", "sales_rolling28_diff_mean",
        "release_mean", "out_of_stock_mean",
        "sell_price_mean", "sell_price_std", "sell_price_max", "sell_price_min",
        "sell_price_diff_mean", "sell_price_lag_mean", "sell_price_rolling_mean",
        "sell_price_rolling_diff_mean", "sell_price_in_store_mean",
        "year_delta", "quarter_sin", "quarter_cos", "month_sin", "month_cos",
        "day_sin", "day_cos", "weekday_sin", "weekday_cos",
        "event_count",
    ]
    # Columns stored as flat length-T arrays.
    cat_cols = [
        'snap_CA', 'snap_TX', 'snap_WI',
        'event_name_1_enc', 'event_name_2_enc',
        'event_type_1_enc', 'event_type_2_enc',
    ]

    # Calendar date assigned to day label 'd_1'.
    start_date = pd.to_datetime('2011-01-29')

    def day_labels_to_dates(day_labels):
        """Map labels of the form 'd_<k>' to start_date + (k - 1) days."""
        offsets = day_labels.apply(lambda x: int(x.split('_')[1]) - 1)
        return start_date + pd.to_timedelta(offsets, unit='D')

    def build_entry(group_id, group, group_encoder, stop):
        """Assemble one dataset entry from a single group's rows, cut at `stop`."""
        entry = {
            FieldName.ITEM_ID: group_id,
            FieldName.TARGET: group["sales_sum"].values[:stop],
            # Every split is sliced from the first row of the group, so every
            # split starts on start_date. The validation split used to be
            # stamped 2016-03-28, which misdated each of its observations.
            # NOTE(review): assumes each group's rows are chronological and
            # begin at d_1 - confirm against the preprocessing step.
            FieldName.START: pd.Period(start_date, freq="1D"),
            'id': [group_encoder[group_id]],
        }
        # One shared slice for all features keeps every field the same length.
        # The hand-written validation block sliced "weekday_sin" with [:28]
        # instead of [:-28], giving that one feature a different length.
        for col in real_cols:
            entry[col] = group[col].values[:stop].reshape(1, -1)
        for col in cat_cols:
            entry[col] = group[col].values[:stop]
        return entry

    def build_split(group_df, group_encoder, holdout):
        """Return a ListDataset with the last `holdout` rows of each group removed."""
        # A stop of None keeps the whole array (the `[:]` case used for test).
        stop = -holdout if holdout else None
        return ListDataset(
            [
                build_entry(group_id, group, group_encoder, stop)
                for group_id, group in group_df.groupby("id")
            ],
            freq="D",
        )

    # The calendar does not depend on the level: load and convert it once.
    calendar_df = reduce_memory(pd.read_csv('../data/preprocessed/calendar_df.csv'))
    calendar_df['d'] = day_labels_to_dates(calendar_df['d'])

    for level_idx, level in enumerate(levels, start=1):
        highlight_print(f"Preparing dataset for level {level_idx}")

        # load data
        agg_df = reduce_memory(pd.read_csv(f'../data/preprocessed/agg_df_level_{level_idx}.csv'))
        agg_df['d'] = day_labels_to_dates(agg_df['d'])

        # create id (group): constant for the total level, otherwise the level
        # column(s), joined with '_' when there are two of them
        if len(level) == 0:
            agg_df.insert(1, 'id', 'total')
        elif len(level) == 1:
            agg_df.insert(1, 'id', agg_df[level[0]])
            del agg_df[level[0]]
        elif len(level) > 1:
            agg_df.insert(1, 'id', agg_df[level[0]] + '_' + agg_df[level[1]])
            del agg_df[level[0]]
            del agg_df[level[1]]

        # id (group) encoding: integer code in order of first appearance
        group_encoder = {group: group_idx for group_idx, group in enumerate(agg_df['id'].unique())}

        # merge calendar features onto every row by date
        group_df = agg_df.merge(calendar_df, on="d", how="left")

        datasets = {
            'train': build_split(group_df, group_encoder, 56),
            'valid': build_split(group_df, group_encoder, 28),
            'test': build_split(group_df, group_encoder, 0),
        }

        # save
        with open(os.path.join(save_dir, f'dataset_level_{level_idx}.pkl'), 'wb') as f:
            pickle.dump(datasets, f)

        # release the per-level frames before loading the next level
        del agg_df, group_df, datasets

# Build and pickle the per-feature datasets for all levels using the default save_dir.
prepare_tft_datasets()

[93mPreparing dataset for level 1[0m
[93mPreparing dataset for level 2[0m
[93mPreparing dataset for level 3[0m
[93mPreparing dataset for level 4[0m
[93mPreparing dataset for level 5[0m
[93mPreparing dataset for level 6[0m
[93mPreparing dataset for level 7[0m
[93mPreparing dataset for level 8[0m
[93mPreparing dataset for level 9[0m
[93mPreparing dataset for level 10[0m
[93mPreparing dataset for level 11[0m
[93mPreparing dataset for level 12[0m


In [None]:
def prepare_lgb_datasets(save_dir='../dataset/lgb'):
    """Build and pickle tabular train/valid/test frames for each aggregation level.

    For every level (1..12) this reads
    ``../data/preprocessed/agg_df_level_{level_idx}.csv``, joins it with
    ``../data/preprocessed/calendar_df.csv`` on the day column ``d``, adds
    date-part, lag and rolling features of ``sales_sum`` per ``id`` group, and
    splits by date:

    * ``train`` - rows dated before 2016-03-28,
    * ``valid`` - rows in the 28 days starting 2016-03-28,
    * ``test``  - all rows.

    Each split is stored as ``{'data', 'target', 'groups'}`` where ``target``
    is the ``sales_sum`` column and ``groups`` is the ``id`` column. ``data``
    still contains both of those columns, so consumers must drop them before
    fitting. The dict of splits is pickled to
    ``{save_dir}/dataset_level_{level_idx}.pkl``.

    Args:
        save_dir: Output directory for the pickles. Created if missing.

    Returns:
        None. The function is run for its side effect of writing the pickles.
    """
    os.makedirs(save_dir, exist_ok=True)

    levels = [
        [],                        # Level 1: Total
        ['state_id'],              # Level 2: State
        ['store_id'],              # Level 3: Store
        ['cat_id'],                # Level 4: Category
        ['dept_id'],               # Level 5: Department
        ['state_id', 'cat_id'],    # Level 6: State-Category
        ['state_id', 'dept_id'],   # Level 7: State-Department
        ['store_id', 'cat_id'],    # Level 8: Store-Category
        ['store_id', 'dept_id'],   # Level 9: Store-Department
        ['item_id'],               # Level 10: Item
        ['item_id', 'state_id'],   # Level 11: Item-State
        ['item_id', 'store_id']    # Level 12: Individual
    ]

    # Calendar date assigned to day label 'd_1', and first day of validation.
    start_date = pd.to_datetime('2011-01-29')
    valid_start_date = pd.to_datetime('2016-03-28')
    valid_end_date = valid_start_date + pd.Timedelta(days=28)  # exclusive

    def day_labels_to_dates(day_labels):
        """Map labels of the form 'd_<k>' to start_date + (k - 1) days."""
        offsets = day_labels.apply(lambda x: int(x.split('_')[1]) - 1)
        return start_date + pd.to_timedelta(offsets, unit='D')

    def finalize(frame, drop_cols):
        """Drop helper columns and replace missing values (e.g. leading lags) with 0."""
        return frame.drop(columns=drop_cols).fillna(0)

    # The calendar does not depend on the level: load and convert it once.
    calendar_df = reduce_memory(pd.read_csv('../data/preprocessed/calendar_df.csv'))
    calendar_df['d'] = day_labels_to_dates(calendar_df['d'])

    for level_idx, level in enumerate(levels, start=1):
        highlight_print(f"Preparing dataset for level {level_idx}")

        # load data
        agg_df = reduce_memory(pd.read_csv(f'../data/preprocessed/agg_df_level_{level_idx}.csv'))
        agg_df['d'] = day_labels_to_dates(agg_df['d'])

        # create id (group): constant for the total level, otherwise the level
        # column(s), joined with '_' when there are two of them
        if len(level) == 0:
            agg_df.insert(1, 'id', 'total')
        elif len(level) == 1:
            agg_df.insert(1, 'id', agg_df[level[0]])
            del agg_df[level[0]]
        else:
            agg_df.insert(1, 'id', agg_df[level[0]] + '_' + agg_df[level[1]])
            del agg_df[level[0]]
            del agg_df[level[1]]

        # merge calendar features onto every row by date
        df = agg_df.merge(calendar_df, on="d", how="left")

        # create time-based features
        df['year'] = df['d'].dt.year
        df['month'] = df['d'].dt.month
        # isocalendar() returns pandas' nullable UInt32 extension dtype, which
        # survives fillna(0) unchanged. Cast to a plain NumPy integer so the
        # pickled frame contains only NumPy dtypes.
        df['week'] = df['d'].dt.isocalendar().week.astype('int32')
        df['day'] = df['d'].dt.day
        df['dayofweek'] = df['d'].dt.dayofweek
        df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

        # Lag and rolling features are computed within each id.
        # NOTE(review): shift/rolling follow row order - assumes each group's
        # rows are already chronological; confirm against preprocessing.
        sales_by_id = df.groupby('id')['sales_sum']
        for lag in [1, 7, 14, 28]:
            df[f'sales_lag_{lag}'] = sales_by_id.shift(lag)

        # Rolling statistics over the preceding `window` rows; shift(1) keeps
        # the current row's own value out of its window.
        for window in [7, 14, 28]:
            df[f'sales_rolling_mean_{window}'] = sales_by_id.transform(
                lambda x: x.shift(1).rolling(window=window).mean())
            df[f'sales_rolling_std_{window}'] = sales_by_id.transform(
                lambda x: x.shift(1).rolling(window=window).std())

        # split datasets by date
        in_train = df['d'] < valid_start_date
        in_valid = (df['d'] >= valid_start_date) & (df['d'] < valid_end_date)

        # drop the date column and every column whose name contains 'enc'
        drop_cols = ['d'] + [col for col in df.columns if 'enc' in col]
        train_df = finalize(df[in_train], drop_cols)
        valid_df = finalize(df[in_valid], drop_cols)
        test_df = finalize(df, drop_cols)

        # datasets
        datasets = {
            split_name: {
                'data': split_df,
                'target': split_df['sales_sum'],
                'groups': split_df['id'],
            }
            for split_name, split_df in (
                ('train', train_df), ('valid', valid_df), ('test', test_df)
            )
        }

        # save
        with open(os.path.join(save_dir, f'dataset_level_{level_idx}.pkl'), 'wb') as f:
            pickle.dump(datasets, f)

        # release the per-level frames before loading the next level
        del agg_df, df, train_df, valid_df, test_df, datasets

# Build and pickle the tabular datasets for all levels using the default save_dir.
prepare_lgb_datasets()