In [6]:
import os
import pandas as pd
import pickle
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.common import ListDataset
from datetime import timedelta
from utils import highlight_print, reduce_memory

AttributeError: module 'mxnet' has no attribute 'nd'

In [2]:
def prepare_datasets(save_dir='../dataset'):
    levels = [
        [],                        # Level 1: Total
        ['state_id'],              # Level 2: State
        ['store_id'],              # Level 3: Store
        ['cat_id'],                # Level 4: Category
        ['dept_id'],               # Level 5: Department
        ['state_id', 'cat_id'],    # Level 6: State-Category
        ['state_id', 'dept_id'],   # Level 7: State-Department
        ['store_id', 'cat_id'],    # Level 8: Store-Category
        ['store_id', 'dept_id'],   # Level 9: Store-Department
        ['item_id'],               # Level 10: Item
        ['item_id', 'state_id'],   # Level 11: Item-State
        ['item_id', 'store_id']    # Level 12: Individual
    ]

    for level_idx, level in enumerate(levels, start=1):
        highlight_print(f"Preparing dataset for level {level_idx}")
        datasets = {'train': {}, 'test': {}}

        # load data
        agg_df = pd.read_csv(f'../data/preprocessed/agg_df_level_{level_idx}.csv')
        calendar_df = pd.read_csv('../data/preprocessed/calendar_df.csv')

        # reduce memory
        agg_df = reduce_memory(agg_df)
        calendar_df = reduce_memory(calendar_df)

        # convert to datetime
        start_date = pd.to_datetime('2011-01-29')
        agg_df['d'] = agg_df['d'].apply(lambda x: int(x.split('_')[1]) - 1)
        agg_df['d'] = start_date + pd.to_timedelta(agg_df['d'], unit='D')
        calendar_df['d'] = calendar_df['d'].apply(lambda x: int(x.split('_')[1]) - 1)
        calendar_df['d'] = start_date + pd.to_timedelta(calendar_df['d'], unit='D')
        
        # create id (group)
        if len(level) == 0:
            agg_df.insert(1, 'id', 'total')
        elif len(level) == 1: 
            agg_df.insert(1, 'id', agg_df[level[0]])
            del agg_df[level[0]]
        elif len(level) > 1:
            agg_df.insert(1, 'id', agg_df[level[0]] + '_' + agg_df[level[1]])
            del agg_df[level[0]]
            del agg_df[level[1]]

        # id (group) encoding
        groups = agg_df['id'].unique()
        group_encoder = {group: group_idx for group_idx, group in enumerate(groups)}
 
        # merge
        group_df = agg_df.merge(calendar_df, on="d", how="left")

        # dataset
        train_dataset = ListDataset(
            [
                {
                    FieldName.ITEM_ID: id,
                    FieldName.TARGET: group["sales_sum"].values[:-28],
                    FieldName.START: pd.Period(min(group["d"]), freq="1D"),
                    FieldName.FEAT_STATIC_CAT: [group_encoder[id]],
                    FieldName.FEAT_DYNAMIC_REAL: group[[
                        'sales_mean', 'sales_std', 'sales_max', 'sales_min', 'sales_diff_mean', 
                        'sales_lag1_mean', 'sales_lag7_mean', 'sales_lag28_mean', 
                        'sales_rolling7_mean', 'sales_rolling28_mean', 'sales_rolling7_diff_mean', 'sales_rolling28_diff_mean', 
                        'release_mean', 'out_of_stock_mean', 
                        'sell_price_mean', 'sell_price_std', 'sell_price_max', 'sell_price_min', 'sell_price_diff_mean',
                        'sell_price_lag_mean', 'sell_price_rolling_mean', 'sell_price_rolling_diff_mean',
                        'sell_price_in_store_mean',
                        "year_delta", "quarter_sin", "quarter_cos", "month_sin", "month_cos",  
                        "day_sin", "day_cos", "weekday_sin", "weekday_cos",
                        'event_count'
                    ]].values[:-28].T,
                    FieldName.FEAT_DYNAMIC_CAT: group[[
                        'snap_CA', 'snap_TX', 'snap_WI', 
                        'event_name_1_enc', 'event_name_2_enc', 
                        'event_type_1_enc', 'event_type_2_enc'
                    ]].values[:-28].T,
                }
                for id, group in group_df.groupby("id")
            ],
            freq="D",
        )
        test_dataset = ListDataset(
            [
                {
                    FieldName.ITEM_ID: id,
                    FieldName.TARGET: group["sales_sum"].values[:], 
                    FieldName.START: pd.Period(min(group["d"]), freq="1D"),
                    FieldName.FEAT_STATIC_CAT: [group_encoder[id]],
                    FieldName.FEAT_DYNAMIC_REAL: group[[
                        'sales_mean', 'sales_std', 'sales_max', 'sales_min', 'sales_diff_mean', 
                        'sales_lag1_mean', 'sales_lag7_mean', 'sales_lag28_mean', 
                        'sales_rolling7_mean', 'sales_rolling28_mean', 'sales_rolling7_diff_mean', 'sales_rolling28_diff_mean', 
                        'release_mean', 'out_of_stock_mean', 
                        'sell_price_mean', 'sell_price_std', 'sell_price_max', 'sell_price_min', 'sell_price_diff_mean',
                        'sell_price_lag_mean', 'sell_price_rolling_mean', 'sell_price_rolling_diff_mean',
                        'sell_price_in_store_mean',
                        "year_delta", "quarter_sin", "quarter_cos", "month_sin", "month_cos",  
                        "day_sin", "day_cos", "weekday_sin", "weekday_cos",
                        'event_count'
                    ]].values[:].T,  
                    FieldName.FEAT_DYNAMIC_CAT: group[[
                        'snap_CA', 'snap_TX', 'snap_WI', 
                        'event_name_1_enc', 'event_name_2_enc', 
                        'event_type_1_enc', 'event_type_2_enc'
                    ]].values[:].T,  
                }
                for id, group in group_df.groupby("id")
            ],
            freq="D",
        )   
        
        datasets['train'] = train_dataset
        datasets['test'] = test_dataset

        # save
        with open(os.path.join(save_dir, f'dataset_level_{level_idx}.pkl'), 'wb') as f:
            pickle.dump(datasets, f)

        # reduce memory
        del agg_df
        del calendar_df
        del group_df
        del train_dataset
        del test_dataset
        del datasets

prepare_datasets()

NameError: name 'highlight_print' is not defined

In [4]:
def prepare_datasets(save_dir='../dataset/tft'):
    os.makedirs(save_dir, exist_ok=True)

    levels = [
        [],                        # Level 1: Total
        ['state_id'],              # Level 2: State
        ['store_id'],              # Level 3: Store
        ['cat_id'],                # Level 4: Category
        ['dept_id'],               # Level 5: Department
        ['state_id', 'cat_id'],    # Level 6: State-Category
        ['state_id', 'dept_id'],   # Level 7: State-Department
        ['store_id', 'cat_id'],    # Level 8: Store-Category
        ['store_id', 'dept_id'],   # Level 9: Store-Department
        ['item_id'],               # Level 10: Item
        ['item_id', 'state_id'],   # Level 11: Item-State
        ['item_id', 'store_id']    # Level 12: Individual
    ]

    for level_idx, level in enumerate(levels, start=1):
        highlight_print(f"Preparing dataset for level {level_idx}")
        datasets = {'train': {}, 'test': {}}

        # load data
        agg_df = pd.read_csv(f'../data/preprocessed/agg_df_level_{level_idx}.csv')
        calendar_df = pd.read_csv('../data/preprocessed/calendar_df.csv')

        # reduce memory
        agg_df = reduce_memory(agg_df)
        calendar_df = reduce_memory(calendar_df)

        # convert to datetime
        start_date = pd.to_datetime('2011-01-29')
        agg_df['d'] = agg_df['d'].apply(lambda x: int(x.split('_')[1]) - 1)
        agg_df['d'] = start_date + pd.to_timedelta(agg_df['d'], unit='D')
        calendar_df['d'] = calendar_df['d'].apply(lambda x: int(x.split('_')[1]) - 1)
        calendar_df['d'] = start_date + pd.to_timedelta(calendar_df['d'], unit='D')
        
        # create id (group)
        if len(level) == 0:
            agg_df.insert(1, 'id', 'total')
        elif len(level) == 1: 
            agg_df.insert(1, 'id', agg_df[level[0]])
            del agg_df[level[0]]
        elif len(level) > 1:
            agg_df.insert(1, 'id', agg_df[level[0]] + '_' + agg_df[level[1]])
            del agg_df[level[0]]
            del agg_df[level[1]]

        # id (group) encoding
        groups = agg_df['id'].unique()
        group_encoder = {group: group_idx for group_idx, group in enumerate(groups)}
 
        # merge
        group_df = agg_df.merge(calendar_df, on="d", how="left")

        # dataset
        train_dataset = ListDataset(
            [
                {
                    FieldName.ITEM_ID: id,
                    FieldName.TARGET: group["sales_sum"].values[:-28],
                    FieldName.START: pd.Period(min(group["d"]), freq="1D"),
                    'id': [group_encoder[id]],
                    "sales_mean": group["sales_mean"].values[:-28].reshape(1, -1),
                    "sales_std": group["sales_std"].values[:-28].reshape(1, -1),
                    "sales_max": group["sales_max"].values[:-28].reshape(1, -1),
                    "sales_min": group["sales_min"].values[:-28].reshape(1, -1),
                    "sales_diff_mean": group["sales_diff_mean"].values[:-28].reshape(1, -1),
                    "sales_lag1_mean": group["sales_lag1_mean"].values[:-28].reshape(1, -1),
                    "sales_lag7_mean": group["sales_lag7_mean"].values[:-28].reshape(1, -1),
                    "sales_lag28_mean": group["sales_lag28_mean"].values[:-28].reshape(1, -1),
                    "sales_rolling7_mean": group["sales_rolling7_mean"].values[:-28].reshape(1, -1),
                    "sales_rolling28_mean": group["sales_rolling28_mean"].values[:-28].reshape(1, -1),
                    "sales_rolling7_diff_mean": group["sales_rolling7_diff_mean"].values[:-28].reshape(1, -1),
                    "sales_rolling28_diff_mean": group["sales_rolling28_diff_mean"].values[:-28].reshape(1, -1),
                    "release_mean": group["release_mean"].values[:-28].reshape(1, -1),
                    "out_of_stock_mean": group["out_of_stock_mean"].values[:-28].reshape(1, -1),
                    "sell_price_mean": group["sell_price_mean"].values[:-28].reshape(1, -1),
                    "sell_price_std": group["sell_price_std"].values[:-28].reshape(1, -1),
                    "sell_price_max": group["sell_price_max"].values[:-28].reshape(1, -1),
                    "sell_price_min": group["sell_price_min"].values[:-28].reshape(1, -1),
                    "sell_price_diff_mean": group["sell_price_diff_mean"].values[:-28].reshape(1, -1),
                    "sell_price_lag_mean": group["sell_price_lag_mean"].values[:-28].reshape(1, -1),
                    "sell_price_rolling_mean": group["sell_price_rolling_mean"].values[:-28].reshape(1, -1),
                    "sell_price_rolling_diff_mean": group["sell_price_rolling_diff_mean"].values[:-28].reshape(1, -1),
                    "sell_price_in_store_mean": group["sell_price_in_store_mean"].values[:-28].reshape(1, -1),
                    "year_delta": group["year_delta"].values[:-28].reshape(1, -1),
                    "quarter_sin": group["quarter_sin"].values[:-28].reshape(1, -1),
                    "quarter_cos": group["quarter_cos"].values[:-28].reshape(1, -1),
                    "month_sin": group["month_sin"].values[:-28].reshape(1, -1),
                    "month_cos": group["month_cos"].values[:-28].reshape(1, -1),
                    "day_sin": group["day_sin"].values[:-28].reshape(1, -1),
                    "day_cos": group["day_cos"].values[:-28].reshape(1, -1),
                    "weekday_sin": group["weekday_sin"].values[:-28].reshape(1, -1),
                    "weekday_cos": group["weekday_cos"].values[:-28].reshape(1, -1),
                    "event_count": group["event_count"].values[:-28].reshape(1, -1),
                    'snap_CA': group['snap_CA'].values[:-28],
                    'snap_TX': group['snap_TX'].values[:-28],
                    'snap_WI': group['snap_WI'].values[:-28],
                    'event_name_1_enc': group['event_name_1_enc'].values[:-28],
                    'event_name_2_enc': group['event_name_2_enc'].values[:-28],
                    'event_type_1_enc': group['event_type_1_enc'].values[:-28],
                    'event_type_2_enc': group['event_type_2_enc'].values[:-28]
                }
                for id, group in group_df.groupby("id")
            ],
            freq="D",
        )

        test_dataset = ListDataset(
            [
                {
                    FieldName.ITEM_ID: id,
                    FieldName.TARGET: group["sales_sum"].values[:], 
                    FieldName.START: pd.Period(min(group["d"]), freq="1D"),
                    'id': [group_encoder[id]],
                    "sales_mean": group["sales_mean"].values[:].reshape(1, -1),
                    "sales_std": group["sales_std"].values[:].reshape(1, -1),
                    "sales_max": group["sales_max"].values[:].reshape(1, -1),
                    "sales_min": group["sales_min"].values[:].reshape(1, -1),
                    "sales_diff_mean": group["sales_diff_mean"].values[:].reshape(1, -1),
                    "sales_lag1_mean": group["sales_lag1_mean"].values[:].reshape(1, -1),
                    "sales_lag7_mean": group["sales_lag7_mean"].values[:].reshape(1, -1),
                    "sales_lag28_mean": group["sales_lag28_mean"].values[:].reshape(1, -1),
                    "sales_rolling7_mean": group["sales_rolling7_mean"].values[:].reshape(1, -1),
                    "sales_rolling28_mean": group["sales_rolling28_mean"].values[:].reshape(1, -1),
                    "sales_rolling7_diff_mean": group["sales_rolling7_diff_mean"].values[:].reshape(1, -1),
                    "sales_rolling28_diff_mean": group["sales_rolling28_diff_mean"].values[:].reshape(1, -1),
                    "release_mean": group["release_mean"].values[:].reshape(1, -1),
                    "out_of_stock_mean": group["out_of_stock_mean"].values[:].reshape(1, -1),
                    "sell_price_mean": group["sell_price_mean"].values[:].reshape(1, -1),
                    "sell_price_std": group["sell_price_std"].values[:].reshape(1, -1),
                    "sell_price_max": group["sell_price_max"].values[:].reshape(1, -1),
                    "sell_price_min": group["sell_price_min"].values[:].reshape(1, -1),
                    "sell_price_diff_mean": group["sell_price_diff_mean"].values[:].reshape(1, -1),
                    "sell_price_lag_mean": group["sell_price_lag_mean"].values[:].reshape(1, -1),
                    "sell_price_rolling_mean": group["sell_price_rolling_mean"].values[:].reshape(1, -1),
                    "sell_price_rolling_diff_mean": group["sell_price_rolling_diff_mean"].values[:].reshape(1, -1),
                    "sell_price_in_store_mean": group["sell_price_in_store_mean"].values[:].reshape(1, -1),
                    "year_delta": group["year_delta"].values[:].reshape(1, -1),
                    "quarter_sin": group["quarter_sin"].values[:].reshape(1, -1),
                    "quarter_cos": group["quarter_cos"].values[:].reshape(1, -1),
                    "month_sin": group["month_sin"].values[:].reshape(1, -1),
                    "month_cos": group["month_cos"].values[:].reshape(1, -1),
                    "day_sin": group["day_sin"].values[:].reshape(1, -1),
                    "day_cos": group["day_cos"].values[:].reshape(1, -1),
                    "weekday_sin": group["weekday_sin"].values[:].reshape(1, -1),
                    "weekday_cos": group["weekday_cos"].values[:].reshape(1, -1),
                    "event_count": group["event_count"].values[:].reshape(1, -1),
                    'snap_CA': group['snap_CA'].values[:],
                    'snap_TX': group['snap_TX'].values[:],
                    'snap_WI': group['snap_WI'].values[:],
                    'event_name_1_enc': group['event_name_1_enc'].values[:],
                    'event_name_2_enc': group['event_name_2_enc'].values[:],
                    'event_type_1_enc': group['event_type_1_enc'].values[:],
                    'event_type_2_enc': group['event_type_2_enc'].values[:]
                }
                for id, group in group_df.groupby("id")
            ],
            freq="D",
        )   
        
        datasets['train'] = train_dataset
        datasets['test'] = test_dataset
    
        # save
        with open(os.path.join(save_dir, f'dataset_level_{level_idx}.pkl'), 'wb') as f:
            pickle.dump(datasets, f)

        # reduce memory
        del agg_df
        del calendar_df
        del group_df
        del train_dataset
        del test_dataset
        del datasets

prepare_datasets()

[93mPreparing dataset for level 12[0m
