source: https://www.kaggle.com/kyakovlev/m5-lags-features from https://www.kaggle.com/ejunichi/m5-three-shades-of-dark-darker-magic

In [1]:
import sys
import os
import pathlib
import gc
import pandas as pd
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
import numpy as np
import math
import random
import pickle
import time
import psutil
import warnings

# custom import
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool        # Multiprocess Runs

# warnings.filterwarnings('ignore')

# constant variables for helper functions

In [2]:
# Number of logical CPU cores reported by psutil; used to size worker pools.
# psutil.cpu_count() is documented to return None when the count cannot be
# determined, so fall back to a single core to keep N_CORES a usable integer.
N_CORES = psutil.cpu_count() or 1
print(f"N_CORES: {N_CORES}")

N_CORES: 36


# function nicely displaying the head of a Pandas DataFrame

In [3]:
import IPython

def display(*dfs, head=True):
    """Render each given object through ``IPython.display.display``.

    Args:
        *dfs: Objects to show. With ``head=True`` each one must provide a
            ``.head()`` method (e.g. a pandas DataFrame).
        head: If true, only ``obj.head()`` is rendered; otherwise the object
            itself is rendered.
    """
    for frame in dfs:
        shown = frame.head() if head else frame
        IPython.display.display(shown)

# function fixing random seeds

In [4]:
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global random generator.

    Args:
        seed (int): Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fixed seed applied once when this cell runs.
SEED = 42
seed_everything(seed=SEED)

# function processing df in multiprocess

In [5]:
def run_df_in_multiprocess(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    The per-element results returned by ``Pool.map`` are concatenated
    column-wise (``axis=1``) into a single DataFrame.

    Args:
        func: Callable handed to ``Pool.map``; called once per element.
        t_split: Sized collection of work items (one task per element).

    Returns:
        The result of ``pd.concat(results, axis=1)``.

    Raises:
        ValueError: If ``t_split`` is empty (a pool needs at least one worker).
    """
    if len(t_split) == 0:
        raise ValueError("t_split must contain at least one element")

    # Never start more workers than there are tasks.
    num_cores = min(N_CORES, len(t_split))
    pool = Pool(num_cores)
    try:
        results = pool.map(func, t_split)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()
    return pd.concat(results, axis=1)

# other helper functions

In [6]:
def get_memory_usage():
    """Simple "memory profiler" for checking the current process's memory use.

    Reads the first field of psutil's ``memory_info()`` for this process,
    divides it by 2**30 and rounds the result to two decimals.
    """
    this_process = psutil.Process(os.getpid())
    first_field = this_process.memory_info()[0]
    return np.round(first_field / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` with a binary (1024-based) unit prefix.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.

    Args:
        num: Quantity to format.
        suffix: Unit suffix appended after the prefix.

    Returns:
        A string such as ``"1.5KiB"``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    return "%.1f%s%s" % (num, 'Yi', suffix)


def merge_by_concat(df1, df2, merge_on):
    """Left-merge ``df2`` onto ``df1`` via concatenation so dtypes are not lost.

    Only the key columns of ``df1`` take part in the merge; the columns that
    the merge adds are then concatenated next to the untouched ``df1``.

    Args:
        df1: Left DataFrame; its columns are passed through unchanged.
        df2: Right DataFrame providing the additional columns.
        merge_on: List of key column names present in both frames.

    Returns:
        A new DataFrame: ``df1`` plus the non-key columns coming from ``df2``.
    """
    merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')

    # The merge result carries a fresh positional index, while pd.concat(axis=1)
    # aligns on index labels. Re-attach df1's index so rows line up even when
    # df1 does not use the default RangeIndex. If the merge changed the row
    # count (duplicate keys in df2), the result is left as it is.
    if len(merged_gf) == len(df1):
        merged_gf.index = df1.index

    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    return pd.concat([df1, merged_gf[new_columns]], axis=1)

#  constant variables for data import

In [7]:
# Data directory, relative to the project root resolved later in the notebook.
_DATA_DIR = os.path.join("data", "M5_Three_shades_of_Dark_Darker_magic")
_CALENDAR_CSV_FILE = "calendar.csv"

# Pickle file names of the base grid and its feature-enriched variants.
_CLEANED_BASE_GRID = "clearned_base_grid_for_darker_magic.pkl"
_BASE_GRID_WITH_SALES_PRICE_FEATURES = "base_grid_with_sales_price_features_for_darker_magic.pkl"
_BASE_GRID_WITH_CALENDAR_FEATURES = "base_grid_with_calendar_features_for_darker_magic.pkl"


# function importing data

In [8]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of a dataframe to reduce its memory usage.
    https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754

    Columns are converted in place on the given dataframe, which is also
    returned. The target dtype is picked from each column's min/max values.

    Args:
        df: Dataframe whose numeric columns are downcast.
        verbose: If true, print the resulting size and the percentage saved.

    Returns:
        df, whose memory usage is reduced.

    Raises:
        None
    """
    numeric_names = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for name in df.columns:  # handle one column at a time
        dtype = df[name].dtypes
        # Only columns whose dtype is listed above are touched.
        if dtype not in numeric_names:
            continue

        lowest = df[name].min()
        highest = df[name].max()
        if str(dtype).startswith('int'):
            # Narrowest integer type whose range strictly contains the data;
            # None means no candidate qualifies and the column is left alone.
            target = next(
                (t for t in int_candidates
                 if lowest > np.iinfo(t).min and highest < np.iinfo(t).max),
                None,
            )
        else:
            # Narrowest float type whose range strictly contains the data,
            # falling back to float64.
            target = next(
                (t for t in float_candidates
                 if lowest > np.finfo(t).min and highest < np.finfo(t).max),
                np.float64,
            )

        if target is not None:
            df[name] = df[name].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_data(directory, file_name):
    """Read a CSV from the data directory and downcast its numeric columns.

    Args:
        directory: Base directory; ``_DATA_DIR`` and ``file_name`` are appended
            to its string form using the OS path separator.
        file_name: Name of the CSV file to read.

    Returns:
        The loaded dataframe after ``reduce_mem_usage``.
    """
    print('Reading files...')
    csv_path = os.path.sep.join([str(directory), _DATA_DIR, file_name])
    df = reduce_mem_usage(pd.read_csv(csv_path))
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('{} has {} rows and {} columns'.format(file_name, n_rows, n_cols))

    return df

# read pickle

In [9]:
# Project root: two levels above the current working directory.
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent

# Load the cleaned base grid pickle and preview its first rows.
_cleaned_grid_path = os.path.sep.join([str(parent_dir), _DATA_DIR, _CLEANED_BASE_GRID])
grid_df = pd.read_pickle(_cleaned_grid_path)
display(grid_df)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,0
1,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2.0,0
2,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
3,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
4,HOBBIES_1_015_CA_1_validation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,1,4.0,0


# constant variables for feature engineering

In [10]:
# Definition of the number of items and of the prediction period
_NUM_UNIQUE_ITEM_ID = 30490
_DAYS_FOR_PREDICTION = 28

# Row count corresponding to two years of daily rows for every item.
DAYS_PER_YEAR = 365
_NUM_YEARS_FOR_MELT = 2
_NUM_IMPORT_ROWS_FOR_MELT = _NUM_UNIQUE_ITEM_ID * _NUM_YEARS_FOR_MELT * DAYS_PER_YEAR
print(f"_NUM_IMPORT_ROWS_FOR_MELT: {_NUM_IMPORT_ROWS_FOR_MELT}")

_SALES_HISTORY_DAYS = 1913 # And we will use last 28 days as validation
_SALES_HISTORY_START_DAYS_FOR_VALIDATION = 1 + _SALES_HISTORY_DAYS
# 1942: the day after the 28-day window that follows the sales history.
_SALES_HISTORY_START_DAYS_FOR_EVALUATION = _SALES_HISTORY_DAYS + _DAYS_FOR_PREDICTION + 1

MAIN_INDEX = ['id','d']  # We can identify items by these columns
TARGET = 'sales' # Our Target

_NUM_IMPORT_ROWS_FOR_MELT: 22257700


# add features

In [11]:
# TODO:
# Try df.groupby(['id'])['demand'].transform(lambda x: x.shift(365).rolling(7).mean())
# i.e. the rolling mean / std of the demand around the same day of the previous
# year, for windows such as 7, 15, 30, ... Not verified to help, but worth trying.

# We need only 'id','d','sales' to make lags and rollings
grid_df = grid_df[['id','d', TARGET]]

print("grid_df")
display(grid_df)

# Lags with 28 day shift
# NOTE(review): shifting inside each 'id' group assumes the rows of every id
# are already ordered by day 'd' -- confirm against the notebook that builds
# the base grid.
start_time = time.time()
print('Create lags')
num_lag_days = 32
# Smallest lag in days. (The original note spoke of treating "30 days ago" as
# one month ago, but the value actually used here is 28.)
days_last_month_ = 28
LAG_DAYS = list(range(days_last_month_, days_last_month_ + num_lag_days))
# Also add the units sold 365, 366 and 367 days earlier as features (one year
# back, allowing for leap-year drift). The original note warns that lags
# inside the 28-day window would be leakage.
LAG_DAYS.extend([365,366,367])
print(f"LAG_DAYS: {LAG_DAYS}")

# Group once and use GroupBy.shift, the vectorised equivalent of
# transform(lambda x: x.shift(lag)). Each lag column is cast to float16 as it
# is created, so no full-size float64 copies of the lag columns are kept
# and no separate "minify" pass over the columns is needed afterwards.
rows_by_id = grid_df.groupby(['id'])
grid_df = grid_df.assign(**{
        '{}_lag_{}'.format(col, lag): rows_by_id[col].shift(lag).astype(np.float16)
        for lag in LAG_DAYS
        for col in [TARGET]
    })

print("grid_df after assigning Lags with 28 day shift features")
display(grid_df)

print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

# Rollings with _DAYS_FOR_PREDICTION day shift
start_time = time.time()
print('Create rolling aggs')

# The grouping of the target by 'id' is the same for every feature below, so it
# is built once here instead of once per feature.
target_by_id = grid_df.groupby(['id'])[TARGET]

for i in [7,14,30,60,180]:
    print('Rolling period:', i)
    # Mean / std over a window of i days that ends _DAYS_FOR_PREDICTION days
    # before the current row, computed separately for every id.
    grid_df['rolling_mean_'+str(i)] = target_by_id.transform(lambda x: x.shift(_DAYS_FOR_PREDICTION).rolling(i).mean()).astype(np.float16)
    grid_df['rolling_std_'+str(i)]  = target_by_id.transform(lambda x: x.shift(_DAYS_FOR_PREDICTION).rolling(i).std()).astype(np.float16)

print("grid_df after Rollings with 28 day shift")
display(grid_df)

# Rollings with sliding shift.
for d_shift in [1,7,14]:
    print('Shifting period:', d_shift)
    for d_window in [7,14,30,60]:
        # A prediction for a given day is treated as not knowing the sales of
        # the preceding 28 days, so _DAYS_FOR_PREDICTION is added to the shift.
        # According to the original note, these columns are removed at
        # prediction time (training_and_validation.ipynb or
        # training_and_validation_and_evaluation.ipynb) by matching on "_tmp_".
        col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)
        grid_df[col_name] = target_by_id.transform(lambda x: x.shift(_DAYS_FOR_PREDICTION + d_shift).rolling(d_window).mean()).astype(np.float16)

print("grid_df after Rollings after 1,7,14 day shift")
display(grid_df)

# This timer covers the rolling features (it was previously labeled "Lags").
print('%0.2f min: Rollings' % ((time.time() - start_time) / 60))

grid_df


Unnamed: 0,id,d,sales
0,HOBBIES_1_008_CA_1_validation,1,12.0
1,HOBBIES_1_009_CA_1_validation,1,2.0
2,HOBBIES_1_010_CA_1_validation,1,0.0
3,HOBBIES_1_012_CA_1_validation,1,0.0
4,HOBBIES_1_015_CA_1_validation,1,4.0


Create lags
LAG_DAYS: [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 365, 366, 367]
grid_df after assigning Lags with 28 day shift features


Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,sales_lag_35,sales_lag_36,sales_lag_37,sales_lag_38,sales_lag_39,sales_lag_40,sales_lag_41,sales_lag_42,sales_lag_43,sales_lag_44,sales_lag_45,sales_lag_46,sales_lag_47,sales_lag_48,sales_lag_49,sales_lag_50,sales_lag_51,sales_lag_52,sales_lag_53,sales_lag_54,sales_lag_55,sales_lag_56,sales_lag_57,sales_lag_58,sales_lag_59,sales_lag_365,sales_lag_366,sales_lag_367
0,HOBBIES_1_008_CA_1_validation,1,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


12.29 min: Lags
Create rolling aggs
Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180
grid_df after Rollings with 28 day shift


Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,sales_lag_35,sales_lag_36,sales_lag_37,sales_lag_38,sales_lag_39,sales_lag_40,sales_lag_41,sales_lag_42,sales_lag_43,sales_lag_44,sales_lag_45,sales_lag_46,sales_lag_47,sales_lag_48,sales_lag_49,sales_lag_50,sales_lag_51,sales_lag_52,sales_lag_53,sales_lag_54,sales_lag_55,sales_lag_56,sales_lag_57,sales_lag_58,sales_lag_59,sales_lag_365,sales_lag_366,sales_lag_367,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,rolling_mean_180,rolling_std_180
0,HOBBIES_1_008_CA_1_validation,1,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Shifting period: 1
Shifting period: 7
Shifting period: 14
grid_df after Rollings after 1,7,14 day shift


Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,sales_lag_35,sales_lag_36,sales_lag_37,sales_lag_38,sales_lag_39,sales_lag_40,sales_lag_41,sales_lag_42,sales_lag_43,sales_lag_44,sales_lag_45,sales_lag_46,sales_lag_47,sales_lag_48,sales_lag_49,sales_lag_50,sales_lag_51,sales_lag_52,sales_lag_53,sales_lag_54,sales_lag_55,sales_lag_56,sales_lag_57,sales_lag_58,sales_lag_59,sales_lag_365,sales_lag_366,sales_lag_367,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,rolling_mean_180,rolling_std_180,rolling_mean_tmp_1_7,rolling_mean_tmp_1_14,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
0,HOBBIES_1_008_CA_1_validation,1,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


13.24 min: Lags


# export grid_df having lag features

In [12]:
# File name embeds the prediction horizon in days.
_EXPORT_FILE_NAME = "".join(["base_grid_with_lag_features_for_", str(_DAYS_FOR_PREDICTION), "_days.pkl"])
_export_path = os.path.sep.join([str(parent_dir), _DATA_DIR, _EXPORT_FILE_NAME])

# Write the grid with its lag / rolling features next to the other pickles.
print("data export start")
grid_df.to_pickle(_export_path)
print('data export finished. Size:', grid_df.shape)

data export start
data export finished. Size: (46881677, 60)
