source: https://www.kaggle.com/kyakovlev/m5-custom-features from https://www.kaggle.com/ejunichi/m5-three-shades-of-dark-darker-magic

In [1]:
import sys
import os
import pathlib
import gc
import pandas as pd
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
import numpy as np
import math
import random
import pickle
import time
import psutil
import warnings

# custom import
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool        # Multiprocess Runs

# warnings.filterwarnings('ignore')

# constant variables for helper functions

In [2]:
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N_CORES: {N_CORES}")

N_CORES: 16


# function nicely diplaying a head of Pandas DataFrame

In [3]:
import IPython

def display(*dfs, head=True):
    for df in dfs:
        IPython.display.display(df.head() if head else df)

# function fixing random seeds

In [4]:
def seed_everything(seed=0):
    """Sets seed to make all processes deterministic     # type: int
    
    """
    random.seed(seed)
    np.random.seed(seed)

SEED = 42
seed_everything(SEED)    

# function processing df in multiprocess

In [5]:
def run_df_in_multiprocess(func, t_split):
    """Process ds in Multiprocess
    
    """
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

# other helper functions

In [6]:
def get_memory_usage():
    """メモリ使用量を確認するためのシンプルな「メモリプロファイラ」
    
    """
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def merge_by_concat(df1, df2, merge_on):
    """
    dtypesを失わないための連結による結合
    
    """
    
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1

#  constant variables for data import

In [7]:
_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])

_CLEANED_BASE_GRID = "clearned_base_grid_for_darker_magic_evaluation.pkl"
_BASE_GRID_WITH_SALES_PRICE_FEATURES = "base_grid_with_sales_price_features_for_darker_magic_evaluation.pkl"
_BASE_GRID_WITH_CALENDAR_FEATURES = "base_grid_with_calendar_features_for_darker_magic_evaluation.pkl"

# function importing data

In [8]:
def reduce_mem_usage(df, verbose=True):
    """
    reduce the memory usage of the given dataframe.
    https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754
    
    Args:
        df: Dataframe
        verbose: 
        
    Returns:
        df, whose memory usage is reduced.

    Raises:
        None
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: #numericsのデータ型の範囲内のときに処理を実行. データの最大最小値を元にデータ型を効率的なものに変更
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_data(directory, file_name):
    print('Reading files...')
    df = pd.read_csv(os.path.sep.join([str(directory), _DATA_DIR, file_name]))
    df = reduce_mem_usage(df)
    print('{} has {} rows and {} columns'.format(file_name, df.shape[0], df.shape[1]))
    
    return df

# read pickle

In [9]:
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent

grid_df = pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, _CLEANED_BASE_GRID]))
print(f"grid_df: {grid_df}")

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release
0,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,0
1,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2.0,0
2,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
3,HOBBIES_1_012_CA_1_evaluation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
4,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,1,4.0,0


# constant variables for feature engineering

In [10]:
# 予測期間とitem数の定義 / number of items, and number of prediction period
_NUM_UNIQUE_ITEM_ID = 30490
_DAYS_FOR_PREDICTION = 28

DAYS_PER_YEAR = 365
_NUM_YEARS_FOR_MELT = 2
_NUM_IMPORT_ROWS_FOR_MELT = DAYS_PER_YEAR * _NUM_YEARS_FOR_MELT * _NUM_UNIQUE_ITEM_ID
print(f"_NUM_IMPORT_ROWS_FOR_MELT: {_NUM_IMPORT_ROWS_FOR_MELT}")

_SALES_HISTORY_DAYS = 1913
_SALES_HISTORY_START_DAYS_FOR_VALIDATION = _SALES_HISTORY_DAYS + 1
# we will use last 28 days as evaluation
_SALES_HISTORY_START_DAYS_FOR_EVALUATION = _SALES_HISTORY_START_DAYS_FOR_VALIDATION + _DAYS_FOR_PREDICTION
print(f"_SALES_HISTORY_START_DAYS_FOR_EVALUATION: {_SALES_HISTORY_START_DAYS_FOR_EVALUATION}")

TARGET = 'sales' # Our Target
MAIN_INDEX = ['id','d']  # We can identify items by these columns

_NUM_IMPORT_ROWS_FOR_MELT: 22257700
_SALES_HISTORY_START_DAYS_FOR_EVALUATION: 1942


# add features

In [11]:
# let evaluation sales value nan to avoid leakage in calculating means and stds
grid_df[TARGET][grid_df['d']>=(_SALES_HISTORY_START_DAYS_FOR_EVALUATION)] = np.nan

base_cols = list(grid_df)
print(f"base_cols: {base_cols}")

icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    grid_df['enc'+col_name+'mean'] = grid_df.groupby(col)[TARGET].transform('mean').astype(np.float16)
    grid_df['enc'+col_name+'std'] = grid_df.groupby(col)[TARGET].transform('std').astype(np.float16)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


base_cols: ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'sales', 'release']
Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']


In [12]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47735397 entries, 0 to 47735396
Data columns (total 24 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   id                         category
 1   d                          int16   
 2   enc_state_id_mean          float16 
 3   enc_state_id_std           float16 
 4   enc_store_id_mean          float16 
 5   enc_store_id_std           float16 
 6   enc_cat_id_mean            float16 
 7   enc_cat_id_std             float16 
 8   enc_dept_id_mean           float16 
 9   enc_dept_id_std            float16 
 10  enc_state_id_cat_id_mean   float16 
 11  enc_state_id_cat_id_std    float16 
 12  enc_state_id_dept_id_mean  float16 
 13  enc_state_id_dept_id_std   float16 
 14  enc_store_id_cat_id_mean   float16 
 15  enc_store_id_cat_id_std    float16 
 16  enc_store_id_dept_id_mean  float16 
 17  enc_store_id_dept_id_std   float16 
 18  enc_item_id_mean           float16 
 19  enc_item_id_std    

# export base_grid having mean-encoded ids and their means and stds

In [13]:
_EXPORT_FILE_NAME = "base_grid_with_mean_encoded_ids_means_stds_for_darker_magic_evaluation.pkl"
print("data export start")
grid_df.to_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, _EXPORT_FILE_NAME]))
print('data export finished. Size:', grid_df.shape)


data export start
data export finished. Size: (47735397, 24)
