source: https://www.kaggle.com/kyakovlev/m5-custom-features from https://www.kaggle.com/ejunichi/m5-three-shades-of-dark-darker-magic

In [1]:
import sys
import os
import pathlib
import gc
import pandas as pd
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
import numpy as np
import math
import random
import pickle
import time
import psutil
import warnings


# function fixing random seeds

In [2]:
def seed_everything(seed=0):
    """Seed the global random number generators used by this notebook.

    Args:
        seed (int): Value handed to both ``random.seed`` and
            ``np.random.seed``.
    """
    # Same seed for the stdlib generator first, then NumPy's legacy global one.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed shared by the whole notebook; applied once here, right after definition.
SEED = 42
seed_everything(SEED)

# constant variables for helper functions

In [3]:
# CPU count reported by psutil; used as the upper bound on worker processes
# in run_df_in_multiprocess.
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N_CORES: {N_CORES}")

N_CORES: 36


#  constant variables for data import

In [4]:
# Change these variables according to the dataset you refer to.
# Both directories are relative paths; the cells below join them onto the
# directory two levels above the current working directory.
# Directory holding the input files (alternative "sample" location kept below):
# _DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic", "sample"])
_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])
# Directory the merged submission is written to.
_OUTPUT_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])

# File names of the two submission CSVs that this notebook reads.
_VALIDATION_RESULT = "submission_v3_validation.csv"
_EVALUATION_RESULT = "submission_v3_evaluation.csv"

# function nicely displaying the head of a pandas DataFrame

In [5]:
# Import the submodule explicitly: a bare ``import IPython`` does not guarantee
# that ``IPython.display`` has been loaded. This still binds the name ``IPython``.
import IPython.display


def display(*dfs, head=True):
    """Render each given object with IPython's rich display.

    Note: this name shadows the ``display`` function that IPython injects
    into notebook namespaces.

    Args:
        *dfs: Objects to render, typically pandas DataFrames.
        head (bool): When True, objects exposing a ``head`` method are shown
            via ``obj.head()``; objects without one (and everything when
            False) are shown in full.
    """
    for df in dfs:
        # Only call .head() where it exists so non-DataFrame objects still render.
        show_head = head and hasattr(df, "head")
        IPython.display.display(df.head() if show_head else df)

# function processing df in multiprocess

In [6]:
def run_df_in_multiprocess(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    The per-element results are concatenated column-wise (``axis=1``).

    Args:
        func: Callable invoked once per element of ``t_split``. It is sent to
            worker processes, so it must be picklable, and its results must
            be objects ``pd.concat`` accepts.
        t_split: Sized sequence of work items, one per ``func`` call.

    Returns:
        The result of ``pd.concat(results, axis=1)``.
    """
    # Imported locally: the notebook's import cell never brings Pool into
    # scope, so the original raised NameError on a fresh kernel.
    from multiprocessing import Pool

    # Never start more workers than there are work items.
    num_cores = min(N_CORES, len(t_split))
    print(f"num_cores: {num_cores}")
    pool = Pool(num_cores)
    try:
        results = pool.map(func, t_split)
    finally:
        # Release the worker processes even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results, axis=1)

# other helper functions

In [7]:
def get_memory_usage():
    """Simple "memory profiler" for checking this process's memory usage.

    Returns:
        The first field of psutil's ``memory_info()`` for the current
        process, divided by 2**30 and rounded to two decimals.
    """
    current_process = psutil.Process(os.getpid())
    # First memory_info() field -- presumably the resident set size in bytes;
    # confirm against the psutil docs.
    bytes_in_use = current_process.memory_info()[0]
    return np.round(bytes_in_use / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary prefixes.

    Args:
        num: Size to format (may be negative).
        suffix (str): Unit suffix appended after the binary prefix.

    Returns:
        str: ``num`` scaled down by powers of 1024 with one decimal place,
        followed by the matching prefix ('', 'Ki', ... 'Yi') and ``suffix``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    step = 0
    # Scale down until the magnitude fits below 1024 or the prefixes run out.
    while step < len(prefixes) and not (abs(value) < 1024.0):
        value = value / 1024.0
        step += 1
    if step == len(prefixes):
        # Larger than every listed prefix: report in 'Yi'.
        return "%.1f%s%s" % (value, 'Yi', suffix)
    return "%3.1f%s%s" % (value, prefixes[step], suffix)


def merge_by_concat(df1, df2, merge_on):
    """Left-join ``df2`` onto ``df1`` via concat so ``df1`` keeps its dtypes.

    Only the key columns of ``df1`` take part in the merge; the columns
    gained from ``df2`` are then attached to ``df1`` side by side.

    Args:
        df1 (pd.DataFrame): Left frame; its columns are returned untouched.
        df2 (pd.DataFrame): Right frame providing the additional columns.
        merge_on (list): Key column names present in both frames.

    Returns:
        pd.DataFrame: ``df1`` plus every non-key column of ``df2``.
    """
    merged = df1[merge_on].merge(df2, on=merge_on, how='left')
    new_columns = [col for col in merged.columns if col not in merge_on]
    if len(merged) == len(df1):
        # merge() hands back a fresh RangeIndex. Restore df1's labels so the
        # axis=1 concat pairs row i with row i instead of aligning on
        # unrelated labels (which produced extra NaN rows for a non-default
        # index). Skipped when duplicate keys in df2 changed the row count.
        merged.index = df1.index
    return pd.concat([df1, merged[new_columns]], axis=1)


def get_base_test():
    """Recombine the per-store test sets into a single frame after training.

    For every id in ``STORE_IDS`` the pickle ``test_dataset_<store_id>.pkl``
    is read from ``PRETRAINED_MODEL_DIR`` and tagged with a ``store_id``
    column; all frames are then stacked with a fresh 0..n-1 index.

    Returns:
        pd.DataFrame: The stacked test rows, or an empty DataFrame when
        ``STORE_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORE_IDS:
        test_pkl_path = os.path.sep.join([PRETRAINED_MODEL_DIR, 'test_dataset_'+store_id+'.pkl'])
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        temp_df = pd.read_pickle(test_pkl_path)
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    if not store_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copied all previously loaded rows on every iteration.
    return pd.concat(store_frames).reset_index(drop=True)



##### Helper to make dynamic rolling lags #####
def make_lag(lag_day):
    """Build a per-id lag feature of ``TARGET`` from the global ``base_test``.

    Args:
        lag_day (int): Number of rows to shift ``TARGET`` within each ``id``.

    Returns:
        pd.DataFrame: One float16 column named ``sales_lag_<lag_day>``,
        indexed like ``base_test``.
    """
    col_name = 'sales_lag_'+str(lag_day)
    # Work on the grouped column directly: assigning onto a column slice of
    # base_test triggered SettingWithCopyWarning and copied 'id'/'d' for nothing.
    lagged = base_test.groupby(['id'])[TARGET].shift(lag_day).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(lag_day):
    """Build a per-id shifted rolling mean of ``TARGET`` from ``base_test``.

    Args:
        lag_day: Indexable pair ``(shift_day, roll_wind)``: rows to shift
            within each ``id``, then the rolling-window length for the mean.

    Returns:
        pd.DataFrame: One column named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>``, indexed like
        ``base_test``.
    """
    shift_day, roll_wind = lag_day[0], lag_day[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Shift and roll inside each id group so windows never span two ids.
    # The result is built directly rather than assigned onto a slice of
    # base_test, which triggered SettingWithCopyWarning.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)
##### Helper to make dynamic rolling lags #####

# function importing data

In [8]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range.

    Adapted from
    https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754

    The frame is modified in place and also returned. Only columns whose
    dtype is one of int16/int32/int64/float16/float32/float64 are touched.

    NOTE: the float check looks at the value *range* only. float16 keeps
    roughly three significant decimal digits, so downcast float values are
    rounded.

    Args:
        df (pd.DataFrame): Frame whose columns are downcast.
        verbose (bool): Print the resulting memory usage and reduction.

    Returns:
        pd.DataFrame: The same ``df`` object with reduced memory usage.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:  # handle one column at a time
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # Choose the narrowest dtype from the column's minimum and maximum.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value exactly at the limit still fits
                # (the strict comparison pushed e.g. 127 up to int16).
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when no smaller type covers the range
            # (this includes all-NaN columns, whose min/max are NaN).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def read_csv_data(directory, file_name):
    """Load one CSV from ``<directory>/<_DATA_DIR>/<file_name>``.

    The loaded frame is passed through ``reduce_mem_usage``, so numeric
    columns may come back with narrower dtypes than pandas inferred.

    Args:
        directory: Base directory; converted with ``str`` before joining.
        file_name (str): Name of the CSV file inside ``_DATA_DIR``.

    Returns:
        pd.DataFrame: The memory-reduced contents of the file.
    """
    print('Reading files...')
    csv_path = os.path.sep.join([str(directory), _DATA_DIR, file_name])
    df = reduce_mem_usage(pd.read_csv(csv_path))
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('{} has {} rows and {} columns'.format(file_name, n_rows, n_cols))
    return df


def read_data_by_store(store):
    """Assemble the feature frame for a single store from the pickled grids.

    The BASE, PRICE and CALENDAR pickles are read from
    ``<grandparent of cwd>/_DATA_DIR`` and concatenated column-wise (PRICE and
    CALENDAR without their first two columns). Rows are then restricted to
    ``store_id == store``, and the MEAN_STD_FEATURES columns of MEAN_ENC plus
    all but the first three columns of LAGS are appended for those rows.

    BASE, PRICE, CALENDAR, MEAN_ENC, MEAN_STD_FEATURES, LAGS, REMOVE_FEATURES,
    TARGET and START_DAY_TRAIN are globals that are not defined in the visible
    part of this notebook.

    Args:
        store: Value compared against the ``store_id`` column.

    Returns:
        tuple: ``(df, features)`` where ``features`` lists every column not in
        REMOVE_FEATURES and ``df`` holds ``id``, ``d``, TARGET and those
        features for rows with ``d >= START_DAY_TRAIN``, index reset.
    """
#     # Previous version (paths given directly by the constants):
#     df = pd.concat([pd.read_pickle(BASE),
#                     pd.read_pickle(PRICE).iloc[:,2:],
#                     pd.read_pickle(CALENDAR).iloc[:,2:]],
#                     axis=1)

    # Read and concat the basic features
    parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
    df = pd.concat([pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, BASE])),
                    pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, PRICE])).iloc[:,2:],
                    pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, CALENDAR])).iloc[:,2:]],
                    axis=1)
#     print(f"df at read_data_by_store: {df}")

    # Leave only the relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read lags and mean encoding features separately and drop items that we don't need.
    # As our feature grids are aligned (presumably all pickles share one row
    # index -- TODO confirm), we can use the index to keep only necessary rows.
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, MEAN_ENC]))[MEAN_STD_FEATURES]
    df2 = df2[df2.index.isin(df.index)]
    print(f"MEAN_ENC: {MEAN_ENC}")

    df3 = pd.read_pickle(os.path.sep.join([str(parent_dir), _DATA_DIR, LAGS])).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    print(f"LAGS: {LAGS}")

    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit

    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit

    # Create the features list: every column except those in REMOVE_FEATURES
    features = [col for col in list(df) if col not in REMOVE_FEATURES]
    df = df[['id','d',TARGET]+features]

    # Skip the rows before START_DAY_TRAIN
    df = df[df['d']>=START_DAY_TRAIN].reset_index(drop=True)

    return df, features

# read csv data

In [9]:
# Base directory: two levels above the current working directory. The
# relative _DATA_DIR path is joined onto it inside read_csv_data.
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
print(f"parent_dir: {parent_dir}")

# Load both submission files; read_csv_data passes them through
# reduce_mem_usage, so numeric columns may be downcast.
validation_df = read_csv_data(parent_dir, _VALIDATION_RESULT)
evaluation_df = read_csv_data(parent_dir, _EVALUATION_RESULT)


parent_dir: /home/ec2-user/SageMaker
Reading files...
Mem. usage decreased to  3.72 Mb (72.4% reduction)
submission_v3_validation.csv has 60980 rows and 29 columns
Reading files...
Mem. usage decreased to  3.72 Mb (72.4% reduction)
submission_v3_evaluation.csv has 60980 rows and 29 columns


In [10]:
# Inspect the frame loaded from _VALIDATION_RESULT (plain-text dump).
print(validation_df)

                                  id        F1        F2        F3        F4  \
0      HOBBIES_1_001_CA_1_validation  0.669434  0.569824  0.683594  0.586426   
1      HOBBIES_1_002_CA_1_validation  0.220703  0.188721  0.195312  0.207642   
2      HOBBIES_1_003_CA_1_validation  0.292725  0.276855  0.284912  0.298584   
3      HOBBIES_1_004_CA_1_validation  1.514648  1.352539  1.205078  1.211914   
4      HOBBIES_1_005_CA_1_validation  0.935059  0.808105  0.900879  0.946777   
...                              ...       ...       ...       ...       ...   
60975    FOODS_3_823_WI_3_evaluation  0.000000  0.000000  0.000000  0.000000   
60976    FOODS_3_824_WI_3_evaluation  0.000000  0.000000  0.000000  0.000000   
60977    FOODS_3_825_WI_3_evaluation  0.000000  0.000000  0.000000  0.000000   
60978    FOODS_3_826_WI_3_evaluation  0.000000  0.000000  0.000000  0.000000   
60979    FOODS_3_827_WI_3_evaluation  0.000000  0.000000  0.000000  0.000000   

             F5        F6        F7    

In [11]:
# Inspect the frame loaded from _EVALUATION_RESULT before it is modified below.
print(evaluation_df)

                                  id        F1        F2        F3        F4  \
0      HOBBIES_1_001_CA_1_validation  0.000000  0.000000  0.000000  0.000000   
1      HOBBIES_1_002_CA_1_validation  0.000000  0.000000  0.000000  0.000000   
2      HOBBIES_1_003_CA_1_validation  0.000000  0.000000  0.000000  0.000000   
3      HOBBIES_1_004_CA_1_validation  0.000000  0.000000  0.000000  0.000000   
4      HOBBIES_1_005_CA_1_validation  0.000000  0.000000  0.000000  0.000000   
...                              ...       ...       ...       ...       ...   
60975    FOODS_3_823_WI_3_evaluation  0.352783  0.295654  0.300049  0.378418   
60976    FOODS_3_824_WI_3_evaluation  0.204346  0.207520  0.217285  0.221436   
60977    FOODS_3_825_WI_3_evaluation  0.667969  0.559082  0.593262  0.562988   
60978    FOODS_3_826_WI_3_evaluation  0.920898  0.991211  0.862305  0.969727   
60979    FOODS_3_827_WI_3_evaluation  1.707031  1.809570  1.599609  1.524414   

             F5        F6        F7    

In [12]:
# Rows whose id contains "validation" are copied from validation_df into
# evaluation_df; all other rows of evaluation_df stay as loaded.
is_validation_row = evaluation_df["id"].str.contains("validation")

# The copy pairs rows by index label, so both frames must list the same ids in
# the same order. Fail loudly instead of silently writing misaligned rows.
assert evaluation_df["id"].equals(validation_df["id"]), \
    "validation_df and evaluation_df must contain the same ids in the same order"

# One shared mask and explicit .loc indexing on both sides.
evaluation_df.loc[is_validation_row, :] = validation_df.loc[is_validation_row, :]

print(f"evaluation_df: {evaluation_df}")

evaluation_df:                                   id        F1        F2        F3        F4  \
0      HOBBIES_1_001_CA_1_validation  0.669434  0.569824  0.683594  0.586426   
1      HOBBIES_1_002_CA_1_validation  0.220703  0.188721  0.195312  0.207642   
2      HOBBIES_1_003_CA_1_validation  0.292725  0.276855  0.284912  0.298584   
3      HOBBIES_1_004_CA_1_validation  1.514648  1.352539  1.205078  1.211914   
4      HOBBIES_1_005_CA_1_validation  0.935059  0.808105  0.900879  0.946777   
...                              ...       ...       ...       ...       ...   
60975    FOODS_3_823_WI_3_evaluation  0.352783  0.295654  0.300049  0.378418   
60976    FOODS_3_824_WI_3_evaluation  0.204346  0.207520  0.217285  0.221436   
60977    FOODS_3_825_WI_3_evaluation  0.667969  0.559082  0.593262  0.562988   
60978    FOODS_3_826_WI_3_evaluation  0.920898  0.991211  0.862305  0.969727   
60979    FOODS_3_827_WI_3_evaluation  1.707031  1.809570  1.599609  1.524414   

             F5        F

# export train/test result

In [13]:
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent.parent
# Write the merged evaluation_df (validation rows filled in above) to CSV.

# NOTE(review): the export name equals _EVALUATION_RESULT and _OUTPUT_DIR is
# defined with the same value as _DATA_DIR, so this overwrites the file that
# was read as evaluation_df -- confirm this is intended, since a re-run then
# starts from the already merged file.
_EXPORT_FILE_NAME = _EVALUATION_RESULT
print("csv data export start")
evaluation_df.to_csv(os.path.sep.join([str(parent_dir), _OUTPUT_DIR, _EXPORT_FILE_NAME]), index=False)
print('csv data export finished. Size:', evaluation_df.shape)

csv data export start
csv data export finished. Size: (60980, 29)
