In [1]:
import abc
import os
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import seaborn as sns

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

## Objectives

- Baselineを作る。
- 簡単な特徴量エンジニアリングを行う。
- バリデーション戦略について考える。
- 予測精度の評価方法を決める。

## Note

## Scores

## Load Data

In [2]:
# MEMO:
# - sales_train_evaluation.csv will be added one month before the competition ends.
# - train = sales_train_validation, test = sales_train_evaluation.
def read_data():
    """Load the four competition tables.

    When the Kaggle input directory exists the CSV files are read from it;
    otherwise the pickled copies under ``../data/reduced/`` are used.

    Returns:
        A tuple ``(calendar, sample_submission, sales_train_validation,
        sell_prices)`` in exactly that order, ready to be unpacked.
    """
    files = ['calendar', 'sample_submission', 'sales_train_validation', 'sell_prices']
    kaggle_dir_path = '/kaggle/input/m5-forecasting-accuracy'

    if os.path.exists(kaggle_dir_path):
        data_dir_path, extension, reader = kaggle_dir_path, '.csv', pd.read_csv
    else:
        # NOTE: pickles are only safe to load from a trusted, locally produced source.
        data_dir_path, extension, reader = '../data/reduced/', '.pkl', pd.read_pickle

    dst_data = {}
    for file in files:
        print(f'Reading {file} ....')
        # os.path.join inserts the separator that plain string concatenation
        # dropped between the Kaggle directory and the file name.
        dst_data[file] = reader(os.path.join(data_dir_path, file + extension))
    return tuple(dst_data.values())

# TODO: assign sales_train_evaluation.csv to `test` once it is published.
calendar, submission, train, sell_prices = read_data()
# Placeholder: `test` stays an empty frame until the evaluation file exists.
test = pd.DataFrame()

Reading calendar ....
Reading sample_submission ....
Reading sales_train_validation ....
Reading sell_prices ....


In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to save memory.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (strings, int8, unsigned ints, ...) is left
    as it is. Each eligible column is cast to the smallest dtype of the same
    kind whose representable range contains the column's min and max.

    Caveat: float columns may end up as float16, which keeps only about three
    significant decimal digits, so stored values are rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting size and the reduction in percent.

    Returns:
        The same ``df`` object, to allow use with ``DataFrame.pipe``.
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a value equal to the dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [7]:
# MEMO: the preprocessed and reshaped data is written out as a cache.
def encode_categorical(df, cols):
    """Label-encode each column in ``cols`` in place and return ``df``.

    A separate ``LabelEncoder`` is fitted for every column, so the integer
    codes of different columns are independent of each other.
    """
    for name in cols:
        raw_values = df[name].values
        df[name] = preprocessing.LabelEncoder().fit_transform(raw_values)
    return df


def encode_calendar(src_df):
    """Return an encoded copy of the calendar table; ``src_df`` is not modified.

    The 'date', 'weekday' and 'year' columns are removed, missing event
    names/types are replaced by the string 'None', and the four event columns
    are label-encoded and stored as int8.
    """
    unused_cols = ['date', 'weekday', 'year']
    # MEMO: N_Unique of event_name_1 == 31 and event_name_2 == 5.
    event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

    # Copy first so that nothing below can write through to the caller's frame.
    encoded = src_df.copy().drop(unused_cols, axis=1)
    encoded[event_cols] = encoded[event_cols].fillna('None')
    encoded = encode_categorical(encoded, event_cols)
    encoded[event_cols] = encoded[event_cols].astype('int8')
    return encoded


def melt_and_merge(train, test, calendar, sell_prices, use_cache=True):
    """Reshape the wide sales table to long format and join calendar and prices.

    Steps (when the cache is not used):
      1. Encode the calendar with ``encode_calendar``.
      2. Map item/dept/cat/store/state ids to integer codes built from the
         sorted unique values found in ``train``; the same mapping is applied
         to ``store_id`` and ``item_id`` of ``sell_prices``.
      3. Melt the day columns into rows (``d``, ``sales``), left-join the
         calendar on ``d`` and the prices on (store_id, item_id, wm_yr_wk).
      4. Back-fill missing ``sell_price`` values within each
         (store_id, item_id) series.
      5. Shrink dtypes with ``reduce_mem_usage`` and pickle both frames under
         ``processed/``.

    The caller's ``train`` and ``sell_prices`` are not modified, so the cell
    can be re-run against the same raw frames.

    NOTE: with ``use_cache=True`` an existing ``processed/train.pkl`` /
    ``processed/test.pkl`` pair is returned as-is. Delete those files or pass
    ``use_cache=False`` to rebuild them after changing this function.

    Args:
        train: Wide sales frame with the id columns plus one column per day.
        test: Test frame; currently only pickled and returned unchanged.
        calendar: Raw calendar frame.
        sell_prices: Raw price frame keyed by store_id, item_id, wm_yr_wk.
        use_cache: Load the pickled result when both cache files exist.

    Returns:
        ``(train, test)`` where ``train`` is the merged long-format frame.
    """
    dir_path = 'processed'
    train_save_path = os.path.join(dir_path, 'train.pkl')
    test_save_path = os.path.join(dir_path, 'test.pkl')

    # Use the cache when both pickles are present.
    if use_cache and os.path.exists(train_save_path) and os.path.exists(test_save_path):
        return pd.read_pickle(train_save_path), pd.read_pickle(test_save_path)

    # Work on copies: mapping the id columns on the caller's frames would turn
    # them into NaN on a second run (the labels would already be integers).
    train = train.copy()
    sell_prices = sell_prices.copy()

    # Encode calendar
    encoded_calendar = encode_calendar(calendar)

    # Encode train data
    label_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']  # Used for melt.
    # MEMO: the labels are shared by all tables, so one mapping serves
    # train / test / sell_prices.
    all_label_map = {
        col: {label: i for i, label in enumerate(sorted(train[col].unique()))}
        for col in label_cols
    }

    for col in label_cols:
        # TODO: apply the same mapping to the test data once it is available.
        train[col] = train[col].map(all_label_map[col])
        if col in ('store_id', 'item_id'):
            sell_prices[col] = sell_prices[col].map(all_label_map[col])

    train = pd.melt(train, id_vars=id_columns, var_name='d', value_name='sales')
    train = pd.merge(train, encoded_calendar, how='left', on='d')
    train = pd.merge(train, sell_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
    # MEMO: fill earlier missing prices with the next known price. Prices are
    # joined per (store_id, item_id), so the back-fill must stay inside that
    # pair; grouping by item_id alone would borrow another store's price.
    train['sell_price'] = train.groupby(['store_id', 'item_id'])['sell_price'].bfill()
    # TODO: back-fill sell_price for the test data in the same way.

    # Save cache
    train = train.pipe(reduce_mem_usage)
    # TODO: run reduce_mem_usage on the test data as well.
    os.makedirs(dir_path, exist_ok=True)

    train.to_pickle(train_save_path)
    test.to_pickle(test_save_path)

    return train, test

# Rebinds `train` / `test` to the frames returned by melt_and_merge.
# NOTE(review): after this line `train` is no longer the raw wide table, so
# re-running this cell passes the already-processed frame back in.
train, test = melt_and_merge(train, test, calendar, sell_prices, use_cache=True)

In [8]:
# Quick look at the merged frame: its shape and the first rows.
print(train.shape)
train.head()

(58327370, 19)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,wm_yr_wk,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1,0,11101,1,1,19,2,3,1,0,0,0,9.578125
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,d_1,0,11101,1,1,19,2,3,1,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,d_1,0,11101,1,1,19,2,3,1,0,0,0,2.970703
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,d_1,0,11101,1,1,19,2,3,1,0,0,0,4.339844
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,d_1,0,11101,1,1,19,2,3,1,0,0,0,2.980469


## Feature Engineering

# 起きたら特徴量エンジニアリングやるぞ！！！
- とりあえず、m5-baselineのマネをするところからになりそう。
- どんどん真似するぞ！！！
- jazzもいくぞ！

In [135]:
class Feature(metaclass=abc.ABCMeta):
    """Abstract base for a feature block that can be computed and pickled.

    Subclasses implement ``create_features`` and store their results in
    ``self.train``, ``self.test`` and ``self.categoricals``. ``run`` then
    renames the columns and ``save`` writes the three objects to ``save_dir``.
    """

    # Optional text put in front of every column name, joined with "_".
    prefix = ""
    # Optional text put behind every column name, joined with "_".
    suffix = ""
    # Directory that receives the pickled outputs; created on instantiation.
    save_dir = "features"
    is_feature = True

    def __init__(self):
        """Create ``save_dir`` if needed and set up empty result holders."""
        self.name = self.__class__.__name__
        Path(self.save_dir).mkdir(exist_ok=True, parents=True)
        self.train = pd.DataFrame()
        self.test = pd.DataFrame()
        # An explicit dtype avoids the warning pandas raises for a bare Series().
        self.categoricals = pd.Series(dtype=object)
        self.train_path = Path(self.save_dir) / f"{self.name}_train.pkl"
        self.test_path = Path(self.save_dir) / f"{self.name}_test.pkl"
        self.categoricals_path = Path(self.save_dir) / f"{self.name}_categoricals.pkl"

    def run(self, train_df, test_df=None, log=False):
        """Build the features and normalise their column names.

        Column labels are converted to ``str`` and wrapped as
        ``<prefix>_<name>_<suffix>`` (each part only when non-empty).

        Note: the renaming is applied to whatever objects ``create_features``
        stored in ``self.train`` / ``self.test``; if those are the caller's
        frames, the caller sees the renamed columns too.

        Args:
            train_df: Training data handed to ``create_features``.
            test_df: Optional test data handed to ``create_features``.
            log: Currently unused.

        Returns:
            ``self``, so that ``.save()`` can be chained.
        """
        self.create_features(train_df, test_df)
        if self.test is None:
            # No test data was supplied; keep an empty frame so the renaming
            # below and save() still work.
            self.test = pd.DataFrame()
        prefix = f"{self.prefix}_" if self.prefix else ""
        # The separator goes before the suffix ("col_sfx", not "colsfx_").
        suffix = f"_{self.suffix}" if self.suffix else ""
        self.train.columns = [f"{prefix}{c}{suffix}" for c in self.train.columns]
        self.test.columns = [f"{prefix}{c}{suffix}" for c in self.test.columns]
        return self

    @abc.abstractmethod
    def create_features(self, train_df, test_df):
        """Populate ``self.train``, ``self.test`` and ``self.categoricals``."""
        raise NotImplementedError

    def save(self):
        """Pickle train, test and categoricals to their paths under ``save_dir``."""
        self.train.to_pickle(str(self.train_path))
        self.test.to_pickle(str(self.test_path))
        self.categoricals.to_pickle(str(self.categoricals_path))

In [None]:
"""
Abbreviations
"""
class BaseFeature(Feature):
    """Feature block that passes the given train/test frames through as they are."""

    @staticmethod
    def add_common_features(df):
        """Placeholder hook for shared features; currently returns ``df`` unchanged.

        Declared as a static method because it takes no ``self``; without the
        decorator an instance call would bind the instance to ``df``.
        """
        return df

    def create_features(self, train, test):
        """Store ``train`` and ``test`` as the feature frames, without copying."""
        # NOTE(review): the original referenced an undefined name
        # `categoricals` here. An empty list keeps the cell runnable; fill in
        # the categorical column names once they are decided.
        categoricals = []
        # Set train, test, categoricals
        self.train = train
        self.test = test
        self.categoricals = pd.Series(categoricals, dtype=object)

In [None]:
%%time
# NOTE(review): `raw_train` / `raw_test` are not defined in any cell visible
# here (the loaded frames are named `train` / `test`) - confirm the intended
# inputs before running this cell on a fresh kernel.
BaseFeature().run(raw_train, raw_test).save()

## Training Model and Prediction

## Submission