In [2]:
import gc
import os
import time
import pickle
import warnings
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from typing import Union
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics
from tqdm import tqdm
from scipy.sparse import csr_matrix

warnings.filterwarnings('ignore')
"""
TODO: use 365 + 28 days as training data to build the lag features, then drop the oldest 28 days because their lag features are missing.

"""

# Reduce memory usage
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the narrowest dtype whose range fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    considered; every other column (strings, datetimes, int8, unsigned ints, ...) is
    left untouched. The frame is modified in place and also returned.

    Args:
        df: DataFrame to shrink.
        verbose: when True, print the resulting memory usage and the reduction.

    Returns:
        The same ``df`` object, with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme value equals the dtype limit
                # (e.g. max == 127) still fits and must not be pushed to a wider dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If no candidate matched (e.g. an empty column, where min/max are NaN)
            # the column keeps its dtype.
        else:
            # Fallback when neither float16 nor float32 fits, or when min/max are NaN.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            # NOTE: only the value *range* is checked. float16 keeps roughly three
            # significant decimal digits, so values that fit are still rounded.
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df


# Read the raw competition tables
# (some columns are ignored downstream; this is a very fast model).
def read_data(data_dir='../input/m5-forecasting-accuracy'):
    """Load the four raw CSV tables and print their shapes.

    Only the calendar and sell-price tables are passed through ``reduce_mem_usage``;
    the sales and submission tables are returned exactly as read.

    Args:
        data_dir: directory containing ``calendar.csv``, ``sell_prices.csv``,
            ``sales_train_validation.csv`` and ``sample_submission.csv``. Defaults to
            the location that was previously hard-coded.

    Returns:
        Tuple ``(calendar_df, sell_prices_df, train_df, submission_df)``.
    """
    print('Reading files...')

    calendar_df = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
    calendar_df = reduce_mem_usage(calendar_df)
    print('Calendar: ' + str(calendar_df.shape))

    sell_prices_df = pd.read_csv(os.path.join(data_dir, 'sell_prices.csv'))
    sell_prices_df = reduce_mem_usage(sell_prices_df)
    print('Sell prices: ' + str(sell_prices_df.shape))

    train_df = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
    print('Sales train validation: ' + str(train_df.shape))

    submission_df = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    print("Submission: " + str(submission_df.shape))

    return calendar_df, sell_prices_df, train_df, submission_df


def melt_and_merge(calendar_df, sell_prices_df, train_df, submission_df, num_train_data):
    """Reshape the sales and submission tables to long format and join calendar and prices.

    The caller's frames are not modified, so the function can be called repeatedly on
    the same inputs (the previous in-place column drops made a second call fail).

    Args:
        calendar_df: calendar table; must contain ``d``, ``wm_yr_wk`` and the columns
            ``weekday``, ``wday``, ``month``, ``year`` (those four are discarded).
        sell_prices_df: price table joined on ``store_id``, ``item_id``, ``wm_yr_wk``.
        train_df: wide sales table with the id columns ``id`` .. ``state_id`` followed
            by one ``d_<n>`` column per day.
        submission_df: submission template with an ``id`` column and 28 forecast
            columns; ids containing "validation" / "evaluation" are treated as the
            stage-1 / stage-2 forecast rows.
        num_train_data: number of most recent training days (ending at d_1913) to keep.

    Returns:
        Long-format DataFrame with one row per (id, day) for the kept training days
        and the stage-1 forecast days, a ``part`` column ('train' / 'stage1'), and the
        calendar and ``sell_price`` columns joined in. Stage-2 rows are removed.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # Last day column of the training table (d_1913).
    last_train_day = 1913

    # Keep only the most recent `num_train_data` days of training data.
    # drop() without inplace returns a new frame and leaves the caller's table intact.
    drop_columns = [f"d_{d}" for d in range(1, (last_train_day - num_train_data) + 1)]
    train_df = train_df.drop(columns=drop_columns)
    print("\ntrainは直近１年間のデータのみ使用")
    print('Sales train validation(remain only one year): ' + str(train_df.shape))

    # Extract the product information (id columns only).
    product_df = train_df.loc[:, "id":"state_id"]

    # Reshape from one column per day to one row per (id, day).
    train_df = pd.melt(train_df, id_vars=id_columns, var_name='day', value_name='demand')

    train_day = train_df["day"].unique()
    print("train_data: {0} ~ {1} -> {2}".format(train_day[0], train_day[-1], len(train_day)))

    # Separate the two forecast stages. Copy the slices so that renaming columns and
    # rewriting ids below never touches (or warns about) the caller's submission_df.
    stage1_eval_df = submission_df[submission_df["id"].str.contains("validation")].copy()
    stage2_eval_df = submission_df[submission_df["id"].str.contains("evaluation")].copy()

    # Rename the forecast columns to day labels.
    stage1_eval_df.columns = ["id"] + [f"d_{d}" for d in range(1914, 1942)]  # F1 ~ F28 => d_1914 ~ d_1941
    stage2_eval_df.columns = ["id"] + [f"d_{d}" for d in range(1942, 1970)]  # F1 ~ F28 => d_1942 ~ d_1969

    # Bring the submission rows into the same long format as the training rows.
    stage1_eval_df = stage1_eval_df.merge(product_df, how='left', on='id')
    stage1_eval_df = pd.melt(stage1_eval_df, id_vars=id_columns, var_name='day', value_name='demand')
    stage1_day = stage1_eval_df["day"].unique()
    print("[STAGE1] eval_data: {0} ~ {1} -> {2}".format(stage1_day[0], stage1_day[-1], len(stage1_day)))

    # product_df carries "_validation" ids, so the stage-2 ids are temporarily renamed
    # to make the merge match, then restored. regex=False makes this a literal
    # substring replacement regardless of the pandas version's default.
    stage2_eval_df['id'] = stage2_eval_df['id'].str.replace('_evaluation', '_validation', regex=False)
    stage2_eval_df = stage2_eval_df.merge(product_df, how='left', on='id')
    stage2_eval_df['id'] = stage2_eval_df['id'].str.replace('_validation', '_evaluation', regex=False)
    stage2_eval_df = pd.melt(stage2_eval_df, id_vars=id_columns, var_name='day', value_name='demand')
    stage2_day = stage2_eval_df["day"].unique()
    print("[STAGE2] eval_data: {0} ~ {1} -> {2}".format(stage2_day[0], stage2_day[-1], len(stage2_day)))

    train_df['part'] = 'train'
    stage1_eval_df['part'] = 'stage1'
    stage2_eval_df['part'] = 'stage2'

    data_df = pd.concat([train_df, stage1_eval_df, stage2_eval_df], axis=0)
    data_df = reduce_mem_usage(data_df)

    # Release the intermediate frames.
    del train_df, stage1_eval_df, stage2_eval_df, product_df

    # Drop some calendar features (on a new frame; the caller's calendar is kept as is).
    calendar_df = calendar_df.drop(['weekday', 'wday', 'month', 'year'], axis=1)

    # Stage-2 rows are not used for now.
    data_df = data_df[data_df['part'] != 'stage2']
    print("[CHECK] Remove the stage2 eval data")

    # NOTE (from the original author): the notebook crashed when this merge was run on
    # the entire dataset.
    data_df = pd.merge(data_df, calendar_df, how='left', left_on=['day'], right_on=['d'])
    data_df = data_df.drop('d', axis=1)

    # Attach the sell price (expected to be an important feature).
    data_df = data_df.merge(sell_prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    return data_df


# Label encoding
def encode_categorical(data_df):
    """Replace the categorical columns of ``data_df`` with integer label codes.

    Missing values in the four event columns are first replaced by the string
    'unknown'. Each categorical column is then encoded with its own ``LabelEncoder``
    fitted on the values present in ``data_df``. The frame is modified in place and
    also returned.

    Args:
        data_df: DataFrame containing the id, event-name and event-type columns
            listed below.

    Returns:
        The same ``data_df`` object with the categorical columns encoded.
    """
    nan_features = ['event_name_1', 'event_type_1',
                    'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Turn NaN into a string so the column can be label encoded. The result is
        # assigned back rather than using fillna(inplace=True) on the selected column:
        # that form acts on an intermediate object and is not guaranteed to update
        # data_df (it does not under pandas copy-on-write).
        data_df[feature] = data_df[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
           'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data_df[feature] = encoder.fit_transform(data_df[feature])

    return data_df

In [3]:
# Load the raw calendar, price, sales and submission tables.
calendar_df, sell_prices_df, train_df, submission_df = read_data()

# preprocessing
# Number of most recent training days to keep (the run below keeps d_1493 ~ d_1913).
num_train_data = 421
# Build the long-format table, then label-encode its categorical columns.
data_df = melt_and_merge(calendar_df, sell_prices_df, train_df, submission_df, num_train_data)
data_df = encode_categorical(data_df)

Reading files...
Calendar: (1969, 14)
Sell prices: (6841121, 4)
Sales train validation: (30490, 1919)
Submission: (60980, 29)

trainは直近１年間のデータのみ使用
Sales train validation(remain only one year): (30490, 427)
train_data: d_1493 ~ d_1913 -> 421
[STAGE1] eval_data: d_1914 ~ d_1941 -> 28
[STAGE2] eval_data: d_1942 ~ d_1969 -> 28
[CHECK] Remove the stage2 eval data


In [9]:
# Feature engineering
def feature_engineering(data_df):
    """Add lag, rolling-mean and calendar features, then drop rows containing NaN.

    Columns added to ``data_df`` (the frame is modified in place and also returned):
      * ``lag7`` / ``lag28``: ``demand`` shifted by 7 / 28 rows within each ``id``.
      * ``rmean_lag{L}_{W}``: W-row rolling mean of ``lag{L}`` within each ``id``,
        for L and W in (7, 28).
      * ``year``, ``quarter``, ``month``, ``week`` (ISO week), ``mday`` and ``wday``
        (Monday = 0), derived from ``date``; ``date`` itself is converted to datetime.
      * ``black_friday``: 1 on the listed Black Friday dates, else 0.

    The shifts are positional, so rows are assumed to be ordered by day within each
    id (TODO confirm against the caller).

    Args:
        data_df: long-format DataFrame with at least ``id``, ``day``, ``demand`` and
            ``date`` columns.

    Returns:
        The same ``data_df`` object with the feature columns added and every row
        that contains a NaN removed.
    """

    print("\n[START] feature engineering ->")

    # Lag features.
    # NOTE(review): on forecast rows, lag7 reads whatever ``demand`` holds for earlier
    # forecast days (presumably the submission placeholders) — confirm this is intended.
    for lag in (7, 28):
        data_df[f'lag{lag}'] = data_df.groupby(['id'])['demand'].shift(lag)

    # Rolling means of the lagged demand. `w=window` binds the loop value into the lambda.
    for lag in (7, 28):
        for window in (7, 28):
            data_df[f'rmean_lag{lag}_{window}'] = data_df.groupby(['id'])[f'lag{lag}'].transform(
                lambda x, w=window: x.rolling(w).mean())

    # TODO: price features (per-id price lags, difference/ratio to the per-id mean
    # price) and calendar flags (year/quarter/month start and end, weekend) were
    # prototyped here but are currently disabled.

    # Time features.
    data_df['date'] = pd.to_datetime(data_df['date'])
    date_parts = data_df['date'].dt
    data_df['year'] = date_parts.year.astype(np.int16)
    data_df['quarter'] = date_parts.quarter.astype(np.int8)
    data_df['month'] = date_parts.month.astype(np.int8)
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week
    # yields the same ISO week number. Fall back to .week on older pandas.
    if hasattr(date_parts, 'isocalendar'):
        iso_week = date_parts.isocalendar().week
    else:
        iso_week = date_parts.week
    data_df['week'] = iso_week.astype(np.int8)
    data_df['mday'] = date_parts.day.astype(np.int8)
    data_df['wday'] = date_parts.dayofweek.astype(np.int8)

    # Black Friday flag. The dates are parsed to datetimes before isin(): matching a
    # datetime column against plain strings is deprecated in pandas and would
    # eventually yield an all-zero column without any error.
    black_friday = pd.to_datetime(
        ["2011-11-25", "2012-11-23", "2013-11-29", "2014-11-28", "2015-11-27"])
    data_df["black_friday"] = data_df["date"].isin(black_friday) * 1

    print("[FINISH] feature engineering")

    # Remove the rows left incomplete by the lag features.
    # NOTE(review): dropna() considers every column, so rows with a NaN in any other
    # column (e.g. a missing sell_price, if present) are removed as well.
    print("lag featureによって欠損している部分を削除")
    days_before = data_df["day"].unique()
    print("Before train data:{0} ~ {1}".format(days_before[0], days_before[-1]))
    data_df.dropna(inplace = True)
    print("↓")
    days_after = data_df["day"].unique()
    print("After train data:{0} ~ {1}".format(days_after[0], days_after[-1]))
    return data_df

In [11]:
# Rebind data_df to the feature-augmented frame.
# NOTE(review): feature_engineering also removes the rows whose lag features are NaN,
# so re-running this cell on its own output trims further days — rebuild data_df first.
data_df = feature_engineering(data_df)


[START] feature engineering ->
[FINISH] feature engineering
lag featureによって欠損している部分を削除
Before train data:d_1493 ~ d_1941
↓
After train data:d_1548 ~ d_1941


In [24]:
# Summarise the column dtypes and memory footprint of the engineered frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11980608 entries, 1676950 to 13690009
Data columns (total 32 columns):
id                object
item_id           int64
dept_id           int64
cat_id            int64
store_id          int64
state_id          int64
day               object
demand            int16
part              object
date              datetime64[ns]
wm_yr_wk          int16
event_name_1      int64
event_type_1      int64
event_name_2      int64
event_type_2      int64
snap_CA           int8
snap_TX           int8
snap_WI           int8
sell_price        float16
lag7              float64
lag28             float64
rmean_lag7_7      float64
rmean_lag7_28     float64
rmean_lag28_7     float64
rmean_lag28_28    float64
year              int16
quarter           int8
month             int8
week              int8
mday              int8
wday              int8
black_friday      int64
dtypes: datetime64[ns](1), float16(1), float64(6), int16(3), int64(10), int8(8), object(3)
me

In [20]:
# List the distinct day labels that remain after feature engineering.
data_df["day"].unique()

array(['d_1548', 'd_1549', 'd_1550', 'd_1551', 'd_1552', 'd_1553',
       'd_1554', 'd_1555', 'd_1556', 'd_1557', 'd_1558', 'd_1559',
       'd_1560', 'd_1561', 'd_1562', 'd_1563', 'd_1564', 'd_1565',
       'd_1566', 'd_1567', 'd_1568', 'd_1569', 'd_1570', 'd_1571',
       'd_1572', 'd_1573', 'd_1574', 'd_1575', 'd_1576', 'd_1577',
       'd_1578', 'd_1579', 'd_1580', 'd_1581', 'd_1582', 'd_1583',
       'd_1584', 'd_1585', 'd_1586', 'd_1587', 'd_1588', 'd_1589',
       'd_1590', 'd_1591', 'd_1592', 'd_1593', 'd_1594', 'd_1595',
       'd_1596', 'd_1597', 'd_1598', 'd_1599', 'd_1600', 'd_1601',
       'd_1602', 'd_1603', 'd_1604', 'd_1605', 'd_1606', 'd_1607',
       'd_1608', 'd_1609', 'd_1610', 'd_1611', 'd_1612', 'd_1613',
       'd_1614', 'd_1615', 'd_1616', 'd_1617', 'd_1618', 'd_1619',
       'd_1620', 'd_1621', 'd_1622', 'd_1623', 'd_1624', 'd_1625',
       'd_1626', 'd_1627', 'd_1628', 'd_1629', 'd_1630', 'd_1631',
       'd_1632', 'd_1633', 'd_1634', 'd_1635', 'd_1636', 'd_16

In [12]:
# Number of distinct items that have a row on d_1913.
data_df.loc[data_df["day"] == "d_1913", "item_id"].nunique()

3049

In [15]:
# Keep the rows of day d_1913 for the inspections below.
d_1913 = data_df.loc[data_df["day"] == "d_1913"]

In [18]:
# Inspect the lag28 value of a single series on d_1913.
d_1913.loc[d_1913["id"] == "HOBBIES_1_001_CA_1_validation", "lag28"]

12805800    1.0
Name: lag28, dtype: float64

In [22]:
# Show the calendar date attached to the d_1913 rows.
data_df.loc[data_df["day"] == "d_1913", "date"]

12805800   2016-04-24
12805801   2016-04-24
12805802   2016-04-24
12805803   2016-04-24
12805804   2016-04-24
12805805   2016-04-24
12805806   2016-04-24
12805807   2016-04-24
12805808   2016-04-24
12805809   2016-04-24
12805810   2016-04-24
12805811   2016-04-24
12805812   2016-04-24
12805813   2016-04-24
12805814   2016-04-24
12805815   2016-04-24
12805816   2016-04-24
12805817   2016-04-24
12805818   2016-04-24
12805819   2016-04-24
12805820   2016-04-24
12805821   2016-04-24
12805822   2016-04-24
12805823   2016-04-24
12805824   2016-04-24
12805825   2016-04-24
12805826   2016-04-24
12805827   2016-04-24
12805828   2016-04-24
12805829   2016-04-24
              ...    
12836260   2016-04-24
12836261   2016-04-24
12836262   2016-04-24
12836263   2016-04-24
12836264   2016-04-24
12836265   2016-04-24
12836266   2016-04-24
12836267   2016-04-24
12836268   2016-04-24
12836269   2016-04-24
12836270   2016-04-24
12836271   2016-04-24
12836272   2016-04-24
12836273   2016-04-24
12836274  