In [1]:
import gc
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn import preprocessing, metrics

In [2]:
# Reduce memory usage by downcasting numeric columns
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns are reassigned on ``df`` itself, so the caller's frame is modified;
    the same object is also returned for convenience.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Integer columns go to the smallest of int8/int16/int32/int64
    whose range contains [min, max] (bounds inclusive). Float columns go to
    float16 or float32 when the range fits, otherwise float64.

    NOTE: the float downcast is chosen from the value *range* only. float16 keeps
    roughly 3 significant decimal digits, so values are rounded (e.g. a price of
    9.58 is stored as 9.578125).

    Args:
        df: DataFrame to shrink.
        verbose: When True, print the resulting memory usage and the reduction.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # An empty column yields NaN for min/max, matches nothing and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # All-NaN or infinite ranges match no candidate and fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if float(limits.min) <= c_min and c_max <= float(limits.max):
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
def read_data(input_dir='../input/m5-forecasting-accuracy'):
    """Load the four M5 competition CSV files.

    Args:
        input_dir: Directory containing ``calendar.csv``, ``sell_prices.csv``,
            ``sales_train_validation.csv`` and ``sample_submission.csv``.
            Defaults to the Kaggle input path used by this notebook.

    Returns:
        Tuple ``(calendar_df, sell_prices_df, train_df, submission_df)``.
        The calendar and price frames are passed through ``reduce_mem_usage``;
        the sales and submission frames are returned as read.
    """
    print('Reading files...')

    calendar_df = pd.read_csv(os.path.join(input_dir, 'calendar.csv'))
    calendar_df = reduce_mem_usage(calendar_df)
    print('Calendar: ' + str(calendar_df.shape))

    sell_prices_df = pd.read_csv(os.path.join(input_dir, 'sell_prices.csv'))
    sell_prices_df = reduce_mem_usage(sell_prices_df)
    print('Sell prices: ' + str(sell_prices_df.shape))

    train_df = pd.read_csv(os.path.join(input_dir, 'sales_train_validation.csv'))
    print('Sales train validation: ' + str(train_df.shape))

    submission_df = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
    print("Submission: " + str(submission_df.shape))

    return calendar_df, sell_prices_df, train_df, submission_df

In [5]:
# Load the calendar, price, sales and sample-submission tables.
calendar_df, sell_prices_df, train_df, submission_df = read_data()

Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar: (1969, 14)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices: (6841121, 4)
Sales train validation: (30490, 1919)
Submission: (60980, 29)


In [7]:
# Preview the first two rows of the calendar table.
calendar_df.head(2)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0


In [8]:
# Preview the first two rows of the sell-price table.
sell_prices_df.head(2)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.578125
1,CA_1,HOBBIES_1_001,11326,9.578125


In [9]:
# Preview the first two rows of the wide sales table (one d_<n> column per day).
train_df.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
# Preview the first two rows of the sample submission (columns F1..F28).
submission_df.head(2)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def melt_and_merge(calendar_df, sell_prices_df, train_df, submission_df):
    """Build the long-format modelling table from the raw M5 frames.

    Steps:
      1. Keep only the last 365 day columns (d_1549..d_1913) of ``train_df``.
      2. Melt the sales and the submission frames to one row per (id, day).
      3. Concatenate train / stage1 / stage2 parts, downcast, then drop stage2.
      4. Left-join calendar columns (on day) and sell prices
         (on store_id, item_id, wm_yr_wk).

    The input frames are not modified: columns are dropped on copies, so the
    cell calling this function can be re-run without reloading the data.

    Args:
        calendar_df: Calendar table with a ``d`` column and the
            ``weekday``/``wday``/``month``/``year`` columns that get dropped.
        sell_prices_df: Prices keyed by store_id, item_id, wm_yr_wk.
        train_df: Wide sales table with id columns ``id``..``state_id``
            followed by ``d_1``..``d_1913``.
        submission_df: Sample submission with ``id`` plus 28 forecast columns.

    Returns:
        Long DataFrame with train and stage1 rows, labelled by the ``part`` column.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Use only the most recent year of training data.
    day_per_year = 365
    train_end_day = 1913
    drop_columns = [f"d_{d}" for d in range(1, (train_end_day - day_per_year) + 1)]
    # Non-inplace drop: an inplace drop would strip the columns from the
    # caller's frame and make a second call fail with KeyError.
    train_df = train_df.drop(columns=drop_columns)
    print("\ntrainは直近１年間のデータのみ使用")
    print('Sales train validation(remain only one year): ' + str(train_df.shape))

    # Extract the product attributes (id .. state_id).
    product_df = train_df.loc[:, "id":"state_id"]

    # Reshape from one column per day to one row per (id, day).
    train_df = pd.melt(train_df, id_vars=id_columns, var_name='day', value_name='demand')

    train_day = train_df["day"].unique()
    print("train_data: {0} ~ {1} -> {2}".format(train_day[0], train_day[-1], len(train_day)))

    # Separate the two submission stages; copy so that the column renames and
    # id rewrites below never write into a slice of submission_df.
    stage1_eval_df = submission_df[submission_df["id"].str.contains("validation")].copy()
    stage2_eval_df = submission_df[submission_df["id"].str.contains("evaluation")].copy()

    # Rename forecast columns to day labels.
    stage1_eval_df.columns = ["id"] + [f"d_{d}" for d in range(1914, 1942)]  # F1 ~ F28 => d_1914 ~ d_1941
    stage2_eval_df.columns = ["id"] + [f"d_{d}" for d in range(1942, 1970)]  # F1 ~ F28 => d_1942 ~ d_1969

    # Bring the submission frames into the same long format as the sales data.
    stage1_eval_df = stage1_eval_df.merge(product_df, how='left', on='id')
    stage1_eval_df = pd.melt(stage1_eval_df, id_vars=id_columns, var_name='day', value_name='demand')
    stage1_day = stage1_eval_df["day"].unique()
    print("[STAGE1] eval_data: {0} ~ {1} -> {2}".format(stage1_day[0], stage1_day[-1], len(stage1_day)))

    # product_df ids end in "_validation", so temporarily rename the
    # "_evaluation" ids to make the merge keys match, then restore them.
    stage2_eval_df['id'] = stage2_eval_df['id'].str.replace('_evaluation', '_validation', regex=False)
    stage2_eval_df = stage2_eval_df.merge(product_df, how='left', on='id')
    stage2_eval_df['id'] = stage2_eval_df['id'].str.replace('_validation', '_evaluation', regex=False)
    stage2_eval_df = pd.melt(stage2_eval_df, id_vars=id_columns, var_name='day', value_name='demand')
    stage2_day = stage2_eval_df["day"].unique()
    print("[STAGE2] eval_data: {0} ~ {1} -> {2}".format(stage2_day[0], stage2_day[-1], len(stage2_day)))

    train_df['part'] = 'train'
    stage1_eval_df['part'] = 'stage1'
    stage2_eval_df['part'] = 'stage2'

    data_df = pd.concat([train_df, stage1_eval_df, stage2_eval_df], axis=0)
    data_df = reduce_mem_usage(data_df)

    # Release the intermediate frames.
    del train_df, stage1_eval_df, stage2_eval_df, product_df

    # Drop calendar features that are not used (on a copy, see note above).
    calendar_df = calendar_df.drop(columns=['weekday', 'wday', 'month', 'year'])

    # Stage2 rows are not used for now.
    data_df = data_df[data_df['part'] != 'stage2']
    print("[CHECK] Remove the stage2 eval data")

    # Attach calendar information for each day.
    data_df = pd.merge(data_df, calendar_df, how='left', left_on=['day'], right_on=['d'])
    data_df = data_df.drop(columns='d')

    # Attach the weekly sell price of each item in each store.
    data_df = data_df.merge(sell_prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    return data_df

In [12]:
# Build the long-format table (one row per id and day) from the loaded frames.
data_df = melt_and_merge(calendar_df, sell_prices_df, train_df, submission_df)


trainは直近１年間のデータのみ使用
Sales train validation(remain only one year): (30490, 371)
train_data: d_1549 ~ d_1913 -> 365
[STAGE1] eval_data: d_1914 ~ d_1941 -> 28
[STAGE2] eval_data: d_1942 ~ d_1969 -> 28
Mem. usage decreased to 905.88 Mb (7.5% reduction)
[CHECK] Remove the stage2 eval data


In [13]:
# Inspect the merged table before encoding.
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train,2015-04-26,11513,,,,,0,0,0,8.257812
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train,2015-04-26,11513,,,,,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train,2015-04-26,11513,,,,,0,0,0,2.970703
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,3,train,2015-04-26,11513,,,,,0,0,0,4.640625
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train,2015-04-26,11513,,,,,0,0,0,2.880859


In [14]:
# label encoding
def encode_categorical(data_df):
    """Label-encode the categorical columns of ``data_df``.

    Missing values in the four event columns are first replaced by the string
    'unknown' so that they receive their own label. Each categorical column is
    then replaced by integer codes from a freshly fitted ``LabelEncoder``.
    The encoders are not kept, so the mapping cannot be inverted afterwards.

    Args:
        data_df: Long-format table containing the columns listed in ``cat``.

    Returns:
        The same DataFrame object with the encoded columns.
    """
    nan_features = ['event_name_1', 'event_type_1',
                    'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back instead of `data_df[feature].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary and does not update the
        # frame under pandas Copy-on-Write.
        data_df[feature] = data_df[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
           'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data_df[feature] = encoder.fit_transform(data_df[feature])

    return data_df

In [15]:
# Replace the categorical columns with integer label codes.
data_df = encode_categorical(data_df)

In [16]:
# Inspect the table after label encoding.
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1549,0,train,2015-04-26,11513,30,4,0,0,0,0,0,8.257812
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,d_1549,0,train,2015-04-26,11513,30,4,0,0,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,d_1549,0,train,2015-04-26,11513,30,4,0,0,0,0,0,2.970703
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,d_1549,3,train,2015-04-26,11513,30,4,0,0,0,0,0,4.640625
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,d_1549,0,train,2015-04-26,11513,30,4,0,0,0,0,0,2.880859


In [19]:
# Feature engineering
def feature_engineering(data_df):
    """Add lag, rolling-mean, price and calendar features.

    Features added (all lags are computed per ``id``, in the current row order):
      * ``lag7`` / ``lag28``: demand shifted by 7 / 28 rows.
      * ``rmean_lag{L}_{W}``: W-row rolling mean of the ``lag{L}`` column after
        shifting it by L rows again.
      * ``sell_price_lag1/7/28``: sell price shifted by 1 / 7 / 28 rows.
      * ``mean_sell_price``, ``diff_sell_price``, ``div_sell_price``: per-id mean
        price and the difference / ratio of the current price to it.
      * ``year``, ``quarter``, ``month``, ``week`` (ISO), ``mday``, ``wday``
        (Monday=0) and a ``black_friday`` 0/1 flag.

    The lag columns are written onto the passed frame. The returned frame is the
    result of a merge, so it is a new object and its row order may differ from
    the input.

    Args:
        data_df: Long-format table with at least ``id``, ``demand``,
            ``sell_price`` and ``date`` columns.

    Returns:
        DataFrame with the feature columns appended.
    """

    print("\n[START] feature engineering ->")

    # Demand lags. groupby.shift is the vectorised equivalent of
    # transform(lambda x: x.shift(n)).
    data_df['lag7'] = data_df.groupby(['id'])['demand'].shift(7)
    data_df['lag28'] = data_df.groupby(['id'])['demand'].shift(28)

    # NOTE(review): lag7/lag28 are already shifted, so the extra shift below makes
    # these rolling means start 14 / 56 rows back. Kept as is to preserve the
    # existing feature definitions — confirm whether the double shift is intended.
    data_df['rmean_lag7_7'] = data_df.groupby(['id'])['lag7'].transform(lambda x: x.shift(7).rolling(7).mean())
    data_df['rmean_lag7_28'] = data_df.groupby(['id'])['lag7'].transform(lambda x: x.shift(7).rolling(28).mean())
    data_df['rmean_lag28_7'] = data_df.groupby(['id'])['lag28'].transform(lambda x: x.shift(28).rolling(7).mean())
    data_df['rmean_lag28_28'] = data_df.groupby(['id'])['lag28'].transform(lambda x: x.shift(28).rolling(28).mean())

    # Price features.
    data_df['sell_price_lag1'] = data_df.groupby(['id'])['sell_price'].shift(1)
    data_df['sell_price_lag7'] = data_df.groupby(['id'])['sell_price'].shift(7)
    data_df['sell_price_lag28'] = data_df.groupby(['id'])['sell_price'].shift(28)
    # Aggregate only the price column: a frame-wide groupby().mean() raises on the
    # string columns in pandas >= 2.0 and averages every other column for nothing.
    mean_sell_price = data_df.groupby('id')['sell_price'].mean().rename("mean_sell_price")
    data_df = data_df.merge(mean_sell_price, on="id")
    data_df["diff_sell_price"] = data_df["sell_price"] - data_df["mean_sell_price"]
    data_df["div_sell_price"] = data_df["sell_price"] / data_df["mean_sell_price"]

    # Time features.
    data_df['date'] = pd.to_datetime(data_df['date'])
    date_parts = data_df['date'].dt
    data_df['year'] = date_parts.year.astype(np.int16)
    data_df['quarter'] = date_parts.quarter.astype(np.int8)
    data_df['month'] = date_parts.month.astype(np.int8)
    # Series.dt.week was removed in pandas 2.0; isocalendar() exists from 1.1 on.
    # Fall back to .week on older installations.
    if hasattr(date_parts, 'isocalendar'):
        iso_week = date_parts.isocalendar().week
    else:
        iso_week = date_parts.week
    data_df['week'] = iso_week.astype(np.int8)
    data_df['mday'] = date_parts.day.astype(np.int8)
    data_df['wday'] = date_parts.dayofweek.astype(np.int8)

    # Black Friday flag. Convert the strings to datetimes explicitly: matching a
    # datetime column against raw strings with isin is deprecated in recent pandas.
    black_friday = pd.to_datetime(["2011-11-25", "2012-11-23", "2013-11-29", "2014-11-28", "2015-11-27"])
    data_df["black_friday"] = data_df["date"].isin(black_friday) * 1

    print("[FINISH] feature engineering")

    return data_df

In [20]:
# Append the lag, price and calendar features to the table.
data_df = feature_engineering(data_df)


[START] feature engineering ->
[FINISH] feature engineering


In [21]:
# Inspect the first rows after feature engineering.
data_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,...,mean_sell_price,diff_sell_price,div_sell_price,year,quarter,month,week,mday,wday,black_friday
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1549,0,train,2015-04-26,...,8.273438,-0.015625,0.998047,2015,2,4,17,26,6,0
1,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1550,1,train,2015-04-27,...,8.273438,-0.015625,0.998047,2015,2,4,18,27,0,0
2,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1551,0,train,2015-04-28,...,8.273438,-0.015625,0.998047,2015,2,4,18,28,1,0
3,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1552,0,train,2015-04-29,...,8.273438,-0.015625,0.998047,2015,2,4,18,29,2,0
4,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,d_1553,0,train,2015-04-30,...,8.273438,-0.015625,0.998047,2015,2,4,18,30,3,0


In [31]:
# Inspect the last rows after feature engineering.
data_df.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,...,mean_sell_price,diff_sell_price,div_sell_price,year,quarter,month,week,mday,wday,black_friday
11982565,FOODS_3_827_WI_3_validation,1436,2,0,9,2,d_1937,0,stage1,2016-05-18,...,1.0,0.0,1.0,2016,2,5,20,18,2,0
11982566,FOODS_3_827_WI_3_validation,1436,2,0,9,2,d_1938,0,stage1,2016-05-19,...,1.0,0.0,1.0,2016,2,5,20,19,3,0
11982567,FOODS_3_827_WI_3_validation,1436,2,0,9,2,d_1939,0,stage1,2016-05-20,...,1.0,0.0,1.0,2016,2,5,20,20,4,0
11982568,FOODS_3_827_WI_3_validation,1436,2,0,9,2,d_1940,0,stage1,2016-05-21,...,1.0,0.0,1.0,2016,2,5,20,21,5,0
11982569,FOODS_3_827_WI_3_validation,1436,2,0,9,2,d_1941,0,stage1,2016-05-22,...,1.0,0.0,1.0,2016,2,5,20,22,6,0


In [None]:
# Forecast 28 days once per multiplier in `alphas` and blend the runs with equal weights.
# NOTE(review): create_dt, create_fea, fday, max_lags, train_cols, m_lgb and timedelta are
# not defined or imported in the visible cells of this notebook — confirm they exist
# before this cell is run.
alphas = [1.035, 1.03, 1.025, 1.02]
weights = [1/len(alphas)]*len(alphas)  # one equal weight per alpha
sub = 0.  # placeholder; replaced by the first te_sub inside the loop

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    # Fresh frame for every alpha; presumably the forecast grid — TODO confirm against create_dt.
    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]  # submission columns F1..F28

    # Predict one day at a time, writing each day's prediction back into te["sales"].
    for tdelta in range(0, 28):
#     for tdelta in range(0, 2):
        day = fday + timedelta(days=tdelta)
        print(icount, day)
        # Window of the last max_lags days up to `day`; copied because create_fea is
        # called for its effect on tst (its return value is not used).
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        # Keep only the rows of the day being predicted and the model's input columns.
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    # Reshape the forecast rows (date >= fday) to one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    # F<k> is the k-th row of each id in the current row order;
    # assumes rows are in date order within each id — TODO confirm.
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
#     te_sub.to_csv("submission.csv",index=False)
    # Accumulate the weighted sum; rows line up because every te_sub is sorted by id
    # and re-indexed above.
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# Duplicate the blended forecasts under "evaluation" ids and write the submission file.
sub2 = sub.copy()
# "validation$" is a regular expression anchored at the end of the id. pandas >= 2.0
# treats the pattern as a literal string unless regex=True is passed, in which case
# nothing would be replaced and the file would contain every validation id twice.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)