In [0]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

> This notebook aims to push the public LB under 0.50. Certainly, the competition is not yet at its peak and there clearly remains room for improvement.

# Credits

* [First R notebook](https://www.kaggle.com/kailex/m5-forecaster-v2)
* [Python translation](https://www.kaggle.com/kneroma/m5-forecast-v2-python)

# Changes
* v5 : try to optimise the LGBM params (go below in lgbm params section to see changes)
* v4 : add df, X_train deletion before training step --> increasing train sample without memeroy issues

In [0]:
Kaggle = False
Colab = True

In [0]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

In [0]:
import os, sys
from pathlib import Path

if Colab:
    from google.colab import drive
    drive.mount('/content/drive')

    path = "/content/drive/My Drive"

    os.chdir(path)
    os.listdir(path)

In [0]:
if Kaggle:
    PATH = '/kaggle/input/'
    outdir = '.'
# PATH = '/Users/helen/Desktop/Data/'
else:
    PATH = 'm5_competition/'
    outdir = Path(PATH+'res')
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    # outdir = Path(PATH+'res/wavenet-dlr-res')
    # if not os.path.exists(outdir):
    #     os.mkdir(outdir)

In [0]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "int16", 'snap_TX': 'int16', 'snap_WI': 'int16' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [0]:
pd.options.display.max_columns = 50

In [0]:
h = 28 
max_lags = 57
tr_last = 1913
fday = datetime(2016,4, 25) 
fday

In [0]:

def reduce_mem_usage(df: pd.DataFrame,
                     verbose: bool = True) -> pd.DataFrame:
    
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if (c_min > np.iinfo(np.int8).min
                      and c_max < np.iinfo(np.int8).max):
                    df[col] = df[col].astype(np.int8)
                elif (c_min > np.iinfo(np.int16).min
                      and c_max < np.iinfo(np.int16).max):
                    df[col] = df[col].astype(np.int16)
                elif (c_min > np.iinfo(np.int32).min
                      and c_max < np.iinfo(np.int32).max):
                    df[col] = df[col].astype(np.int32)
                elif (c_min > np.iinfo(np.int64).min
                      and c_max < np.iinfo(np.int64).max):
                    df[col] = df[col].astype(np.int64)
            else:
                if (c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max):
                    df[col] = df[col].astype(np.float16)
                elif (c_min > np.finfo(np.float32).min
                      and c_max < np.finfo(np.float32).max):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    reduction = (start_mem - end_mem) / start_mem

    msg = f'Mem. usage decreased to {end_mem:5.2f} MB ({reduction * 100:.1f} % reduction)'
    if verbose:
        print(msg)

    return df


In [0]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    prices = pd.read_csv(PATH+ "m5-forecasting-accuracy/sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv(PATH+"m5-forecasting-accuracy/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(PATH+"m5-forecasting-accuracy/sales_train_validation.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt

In [0]:
def create_fea(dt):
    lags = [7, 28, 91, 182]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28, 91, 182]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }
    
#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [0]:
FIRST_DAY = 350 # If you want to load all the data set it to '1' -->  Great  memory overflow  risk !

In [0]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df = reduce_mem_usage(df)
df.shape

In [0]:
df.head()

In [0]:
df.info()

In [0]:
%%time

create_fea(df)
df.shape
df = reduce_mem_usage(df)

In [0]:
df.info()

In [0]:
df.head()

In [0]:
df.dropna(inplace = True)
df.shape

In [0]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[train_cols]
y_train = df["sales"]

In [0]:
# train_data = lgb.Dataset(X_train, label = y_train, categorical_feature=cat_feats, free_raw_data=False)
# fake_valid_inds = np.random.choice(len(X_train), 1000000, replace = False)
# fake_valid_data = lgb.Dataset(X_train.iloc[fake_valid_inds], label = y_train.iloc[fake_valid_inds],categorical_feature=cat_feats,
#                              free_raw_data=False)   # This is just a subsample of the training set, not a real validation set !

In [0]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

In [0]:
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

In [0]:
params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [25]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 2.86049
[40]	valid_0's rmse: 2.54933
[60]	valid_0's rmse: 2.46711
[80]	valid_0's rmse: 2.44433
[100]	valid_0's rmse: 2.43292
[120]	valid_0's rmse: 2.42233
[140]	valid_0's rmse: 2.41541
[160]	valid_0's rmse: 2.4071
[180]	valid_0's rmse: 2.39727
[200]	valid_0's rmse: 2.39029
[220]	valid_0's rmse: 2.38436
[240]	valid_0's rmse: 2.37583
[260]	valid_0's rmse: 2.37075
[280]	valid_0's rmse: 2.36538
[300]	valid_0's rmse: 2.36187
[320]	valid_0's rmse: 2.3593
[340]	valid_0's rmse: 2.35543
[360]	valid_0's rmse: 2.35206
[380]	valid_0's rmse: 2.34973
[400]	valid_0's rmse: 2.34645
[420]	valid_0's rmse: 2.34387
[440]	valid_0's rmse: 2.34201
[460]	valid_0's rmse: 2.33843
[480]	valid_0's rmse: 2.3367
[500]	valid_0's rmse: 2.33359
[520]	valid_0's rmse: 2.33105
[540]	valid_0's rmse: 2.32827
[560]	valid_0's rmse: 2.32621
[580]	valid_0's rmse: 2.32409
[600]	valid_0's rmse: 2.32119
[620]	valid_0's rmse: 2.31897
[640]	valid_0's rmse: 2.3165
[660]	valid_0's rmse: 2.31495
[680]	valid_0's rm

In [26]:
m_lgb.save_model(os.path.join(outdir,"model-fd{}.lgb".format(FIRST_DAY)))

<lightgbm.basic.Booster at 0x7f7d541ae080>

In [28]:
%%time

alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv(os.path.join(outdir,"submission.csv"),index=False)

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.028 0.3333333333333333
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2

In [30]:
sub.head(10)

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.098673,1.02282,0.997819,0.875534,1.234516,1.459061,1.459077,1.292348,1.351955,1.137295,1.299196,1.349206,1.839424,1.437884,1.39645,1.186929,1.256992,1.223022,1.333963,1.861751,1.759072,1.333841,1.234098,1.235177,1.153949,1.242508,1.69075,1.758571
1,FOODS_1_001_CA_2_validation,1.040031,1.308017,0.98389,1.33574,1.336309,1.735975,2.092439,1.297207,1.336175,1.151002,1.262915,1.419095,2.189462,1.545336,1.393283,1.299332,1.398599,1.385259,1.491573,2.216568,2.286248,1.357851,1.24978,1.262961,1.203617,1.34783,1.928699,1.990839
2,FOODS_1_001_CA_3_validation,1.554818,1.49619,1.473606,1.454691,1.581425,1.80598,1.793064,1.620869,1.780457,1.641296,1.875835,2.159771,2.492027,1.883502,1.915424,1.719704,1.87342,1.908869,2.091048,2.880561,3.035407,1.991676,1.900953,1.985316,1.806536,1.909291,2.70765,2.562894
3,FOODS_1_001_CA_4_validation,0.554983,0.417553,0.421589,0.420868,0.535021,0.612739,0.718413,0.672702,0.731614,0.624331,0.669541,0.608409,0.665055,0.545598,0.580606,0.548967,0.604833,0.634653,0.629666,0.737405,0.691096,0.523849,0.494171,0.506868,0.487351,0.509175,0.670414,0.683448
4,FOODS_1_001_TX_1_validation,0.316489,0.30551,0.299616,0.299709,0.307175,0.338282,0.370671,0.679834,0.771715,0.672857,0.778826,0.748299,0.813342,0.644049,0.75472,0.756535,0.767674,0.762346,0.779191,0.922532,0.904106,0.655448,0.613595,0.596276,0.55108,0.590111,0.725678,0.685868
5,FOODS_1_001_TX_2_validation,0.548314,0.530293,0.517855,0.471678,0.534096,0.726891,0.683201,0.703463,0.779146,0.624083,0.816554,0.789702,0.896251,0.661808,0.678579,0.625236,0.720204,0.653901,0.670175,0.842221,0.788409,0.609723,0.614559,0.61211,0.572989,0.647049,0.793125,0.766678
6,FOODS_1_001_TX_3_validation,0.473368,0.405845,0.395354,0.47256,0.485227,0.542841,0.627963,0.644553,0.704863,0.591691,0.664486,0.678166,0.751235,0.563344,0.600821,0.567761,0.598909,0.607372,0.66733,0.746915,0.710848,0.576511,0.543512,0.55384,0.512856,0.554489,0.665139,0.631681
7,FOODS_1_001_WI_1_validation,0.384921,0.474208,0.412361,0.41686,0.497414,0.781901,0.891231,0.788247,0.808481,0.721102,0.911687,0.865463,1.085459,0.813322,0.757734,0.719601,0.789476,0.805179,0.920705,1.316374,1.230173,0.83636,0.789447,0.818003,0.791189,0.943223,1.507604,1.167656
8,FOODS_1_001_WI_2_validation,0.429086,0.469116,0.457969,0.4192,0.496798,0.563328,0.534624,0.623794,0.715672,0.534169,0.61539,0.664011,0.72863,0.581457,0.612398,0.56186,0.63445,0.602782,0.582271,0.731807,0.700585,0.526748,0.521034,0.551505,0.524909,0.532879,0.629011,0.57923
9,FOODS_1_001_WI_3_validation,0.358054,0.35812,0.370933,0.366427,0.425022,0.686437,0.531013,0.61984,0.734767,0.590584,0.699504,0.767726,0.723573,0.586869,0.564464,0.521326,0.585736,0.5886,0.617101,0.797293,0.732423,0.52005,0.501341,0.499891,0.485642,0.542018,0.681906,0.591939


In [32]:
sub.id.nunique(), sub["id"].str.contains("validation$").sum()

(60980, 30490)

In [34]:
sub.shape

(60980, 29)