In [1]:
"""
This is an upgraded version of Ceshine's LGBM starter script.
"""

"\nThis is an upgraded version of Ceshine's LGBM starter script.\n"

In [1]:
from datetime import date, timedelta
import calendar as ca
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
print('Loading Data')
df_train = pd.read_csv('train.csv',dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
    parse_dates=["date"], skiprows=range(1, 101688780) )
# read data starting from 2017-1-1

Loading Data


In [9]:
item_nbr_u = df_train.item_nbr.unique()

In [11]:
df_test = pd.read_csv("test.csv", dtype={'onpromotion': bool}, parse_dates=["date"]).set_index(['store_nbr', 'item_nbr', 'date'])

In [12]:
date_index = pd.date_range(df_train['date'].min(), df_train['date'].max())

In [13]:
items = pd.read_csv("items.csv").set_index("item_nbr")

In [20]:
promo_2017_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack().fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [22]:
promo_2017_test = df_test[["onpromotion"]].unstack().fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [25]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [28]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [29]:
df_2017 = df_train.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack().fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

In [40]:
items = items.reindex(df_2017.index.get_level_values(1))

In [41]:
items

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0
105575,GROCERY I,1045,0
105577,GROCERY I,1045,0
105693,GROCERY I,1034,0
105737,GROCERY I,1044,0
105857,GROCERY I,1092,0


In [42]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [67]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })

    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[t2017 + timedelta(days=i)].values.astype(np.uint8)
        
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values 
        
    for i in range(1,15,1):
        X['day_{}_2017'.format(i)] = get_timespan(df_2017, t2017, i, 1).values.ravel()
        
    for i in range(2,15,1):
        X['mean_{}_2017'.format(i)] = get_timespan(df_2017, t2017, i, i).mean(axis=1).values
        
    X['mean_30_2017'.format(i)] = get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values
    X['mean_60_2017'.format(i)] = get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values
    X['mean_140_2017'.format(i)] = get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values

    if is_train:
        y = df_2017[pd.date_range(t2017, periods=16)].values
        return X, y
        
    return X

In [68]:
t2017 = date(2017, 7, 5)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [69]:
params = {
    'num_leaves': 31,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.07,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.85,
    'bagging_freq': 3,
    'metric': 'l2_root',
    'num_threads': 4
}

MAX_ROUNDS = 500
val_pred = []
test_pred = []
cate_vars = []

In [70]:
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=100
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.542099	valid_1's rmse: 0.538469
[200]	training's rmse: 0.53768	valid_1's rmse: 0.533979
[300]	training's rmse: 0.53487	valid_1's rmse: 0.53114
[400]	training's rmse: 0.532429	valid_1's rmse: 0.528701
[500]	training's rmse: 0.530294	valid_1's rmse: 0.526548
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.530294	valid_1's rmse: 0.526548
mean_7_2017: 1277555.85
mean_14_2017: 944640.51
mean_9_2017: 609652.96
mean_8_2017: 174058.60
promo_0: 101190.66
mean_30_2017: 74737.98
mean_5_2017: 66786.85
mean_20_dow0_2017: 63927.49
mean_2_2017: 59243.93
mean_4_dow0_2017: 56033.90
day_1_2017: 51050.91
promo_14_2017: 32334.22
mean_60_2017: 27522.52
mean_12_2017: 18380.96
mean_3_2017: 12211.07
promo_7: 10649.19
day_14_2017: 7190.57
mean_4_dow2_2017: 7090.15
mean_11_2017: 7051.04
mean_140_2017: 6877.38
promo_60_2017: 6501.07
mean_20_dow2_2017: 5942.75
day_7_2017: 5641.03
mean_4_2017: 4852.02
pr

Step 6
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.590264	valid_1's rmse: 0.594961
[200]	training's rmse: 0.586086	valid_1's rmse: 0.590387
[300]	training's rmse: 0.583041	valid_1's rmse: 0.587266
[400]	training's rmse: 0.58038	valid_1's rmse: 0.584604
[500]	training's rmse: 0.577904	valid_1's rmse: 0.582154
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.577904	valid_1's rmse: 0.582154
mean_10_2017: 1180804.63
mean_9_2017: 562631.97
mean_11_2017: 557299.43
mean_14_2017: 324008.82
mean_30_2017: 263752.53
mean_3_2017: 156351.11
promo_5: 79393.85
mean_4_dow5_2017: 73380.68
mean_60_2017: 60880.64
mean_20_dow5_2017: 59637.38
mean_4_2017: 40802.84
mean_7_2017: 31287.64
promo_14_2017: 23314.22
mean_140_2017: 18968.77
mean_2_2017: 13533.98
promo_3: 8176.93
promo_6: 6405.17
promo_7: 6067.47
mean_20_dow6_2017: 6061.55
mean_4_dow6_2017: 5782.87
mean_6_2017: 5207.91
promo_60_2017: 5169.40
mean_20_dow0_2017: 4815.76
mean_13_2017: 

Step 11
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.607237	valid_1's rmse: 0.608805
[200]	training's rmse: 0.602262	valid_1's rmse: 0.60417
[300]	training's rmse: 0.598614	valid_1's rmse: 0.600729
[400]	training's rmse: 0.595723	valid_1's rmse: 0.597961
[500]	training's rmse: 0.593064	valid_1's rmse: 0.595538
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.593064	valid_1's rmse: 0.595538
mean_30_2017: 1334902.29
mean_12_2017: 671737.93
mean_4_dow3_2017: 288205.64
mean_14_2017: 224585.27
mean_6_2017: 201164.09
mean_60_2017: 188656.14
mean_13_2017: 178377.91
mean_5_2017: 170934.71
mean_20_dow3_2017: 145688.82
promo_10: 110084.40
mean_7_2017: 82465.00
mean_4_2017: 34898.76
mean_140_2017: 33993.19
promo_14_2017: 20365.82
mean_4_dow4_2017: 14122.26
promo_12: 10408.05
day_3_2017: 9105.41
promo_60_2017: 8239.82
mean_20_dow2_2017: 7478.03
promo_140_2017: 7371.20
promo_14: 7309.73
promo_7: 6936.01
mean_11_2017: 6498.98
promo_11

Step 16
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.605599	valid_1's rmse: 0.604925
[200]	training's rmse: 0.601116	valid_1's rmse: 0.60051
[300]	training's rmse: 0.597897	valid_1's rmse: 0.597291
[400]	training's rmse: 0.59523	valid_1's rmse: 0.594743
[500]	training's rmse: 0.59259	valid_1's rmse: 0.592218
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 0.59259	valid_1's rmse: 0.592218
mean_30_2017: 1396877.86
mean_14_2017: 695875.92
mean_60_2017: 147069.77
promo_15: 124656.48
mean_7_2017: 122048.35
mean_20_dow1_2017: 99177.65
mean_13_2017: 46101.34
mean_140_2017: 42735.11
mean_8_2017: 42132.51
mean_5_2017: 33397.42
mean_20_dow2_2017: 25782.07
mean_6_2017: 25268.82
promo_14_2017: 17653.90
mean_4_dow1_2017: 13719.00
promo_14: 12477.34
promo_60_2017: 10548.64
mean_4_dow2_2017: 9466.53
promo_140_2017: 8351.19
day_13_2017: 7431.36
day_1_2017: 7017.98
mean_2_2017: 5977.92
mean_3_2017: 5121.45
mean_20_dow4_2017: 4288.36
mean_

In [71]:
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose())**0.5)

Validation mse: 0.583640655433


In [72]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=pd.date_range("2017-08-16", periods=16)
                       ).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [73]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 10000)
submission.shape

(3370464, 2)

In [74]:
submission.to_csv('try2.csv', float_format='%.4f', index=None)