In [1]:
from datetime import timedelta

import numpy as np
import pandas as pd


In [2]:
# Column dtypes passed to pd.read_csv when loading the train and test files.
dtypes = dict(
    id='uint32',
    item_nbr='int32',
    store_nbr='int8',
    unit_sales='float32',
)

In [3]:
# Load columns 1-4 of the training file. The header row (row 0) is kept and
# data rows 1..86672216 are skipped.
train = pd.read_csv(
    '../train.csv/train.csv',
    usecols=[1, 2, 3, 4],
    dtype=dtypes,
    parse_dates=['date'],
    skiprows=range(1, 86672217),  # Skip dates before 2016-08-01
)


In [5]:
# Clamp negative unit_sales values to zero.
train.loc[train['unit_sales'].lt(0), 'unit_sales'] = 0

In [6]:
# Move unit_sales into log1p space. numpy is called directly on the whole
# column because the `pd.np` alias no longer exists in pandas 2.0+.
train['unit_sales'] = np.log1p(train['unit_sales'])

In [7]:
# Day of week of each row's date, used as a grouping key below.
train['dow'] = train.date.dt.dayofweek

In [9]:
# Mean of the (log1p-transformed) unit_sales per item, store and day of week,
# returned as a flat frame with the value in column 'madw'.
ma_dw = (
    train[['item_nbr', 'store_nbr', 'dow', 'unit_sales']]
    .groupby(['item_nbr', 'store_nbr', 'dow'])['unit_sales']
    .mean()
    .to_frame('madw')
    .reset_index()
)

In [11]:
# Average the per-day-of-week means ('madw') over each store/item pair; the
# result is stored in column 'mawk'.
ma_wk = (
    ma_dw[['item_nbr', 'store_nbr', 'madw']]
    .groupby(['store_nbr', 'item_nbr'])['madw']
    .mean()
    .to_frame('mawk')
    .reset_index()
)

In [14]:
# 'dow' was only needed for the group-by above. `axis` is passed by keyword
# because the positional form drop(labels, 1) is rejected by pandas 2.0+.
train.drop('dow', axis=1, inplace=True)

In [16]:
# Distinct dates present in the training rows.
u_dates = train['date'].unique()

In [19]:
# Distinct item numbers present in the training rows.
u_items = train['item_nbr'].unique()

In [22]:
# Display how many distinct item numbers were found.
u_items.shape

(4036L,)

In [23]:
# Distinct store numbers present in the training rows.
u_stores = train['store_nbr'].unique()

In [25]:
# Index by (date, store, item) so the frame can be reindexed on the full grid.
train.set_index(
    keys=['date', 'store_nbr', 'item_nbr'],
    inplace=True,
)

In [27]:
# Expand to every (date, store, item) combination. Combinations that were not
# in the data get NaN unit_sales. The index levels become columns again.
train = train.reindex(
    index=pd.MultiIndex.from_product(
        [u_dates, u_stores, u_items],
        names=['date', 'store_nbr', 'item_nbr'],
    )
).reset_index()

In [32]:
# The unique-value arrays were only needed to build the full grid.
del u_dates, u_stores, u_items

# Fill NaNs with 0. The result is assigned back instead of calling
# fillna(inplace=True) on a .loc selection, which may operate on a temporary
# object and leave `train` unchanged.
train['unit_sales'] = train['unit_sales'].fillna(0)
# Date stored in the last row of the frame.
lastdate = train['date'].iloc[-1]

In [37]:
# NOTE(review): tmp, tmpg and ma_is are not created in any cell shown here.
# Confirm the cells that define them are run before this one.
del tmp,tmpg,train

# Row-wise median over the existing ma_is columns.
ma_is['mais'] = ma_is.median(axis=1)
ma_is.reset_index(inplace=True)
# Keep only the first three columns. `axis` is passed by keyword because the
# positional form drop(labels, 1) is rejected by pandas 2.0+.
# NOTE(review): 'mais' is appended last above. Confirm it is still among the
# first three columns after reset_index, otherwise it is removed here.
ma_is.drop(list(ma_is.columns.values)[3:], axis=1, inplace=True)

In [42]:
# Load the test file with the same dtype mapping used for the training data.
test = pd.read_csv(
    '../test.csv/test.csv',
    dtype=dtypes,
    parse_dates=['date'],
)

In [44]:
# Attach the three average tables to the test rows with left joins.
test['dow'] = test['date'].dt.dayofweek
# Per item/store value ('mais').
test = pd.merge(test, ma_is, how='left', on=['item_nbr', 'store_nbr'])
# Per item/store weekly average ('mawk'). The join key was previously
# ['item_nbr', ''], an empty column name that matches no column.
test = pd.merge(test, ma_wk, how='left', on=['item_nbr', 'store_nbr'])
# Per item/store/day-of-week average ('madw').
test = pd.merge(test, ma_dw, how='left', on=['item_nbr', 'store_nbr', 'dow'])

In [45]:
# Display the test frame after the three merges.
test

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,dow,mais,mawk,madw
0,125497040,2017-08-16,1,96995,False,2,0.126638,0.844024,0.693147
1,125497041,2017-08-16,1,99197,False,2,0.206469,1.010969,0.977616
2,125497042,2017-08-16,1,103501,False,2,0.000000,,
3,125497043,2017-08-16,1,103520,False,2,0.649946,1.114774,1.175135
4,125497044,2017-08-16,1,103665,False,2,1.005429,1.320085,1.282500
5,125497045,2017-08-16,1,105574,False,2,1.619312,1.693974,1.951031
6,125497046,2017-08-16,1,105575,False,2,2.275806,2.258817,2.432405
7,125497047,2017-08-16,1,105576,False,2,0.000000,,
8,125497048,2017-08-16,1,105577,False,2,0.545000,1.022971,1.096418
9,125497049,2017-08-16,1,105693,False,2,0.276514,0.852271,0.939412


In [46]:
# The average tables were merged into `test` in an earlier cell.
del ma_is, ma_wk, ma_dw

#Forecasting Test
# Baseline forecast (still in log1p space) is the 'mais' value.
test['unit_sales'] = test.mais
# Where the weekly average is positive, scale the baseline by madw / mawk.
pos_idx = test['mawk'] > 0
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'unit_sales'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']
# Assign the filled column back instead of calling fillna(inplace=True) on a
# .loc selection, which may act on a temporary and leave NaNs in `test`.
test['unit_sales'] = test['unit_sales'].fillna(0)
# `pd.np` no longer exists in pandas 2.0+, so numpy is called directly.
test['unit_sales'] = np.expm1(test['unit_sales'])  # restoring unit values

In [47]:
# Multiply the forecast by 1.5 for rows flagged as on promotion.
test.loc[test['onpromotion'].eq(True), 'unit_sales'] *= 1.5

In [48]:
# Display the test frame with the final unit_sales forecast.
test

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,dow,mais,mawk,madw,unit_sales
0,125497040,2017-08-16,1,96995,False,2,0.126638,0.844024,0.693147,0.109601
1,125497041,2017-08-16,1,99197,False,2,0.206469,1.010969,0.977616,0.220984
2,125497042,2017-08-16,1,103501,False,2,0.000000,,,0.000000
3,125497043,2017-08-16,1,103520,False,2,0.649946,1.114774,1.175135,0.984047
4,125497044,2017-08-16,1,103665,False,2,1.005429,1.320085,1.282500,1.655950
5,125497045,2017-08-16,1,105574,False,2,1.619312,1.693974,1.951031,5.456189
6,125497046,2017-08-16,1,105575,False,2,2.275806,2.258817,2.432405,10.596462
7,125497047,2017-08-16,1,105576,False,2,0.000000,,,0.000000
8,125497048,2017-08-16,1,105577,False,2,0.545000,1.022971,1.096418,0.793431
9,125497049,2017-08-16,1,105693,False,2,0.276514,0.852271,0.939412,0.356336


In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
# Load columns 1-5 of the training file, keeping the header and skipping data
# rows 1..66458908. unit_sales is converted while parsing: positive values
# become log1p(value), everything else becomes 0.
df_train = pd.read_csv(
    '../train.csv/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={
        'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0
    },
    parse_dates=["date"],
    skiprows=range(1, 66458909),  # 2016-01-01
)

# Load the first five columns of the test file, then index the rows by
# (store, item, date).
df_test = pd.read_csv(
    "../test.csv/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])


In [4]:
# Item metadata, keyed by item number.
items = pd.read_csv("../items.csv/items.csv")
items = items.set_index("item_nbr")


In [5]:
# Keep only rows dated 2017-01-01 or later. pd.Timestamp is used because the
# `pd.datetime` alias no longer exists in pandas 2.0+.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp("2017-01-01")]

In [7]:
# Drop the full training frame; the 2017 subset was already taken into df_2017.
del df_train

In [8]:
# Pivot the promotion flag to one row per (store, item) and one column per
# date. Combinations with no row in the data are filled with False.
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)


In [10]:
# Flatten the two-level column index produced by unstack, keeping only the
# date level.
train_promo_cols = promo_2017_train.columns
promo_2017_train.columns = train_promo_cols.get_level_values(1)
del train_promo_cols

In [12]:
# Same pivot for the test period: one row per (store, item), one column per date.
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)

In [17]:
# Align the test promotions to the training (store, item) rows. Pairs that are
# missing from the test pivot are filled with False.
# NOTE(review): no cell shown here flattens promo_2017_test's column levels the
# way promo_2017_train's were flattened. Confirm that happens before this concat.
promo_2017_test = promo_2017_test.reindex(index=promo_2017_train.index).fillna(False)
# Put the training and test date columns side by side.
promo_2017 = pd.concat(
    [promo_2017_train, promo_2017_test],
    axis=1,
)

In [19]:
# Both halves now live in promo_2017; drop the separate frames.
del promo_2017_test, promo_2017_train

In [20]:
# Pivot unit_sales to one row per (store, item) and one column per date.
# Combinations with no row in the data are filled with 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Flatten the two-level column index, keeping only the date level.
df_2017.columns = df_2017.columns.get_level_values(1)


In [21]:
# Line the item metadata up with the rows of df_2017: one metadata row per
# (store, item) row, looked up by the item level of the index.
row_items = df_2017.index.get_level_values(1)
items = items.reindex(row_items)
del row_items

In [23]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the columns of ``df`` for a run of dates.

    The run starts ``minus`` days before ``dt`` and holds ``periods`` dates
    spaced ``freq`` apart. Returns ``df`` indexed with that date range.
    """
    first_day = dt - timedelta(days=minus)
    wanted = pd.date_range(first_day, periods=periods, freq=freq)
    return df[wanted]


In [24]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for one cut-off date.

    Features are computed from the module-level ``df_2017`` (sales) and
    ``promo_2017`` (promotion flags) frames, whose columns are dates:

    * ``day_1_2017``: sales on the day before ``t2017``.
    * ``mean_N_2017``: mean sales over the N days before ``t2017``.
    * ``promo_N_2017``: sum of promotion flags over the N days before ``t2017``.
    * ``mean_4_dow{i}_2017`` / ``mean_20_dow{i}_2017``: mean over 4 / 20 dates
      spaced 7 days apart, starting ``28 - i`` / ``140 - i`` days before
      ``t2017``.
    * ``promo_{i}``: promotion flag ``i`` days after ``t2017``, for i = 0..15.

    Parameters
    ----------
    t2017 : datetime.date
        First day of the 16-day forecast window.
    is_train : bool, default True
        When true, also return the targets.

    Returns
    -------
    X, or (X, y) when ``is_train`` is true. ``y`` holds the ``df_2017`` values
    for the 16 days starting at ``t2017``.
    """
    features = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    for window in (3, 7, 14, 30, 60, 140):
        features["mean_{}_2017".format(window)] = get_timespan(
            df_2017, t2017, window, window).mean(axis=1).values
    for window in (14, 60, 140):
        features["promo_{}_2017".format(window)] = get_timespan(
            promo_2017, t2017, window, window).sum(axis=1).values
    X = pd.DataFrame(features)

    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(
            df_2017, t2017, 28 - i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(
            df_2017, t2017, 140 - i, 20, freq='7D').mean(axis=1).values

    for i in range(16):
        # Convert to pd.Timestamp before the column lookup: a plain
        # datetime.date key is not reliably accepted by datetime-labelled
        # columns across pandas versions.
        day = pd.Timestamp(t2017 + timedelta(days=i))
        X["promo_{}".format(i)] = promo_2017[day].values.astype(np.uint8)

    if is_train:
        y = df_2017[pd.date_range(t2017, periods=16)].values
        return X, y
    return X


In [25]:
# Training set: six cut-off dates one week apart, starting 2017-05-31, stacked
# row-wise.
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for week in range(6):
    X_tmp, y_tmp = prepare_dataset(t2017 + timedelta(days=7 * week))
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

# Validation window starts 2017-07-26. The test window starts 2017-08-16 and
# has no targets.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)


In [33]:
print("Training and predicting models...")
# Parameters passed to lgb.train for every forecast step.
params = dict(
    num_leaves=31,
    objective='regression',
    min_data_in_leaf=300,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=2,
    metric='l2',
    num_threads=4,
)


Training and predicting models...


In [34]:
MAX_ROUNDS = 2000
val_pred = []
test_pred = []
cate_vars = []
# One model per forecast day: column `step` of y is the target for that day.
for step in range(16):
    banner = "=" * 50
    print(banner)
    print("Step %d" % (step + 1))
    print(banner)
    # Row weight is 1 + 0.25 * perishable. The training weights are repeated
    # six times to match the six stacked training windows.
    dtrain = lgb.Dataset(
        X_train,
        label=y_train[:, step],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1,
    )
    dval = lgb.Dataset(
        X_val,
        label=y_val[:, step],
        reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars,
    )
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        early_stopping_rounds=50,
        verbose_eval=100,
    )
    # Report features ordered by gain importance, largest first.
    ranked = sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\n".join("%s: %.2f" % (name, gain) for name, gain in ranked))
    # Use the best iteration when one is recorded, else all rounds.
    n_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=n_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=n_rounds))


Step 1




Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.30165	valid_1's l2: 0.294069
[200]	training's l2: 0.298063	valid_1's l2: 0.292692
[300]	training's l2: 0.29561	valid_1's l2: 0.292274
[400]	training's l2: 0.293648	valid_1's l2: 0.29206
[500]	training's l2: 0.291822	valid_1's l2: 0.291923
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.291822	valid_1's l2: 0.291923
mean_7_2017: 1942372.00
mean_14_2017: 1150857.69
mean_30_2017: 127412.84
promo_0: 103533.42
mean_20_dow0_2017: 80733.01
day_1_2017: 76932.56
mean_3_2017: 71583.75
mean_4_dow0_2017: 58262.28
promo_14_2017: 29372.56
mean_60_2017: 24099.99
promo_7: 9064.66
promo_60_2017: 6791.67
mean_4_dow5_2017: 6429.96
mean_140_2017: 6363.31
mean_20_dow4_2017: 5894.72
promo_140_2017: 5630.61
mean_4_dow6_2017: 5319.77
mean_20_dow2_2017: 3878.68
mean_4_dow2_2017: 3495.79
mean_4_dow3_2017: 2953.13
mean_20_dow1_2017: 2947.08
promo_14: 2812.74
promo_9: 2710.69
mean_4_dow1_2017: 2697.50
mean_

In [37]:
# Mean squared error over all 16 forecast days of the validation window.
val_mse = mean_squared_error(y_val, np.array(val_pred).transpose())
print("Validation mse:", val_mse)


('Validation mse:', 0.36243283304926488)


In [38]:
# Arrange the per-step test predictions as rows = (store, item) and
# columns = the 16 forecast dates, then stack into one long 'unit_sales' column.
y_test = np.array(test_pred).T
forecast_days = pd.date_range("2017-08-16", periods=16)
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_days)
df_preds = df_preds.stack().to_frame("unit_sales")
del forecast_days
# Name the index levels so the frame can be joined to df_test.
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)


In [39]:
# Join predictions onto the test ids; ids without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left")
submission = submission.fillna(0)
# Undo the log1p transform and limit the result to the range [0, 1000].
submission["unit_sales"] = np.clip(
    np.expm1(submission["unit_sales"]), 0, 1000
)
# Write the columns only, without the (store, item, date) index.
submission.to_csv('lgb.csv', float_format='%.4f', index=False)


In [1]:
import pandas as pd

In [22]:
# Read the lgb.csv file back from the working directory.
submit = pd.read_csv(filepath_or_buffer="lgb.csv")

In [26]:
from datetime import date

In [31]:
# Display the daily date range from 2017-01-01 through 2017-02-28.
pd.date_range(start = date(2017,1,1),end = date(2017,2,28))

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
               '2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
               '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20',
               '2017-01-21', '2017-01-22', '2017-01-23', '2017-01-24',
               '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-28',
               '2017-01-29', '2017-01-30', '2017-01-31', '2017-02-01',
               '2017-02-02', '2017-02-03', '2017-02-04', '2017-02-05',
               '2017-02-06', '2017-02-07', '2017-02-08', '2017-02-09',
               '2017-02-10', '2017-02-11', '2017-02-12', '2017-02-13',
               '2017-02-14', '2017-02-15', '2017-02-16', '2017-02-17',
               '2017-02-18', '2017-02-19', '2017-02-20', '2017-02-21',
               '2017-02-22', '2017-02-23', '2017-02-24', '2017-02-25',
      

In [33]:
# Reference date used by the date_range check below.
t2017 = date(year=2017, month=5, day=31)

In [36]:
from datetime import timedelta

In [39]:
# Display six dates spaced 7 days apart, starting 28 days before t2017.
pd.date_range(t2017-timedelta(days=28),periods=6,freq="7D")

DatetimeIndex(['2017-05-03', '2017-05-10', '2017-05-17', '2017-05-24',
               '2017-05-31', '2017-06-07'],
              dtype='datetime64[ns]', freq='7D')