In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [2]:
def _log_sales(raw_value):
    """Parse one raw ``unit_sales`` CSV field and return it on the log1p scale.

    Values that are not strictly positive are mapped to 0.
    """
    value = float(raw_value)
    return np.log1p(value) if value > 0 else 0


# Training rows. Columns are picked by position; later cells rely on
# date, store_nbr, item_nbr, unit_sales and onpromotion being among them.
# The skipped range is the original author's cut-off, annotated as 2016-01-01.
df_train = pd.read_csv(
    '../input/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log_sales},
    parse_dates=["date"],
    skiprows=range(1, 66458909),
)

# Test rows, keyed by (store_nbr, item_nbr, date) so they can be unstacked
# by date and joined against the predictions later on.
df_test = pd.read_csv(
    "../input/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [3]:
# Item metadata, keyed by item number.
items = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")

# Keep only 2017 observations. `pd.datetime` was deprecated in pandas 1.0 and
# removed in 2.0, so the cut-off is expressed as a pandas Timestamp instead.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# The full training frame is no longer needed; release its memory.
del df_train

In [4]:
def _dates_to_columns(indexed_frame, column):
    """Pivot ``column`` of a (store_nbr, item_nbr, date)-indexed frame to one column per date.

    Missing combinations are filled with False and the column labels are
    flattened to the date level only.
    """
    wide = indexed_frame[[column]].unstack(level=-1).fillna(False)
    wide.columns = wide.columns.get_level_values(1)
    return wide


# Promotion flags for the training period, one row per (store_nbr, item_nbr).
promo_2017_train = _dates_to_columns(
    df_2017.set_index(["store_nbr", "item_nbr", "date"]), "onpromotion")

# Promotion flags for the test period, aligned to the training rows; pairs
# that do not occur in the test set are treated as "not on promotion".
promo_2017_test = _dates_to_columns(df_test, "onpromotion")
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

# Side-by-side: training dates followed by test dates.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [5]:
# Reshape the long 2017 sales table into one row per (store_nbr, item_nbr)
# and one column per date; days without a record count as zero sales.
sales_long = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
df_2017 = sales_long.unstack(level=-1).fillna(0)
del sales_long  # do not keep the long-format copy alive
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat the item metadata so that it lines up row-for-row with df_2017.
item_numbers = df_2017.index.get_level_values(1)
items = items.reindex(item_numbers)

In [6]:
# Daily oil price (column `dcoilwtico`), restricted to 2017.
oil_price = pd.read_csv('../input/oil.csv', parse_dates=['date'])
oil_price = oil_price[oil_price['date'] >= pd.Timestamp(2017, 1, 1)]

# One price per date column of df_2017. Days without a quote become 0, which
# is what the previous stack/merge/unstack pipeline actually produced: its
# `oil_2017.fillna(-1)` call discarded the result, so gaps were only filled
# by the final `.fillna(0)`.
# NOTE(review): 0-filled gaps feed directly into the oil mean/std features;
# a forward-fill may be closer to the intent - confirm before changing.
daily_oil = (
    oil_price.set_index('date')['dcoilwtico']
    .reindex(df_2017.columns)
    .fillna(0)
)

# The price does not depend on store or item, so repeat the per-date series
# on every (store_nbr, item_nbr) row instead of stacking the whole sales
# matrix, merging, and unstacking it again.
oil_2017 = pd.DataFrame(
    np.tile(daily_oil.values, (len(df_2017.index), 1)),
    index=df_2017.index,
    columns=df_2017.columns,
)
del daily_oil

In [7]:
# Preview the first rows of the oil-price matrix built in the previous cell.
oil_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,52.36,53.26,53.77,53.98,0.0,0.0,51.95,50.82,...,0.0,49.37,49.07,49.59,48.54,48.81,0.0,0.0,47.59,47.57
1,99197,0.0,0.0,52.36,53.26,53.77,53.98,0.0,0.0,51.95,50.82,...,0.0,49.37,49.07,49.59,48.54,48.81,0.0,0.0,47.59,47.57
1,103520,0.0,0.0,52.36,53.26,53.77,53.98,0.0,0.0,51.95,50.82,...,0.0,49.37,49.07,49.59,48.54,48.81,0.0,0.0,47.59,47.57
1,103665,0.0,0.0,52.36,53.26,53.77,53.98,0.0,0.0,51.95,50.82,...,0.0,49.37,49.07,49.59,48.54,48.81,0.0,0.0,47.59,47.57
1,105574,0.0,0.0,52.36,53.26,53.77,53.98,0.0,0.0,51.95,50.82,...,0.0,49.37,49.07,49.59,48.54,48.81,0.0,0.0,47.59,47.57


In [8]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select a run of date columns from ``df``.

    The run starts ``minus`` days before ``dt`` and contains ``periods``
    column labels spaced ``freq`` apart.

    Parameters
    ----------
    df : DataFrame whose column labels are dates.
    dt : anchor date (anything ``timedelta`` can be subtracted from).
    minus : number of days to step back from ``dt`` for the first column.
    periods : number of columns to select.
    freq : pandas frequency string giving the step between columns.

    Returns
    -------
    The sub-frame of ``df`` restricted to the generated date labels.
    """
    first_day = dt - timedelta(days=minus)
    wanted_columns = pd.date_range(start=first_day, periods=periods, freq=freq)
    return df[wanted_columns]

In [9]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally the targets) anchored at ``t2017``.

    Features are computed from the module-level wide frames ``df_2017``
    (sales), ``oil_2017`` (oil price) and ``promo_2017`` (promotion flags),
    one output row per row of those frames:

    * previous-day oil price and sales;
    * mean/std of sales and of oil price over the 3, 7, 14, 30, 60 and 140
      days before ``t2017``;
    * promotion counts over the previous 14, 60 and 140 days;
    * mean/std of sales and oil price sampled every 7 days (4 and 20
      samples) for each of the 7 weekday offsets;
    * the promotion flag for each of the 16 days starting at ``t2017``.

    Parameters
    ----------
    t2017 : first day of the 16-day prediction window (``datetime.date``,
        ``datetime.datetime`` or ``pd.Timestamp``).
    is_train : when True, also return the targets.

    Returns
    -------
    ``(X, y)`` when ``is_train`` is True, where ``y`` holds the 16 columns of
    ``df_2017`` starting at ``t2017``; otherwise just ``X``.
    """
    # Column labels of the wide frames are timestamps. A plain datetime.date
    # is not reliably accepted as a DatetimeIndex lookup key across pandas
    # versions, so normalise the anchor once up front.
    t2017 = pd.Timestamp(t2017)

    features = {
        "oil_1_2017": get_timespan(oil_2017, t2017, 1, 1).values.ravel(),
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }

    # Trailing-window mean/std: sales first, then oil (same column order as
    # the hand-written version). Each window is sliced once and reused.
    for prefix, frame in (("", df_2017), ("oil_", oil_2017)):
        for days in (3, 7, 14, 30, 60, 140):
            window = get_timespan(frame, t2017, days, days)
            features["{}mean_{}_2017".format(prefix, days)] = window.mean(axis=1).values
            features["{}std_{}_2017".format(prefix, days)] = window.std(axis=1).values

    # Number of promotion days in the trailing windows.
    for days in (14, 60, 140):
        features["promo_{}_2017".format(days)] = get_timespan(
            promo_2017, t2017, days, days).sum(axis=1).values

    X = pd.DataFrame(features)

    # Weekly-sampled statistics: for offset i, take every 7th day going back
    # 4 weeks (start 28-i days back) and 20 weeks (start 140-i days back).
    for i in range(7):
        for prefix, frame in (("", df_2017), ("oil_", oil_2017)):
            for weeks in (4, 20):
                window = get_timespan(frame, t2017, 7 * weeks - i, weeks, freq='7D')
                X['{}mean_{}_dow{}_2017'.format(prefix, weeks, i)] = window.mean(axis=1).values
                X['{}std_{}_dow{}_2017'.format(prefix, weeks, i)] = window.std(axis=1).values

    # Promotion flag for each day of the 16-day prediction window.
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)

    if is_train:
        # Targets: the 16 daily sales columns starting at the anchor date.
        y = df_2017[pd.date_range(t2017, periods=16)].values
        return X, y
    return X

In [10]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)

# Six training snapshots, anchored one week apart starting at t2017.
weekly_anchors = [t2017 + timedelta(days=7 * week) for week in range(6)]
snapshots = [prepare_dataset(anchor) for anchor in weekly_anchors]

# Stack the snapshots row-wise into a single training set.
X_train = pd.concat([features for features, _ in snapshots], axis=0)
y_train = np.concatenate([targets for _, targets in snapshots], axis=0)
del snapshots

# Validation window anchored at 2017-07-26; the test window at 2017-08-16
# has no targets, so only features are built for it.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [13]:
print("Training and predicting models...")

# Number of boosting iterations for each CatBoost model.
MAX_ROUNDS = 2800

# Per-step prediction accumulators (validation set, test set).
val_pred, test_pred = [], []

# Features to pass to CatBoost as categorical; none are declared.
cate_vars = []

Training and predicting models...


In [None]:
# Start from empty accumulators so that re-running this cell does not append
# a second set of 16 predictions to lists left over from an earlier run
# (which would break the validation score and the submission frame).
val_pred = []
test_pred = []

# One independent model per day of the 16-day horizon: model i is trained on
# column i of y_train.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    model = CatBoostRegressor(
        iterations=MAX_ROUNDS, learning_rate=0.01,
        depth=4)

    model.fit(
        X_train, y_train[:, i],
        cat_features=cate_vars)

    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))

# val_pred is a list of 16 per-day vectors; transpose to rows x 16 so it
# lines up with y_val.
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

print("Making submission...")
y_test = np.array(test_pred).transpose()
# Long format: one row per (store_nbr, item_nbr, date) with the predicted
# value, matching the index of df_test.
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Test rows with no matching prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Targets were log1p-transformed at load time; invert and clip to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# index=False is the documented way to omit the index column.
submission.to_csv('catboost_2017_01_08_21_35.csv', float_format='%.4f', index=False)

Step 1
0:	learn: 1.4543824	total: 676ms	remaining: 31m 32s
1:	learn: 1.4426921	total: 1.35s	remaining: 31m 31s
2:	learn: 1.4311491	total: 2s	remaining: 31m 4s
3:	learn: 1.4197695	total: 2.65s	remaining: 30m 54s
4:	learn: 1.4085071	total: 3.34s	remaining: 31m 6s
5:	learn: 1.3973575	total: 3.91s	remaining: 30m 19s
6:	learn: 1.3863349	total: 4.6s	remaining: 30m 36s
7:	learn: 1.3754534	total: 5.25s	remaining: 30m 33s
8:	learn: 1.3647084	total: 5.91s	remaining: 30m 31s
9:	learn: 1.3540731	total: 6.59s	remaining: 30m 37s
10:	learn: 1.3436059	total: 7.3s	remaining: 30m 50s
11:	learn: 1.3333139	total: 8.03s	remaining: 31m 5s
12:	learn: 1.3230388	total: 8.67s	remaining: 30m 59s
13:	learn: 1.3129036	total: 9.4s	remaining: 31m 10s
14:	learn: 1.3028791	total: 10.2s	remaining: 31m 27s
15:	learn: 1.2930041	total: 10.8s	remaining: 31m 27s
16:	learn: 1.2832261	total: 11.5s	remaining: 31m 17s
17:	learn: 1.2735645	total: 12.2s	remaining: 31m 19s
18:	learn: 1.2640849	total: 12.9s	remaining: 31m 22s
19:	l