In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
train = pd.read_csv('../input/train.csv', usecols = [1,2,3,4,5],
                   dtype = {'onpromotion': bool},parse_dates = ['date'],
                    skiprows = range(1, 66458909))

In [5]:
train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,12.0,False
1,2016-01-01,25,105575,9.0,False
2,2016-01-01,25,105857,3.0,False
3,2016-01-01,25,108634,3.0,False
4,2016-01-01,25,108701,2.0,True


In [8]:
test = pd.read_csv('../input/test.csv', usecols = [0,1,2,3,4],
                  dtype = {'onpromotion': bool}, parse_dates = ['date']).set_index(['store_nbr','item_nbr','date'])

In [9]:
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [10]:
train['unit_sales'] = train['unit_sales'].apply(lambda x: np.log1p(float(x)) if float(x)>0 else 0)

In [168]:
items = pd.read_csv('../input/items.csv').set_index('item_nbr')
df_2017 = train[train['date']>date(2017,1,1)]

In [169]:
promo_2017_train = df_2017.set_index(['store_nbr','item_nbr','date'])
print (promo_2017_train.head())
print (promo_2017_train.shape)

                               unit_sales  onpromotion
store_nbr item_nbr date                               
1         103520   2017-01-02    0.693147        False
          105575   2017-01-02    1.386294        False
          105577   2017-01-02    0.693147        False
          105737   2017-01-02    0.693147        False
          108079   2017-01-02    1.098612        False
(23806568, 2)


In [170]:
promo_2017_train = promo_2017_train[['onpromotion']].unstack(level = -1).fillna(False)
#singel bracket return a series, double brackets return a dataframe
#stack method turns column names into index values, and the unstack method turns index values into column names
promo_2017_train.shape

(167515, 226)

In [171]:
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_train.head()

Unnamed: 0_level_0,date,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,2017-01-11 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,True,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [172]:
promo_2017_test = test[['onpromotion']].unstack(level = -1).fillna(False)

In [173]:
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [174]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [175]:
promo_2017 = pd.concat([promo_2017_train,promo_2017_test], axis = 1)

In [176]:
del promo_2017_test, promo_2017_train

In [177]:
df_2017 = df_2017.set_index(['store_nbr', 'item_nbr', 'date'])[['unit_sales']].unstack(level = -1).fillna(0)

In [178]:
df_2017.columns = df_2017.columns.get_level_values(1)
items = items.reindex(df_2017.index.get_level_values(1))

In [179]:
df_2017.tail()

Unnamed: 0_level_0,date,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,2017-01-11 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
54,2109909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.693147,0.693147,0.0,1.098612,0.693147,0.0,1.386294,1.386294,1.791759,0.0
54,2110456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.203406,6.481577,6.586172,3.218876,0.0,0.0,0.0,0.0,4.795791,5.26269
54,2113343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.693147,0.0
54,2113914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.890372,0.0,2.397895,2.397895,1.609438,0.0,0.0,2.833213,2.197225,5.293305
54,2116416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.693147,0.693147,0.0,0.0,0.0,0.693147,0.693147,1.098612,1.098612,1.098612


In [180]:
def get_timespan(df,dt,minus,periods, freq = 'D'):
    return df[
        pd.date_range(dt-timedelta(days = minus), periods = periods, freq = freq)
    ]
# pd.date_range, of the three parameters: start, end, periods, exactly two must be specified

In [181]:
d = timedelta(days = 0.1)
d.seconds
# the days to seconds translation can only happen when days < 1. 
#0 <= microseconds < 1000000
#0 <= seconds < 3600 * 24 (number of seconds in one day)
#-000000000 <= days <= 999999999
date(2017, 8,16) - date(2017,7,26)


datetime.timedelta(21)

In [182]:
print (get_timespan(df_2017,t2017, 7,7).tail())

date                2017-05-24  2017-05-25  2017-05-26  2017-05-27  \
store_nbr item_nbr                                                   
54        2109909          0.0         0.0         0.0         0.0   
          2110456          0.0         0.0         0.0         0.0   
          2113343          0.0         0.0         0.0         0.0   
          2113914          0.0         0.0         0.0         0.0   
          2116416          0.0         0.0         0.0         0.0   

date                2017-05-28  2017-05-29  2017-05-30  
store_nbr item_nbr                                      
54        2109909          0.0         0.0         0.0  
          2110456          0.0         0.0         0.0  
          2113343          0.0         0.0         0.0  
          2113914          0.0         0.0         0.0  
          2116416          0.0         0.0         0.0  


In [183]:
print (df_2017.head())

date                2017-01-02  2017-01-03  2017-01-04  2017-01-05  \
store_nbr item_nbr                                                   
1         96995       0.000000    0.000000    0.000000    0.000000   
          99197       0.000000    1.386294    0.693147    0.693147   
          103520      0.693147    1.098612    0.000000    1.098612   
          103665      0.000000    0.000000    1.386294    1.098612   
          105574      0.000000    1.791759    2.564949    2.302585   

date                2017-01-06  2017-01-07  2017-01-08  2017-01-09  \
store_nbr item_nbr                                                   
1         96995       0.000000    0.000000    0.000000    0.000000   
          99197       0.693147    1.098612    0.000000    0.000000   
          103520      1.386294    0.693147    0.000000    0.693147   
          103665      1.098612    0.693147    1.098612    0.000000   
          105574      1.945910    1.609438    1.098612    1.386294   

date              

In [184]:
def prepare_dataset(t2017, is_train = True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range (16):
        X['promo={}'.format(i)] = promo_2017[
            t2017 + timedelta(days = i)].values.astype(np.uint8)
    if is_train:
        y =df_2017[
            pd.date_range (t2017, periods = 16)
        ].values
        return X, y
    return X
    

In [185]:
print ('preparing dataset ...')
t2017 = date(2017, 5, 31)
test_X, test_y = prepare_dataset(t2017+ timedelta(days = 7))
test_X.head()

preparing dataset ...


Unnamed: 0,day_1_2017,mean_140_2017,mean_14_2017,mean_30_2017,mean_3_2017,mean_60_2017,mean_7_2017,promo_140_2017,promo_14_2017,promo_60_2017,...,promo=6,promo=7,promo=8,promo=9,promo=10,promo=11,promo=12,promo=13,promo=14,promo=15
0,0.0,0.075107,0.198042,0.161734,0.0,0.15694,0.099021,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.174485,0.828418,0.625496,0.732408,0.312748,1.045884,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.609438,0.738711,0.881969,0.828724,0.767528,0.854054,0.939893,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.791759,1.04964,1.073373,1.095595,1.425555,1.025186,1.436773,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.079442,1.816292,1.551652,1.750672,1.617343,1.753564,1.700184,34,0,14,...,0,0,0,0,0,0,0,0,0,0


In [186]:
t2017 = date(2017, 5, 31)
X_1, y_1 = [],[]
for i in range (6):
    delta = timedelta(days = 7*i)
    X_tmp, y_tmp = prepare_dataset(
        t2017+delta
    )
    X_1.append(X_tmp)
    y_1.append(y_tmp)
X_train = pd.concat(X_1, axis = 0)
y_train = np.concatenate(y_1, axis = 0)
del X_1, y_1
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train = False)

In [187]:
print (X_test.head())

   day_1_2017  mean_140_2017  mean_14_2017  mean_30_2017  mean_3_2017  \
0    0.000000       0.153952      0.334438      0.275522     0.000000   
1    0.000000       0.376532      0.206455      0.331321     0.000000   
2    0.000000       0.821010      0.573577      0.714515     0.231049   
3    0.693147       1.040541      1.031388      1.017638     0.462098   
4    1.609438       1.765433      1.629185      1.714960     0.998577   

   mean_60_2017  mean_7_2017  promo_140_2017  promo_14_2017  promo_60_2017  \
0      0.160866     0.099021               0              0              0   
1      0.444620     0.156945               0              0              0   
2      0.756274     0.495105               0              0              0   
3      1.001216     0.980990               0              0              0   
4      1.686812     1.560437              24              0              0   

     ...     promo=6  promo=7  promo=8  promo=9  promo=10  promo=11  promo=12  \
0    ...   

In [194]:
import time
start = time.time()
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 8
}
#for the num_threads, set it to the real CPU cores, not the number of threads
#device, choose device for the tree learning, you can use GPU to achieve the 
#faster learning
#device, default = cpu, options = cpu, gpu
MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))
end = time.time()

Step 1




LightGBMError: b'GPU Tree Learner was not enabled in this build. Recompile with CMake option -DUSE_GPU=1'

In [193]:
print ('the total tunning time is {}'.format(end - start))

the total tunning time is 503.39195251464844


In [196]:
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index = df_2017.index,
    columns = pd.date_range ('2017-08-16', periods = 16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(['store_nbr','item_nbr','date'], inplace = True)


ValueError: Empty data passed with indices specified.

In [133]:
df_preds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.240358
1,96995,2017-08-17,0.246609
1,96995,2017-08-18,0.274632
1,96995,2017-08-19,0.398997
1,96995,2017-08-20,0.397812


In [135]:
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [136]:
submission = test[['id']].join(df_preds, how = 'left').fillna(0)
#DataFrame.join("how") left: use calling frame's index, 
                    # right: use other frames's index,
                    # outer: from union of callling fram's index with other frame's index
                    # inner: 
submission['unit_sales'] = np.clip(np.expm1(submission['unit_sales']),0,1000)
# np.clip() given an interval, values outside the interval are clipped to the inteval edges. 
submission.to_csv('lgb.csv',index = None)