In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
import xgboost as xgb

In [2]:
# Column dtypes handed to pd.read_csv so the frames load with narrow integer types.
# Keys that a given CSV does not contain are simply not applied to it.
dtype = {
    'store_nbr': np.int8,
    'item_nbr': np.int32,
    'month': np.int8,
    'week': np.int8,
    # int8 tops out at 127, so a four-digit calendar year cannot be stored in it;
    # int16 is used instead (assumes `year` holds values such as 2017 - TODO confirm).
    'year': np.int16,
    'week_of_month': np.int8,
    'quarter': np.int8,
    'perishable': np.int8,
}

In [3]:
# Load the train and test feature tables (in that order) with the shared dtype map.
train, test = (
    pd.read_csv(csv_path, dtype=dtype)
    for csv_path in ('train_with_agg_features.csv', 'test_with_agg_features.csv')
)

In [4]:
# Item metadata: only the join key and the perishable flag are read.
items = pd.read_csv(
    '../input/items.csv',
    usecols=['item_nbr', 'perishable'],
    dtype=dtype,
)

In [5]:
# Attach the perishable flag to every training row; the left join keeps all rows of train.
train = pd.merge(train, items, how='left', on='item_nbr')

In [6]:
# Remove the row identifier column from the training frame in place.
del train['id']

In [7]:
# Start the feature list from every column currently in train.
features = train.columns.tolist()

In [8]:
# Strip the columns that are not passed to the model as features:
# 'unit_sales' is used as the label and 'perishable' feeds the sample weights.
for non_feature in ('date', 'unit_sales', 'perishable'):
    features.remove(non_feature)

In [9]:
# Cast the promotion column to an integer dtype in both frames.
for frame in (train, test):
    frame['onpromotion'] = frame['onpromotion'].astype(int)

In [10]:
# Hold-out split: True marks rows dated before 2017-07-31 (used for training),
# False marks the remaining rows (used for validation).
# `date` is read without date parsing, so this compares strings; that matches
# chronological order only for ISO-style YYYY-MM-DD values (assumed - TODO confirm).
# The built-in `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
mask = np.array(train['date'] < '2017-07-31', dtype=bool)

In [11]:
# Training matrix built from the rows selected by `mask`.
# `.loc[mask, cols]` filters rows and columns in one step, so the full-width frame
# is no longer copied three times through repeated `train[mask]`.
# Sample weight is perishable * 0.25 + 1, i.e. 1.25 for perishable rows and 1.0
# otherwise (assuming `perishable` is a 0/1 flag - TODO confirm).
dtrain = xgb.DMatrix(
    train.loc[mask, features],
    label=train.loc[mask, 'unit_sales'].values,
    weight=train.loc[mask, 'perishable'].values * 0.25 + 1,
)

In [12]:
# Validation matrix built from the rows NOT selected by `mask`.
# `.loc[~mask, cols]` filters rows and columns in one step, so the full-width frame
# is no longer copied three times through repeated `train[~mask]`.
# Sample weight is perishable * 0.25 + 1, i.e. 1.25 for perishable rows and 1.0
# otherwise (assuming `perishable` is a 0/1 flag - TODO confirm).
dvalid = xgb.DMatrix(
    train.loc[~mask, features],
    label=train.loc[~mask, 'unit_sales'].values,
    weight=train.loc[~mask, 'perishable'].values * 0.25 + 1,
)

In [13]:
# Keep the test row identifiers; they are needed later for the submission file.
test_id = test.loc[:, 'id']

In [14]:
# Remove the identifier column from the test frame in place.
del test['id']

In [15]:
# Unlabelled prediction matrix, using the same feature columns as training.
dtest = xgb.DMatrix(data=test.loc[:, features])

In [16]:
# dtrain, dvalid and dtest have already been built from these frames,
# so the frame names are dropped here.
del train
del test

In [28]:
# Booster settings passed to xgb.train.
param = dict(
    objective='reg:linear',
    eval_metric='rmse',
    eta=0.01,
    max_depth=5,
    min_child_weight=4,
    subsample=0.8,
    colsample_bytree=0.7,
    silent=1,
    seed=137,
)

In [29]:
# Upper bound on boosting rounds, and the datasets scored after every round.
num_rounds = 1000
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Train with early stopping: training halts once the metric used for early
# stopping has not improved for 100 consecutive rounds.
gbm = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=True,
)

[0]	train-rmse:1.49282	valid-rmse:1.46483
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1]	train-rmse:1.48236	valid-rmse:1.45388
[2]	train-rmse:1.47196	valid-rmse:1.44297
[3]	train-rmse:1.46157	valid-rmse:1.43205
[4]	train-rmse:1.45152	valid-rmse:1.42146
[5]	train-rmse:1.44159	valid-rmse:1.41104
[6]	train-rmse:1.43159	valid-rmse:1.40053
[7]	train-rmse:1.42173	valid-rmse:1.39016
[8]	train-rmse:1.412	valid-rmse:1.37993
[9]	train-rmse:1.40259	valid-rmse:1.37004
[10]	train-rmse:1.3931	valid-rmse:1.36007
[11]	train-rmse:1.38376	valid-rmse:1.35026
[12]	train-rmse:1.37454	valid-rmse:1.3407
[13]	train-rmse:1.36542	valid-rmse:1.33113
[14]	train-rmse:1.35659	valid-rmse:1.32193
[15]	train-rmse:1.34773	valid-rmse:1.31264
[16]	train-rmse:1.33914	valid-rmse:1.30364
[17]	train-rmse:1.33049	valid-rmse:1.29456
[18]	train-rmse:1.32213	valid-rmse:1.28581
[19]	train-rmse:1.31388	valid-rmse:1.2772
[20]	trai

In [19]:
# Predict on the test matrix, limiting the trees to the booster's best_ntree_limit.
y_test_pred = gbm.predict(
    dtest,
    ntree_limit=gbm.best_ntree_limit,
)

In [20]:
# Display the shape of the prediction array as a sanity check.
np.shape(y_test_pred)

(3370464,)

In [24]:
# Build the submission table: expm1 is applied to the raw predictions
# (presumably undoing a log1p target transform done upstream - TODO confirm),
# and the result is bounded to the range [0, 1000].
submission = pd.DataFrame({
    'id': test_id.astype(int),
    'unit_sales': np.expm1(y_test_pred).clip(0, 1000),
})

In [27]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(
    path_or_buf='sub_xgb_2017_01_05_23_28.csv',
    index=False,
)

In [25]:
# Preview the first five rows of the submission.
submission.head(5)

Unnamed: 0,id,unit_sales
0,125497040,1.137159
1,125497041,2.611093
2,125497042,1.942139
3,125497043,5.031419
4,125497044,2.512344
