In [1]:
import os
import gc
import pickle
import time
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
# Set matplotlib's default figure size to 12 x 4 (width, height) for every
# figure created later in this notebook.
rcParams['figure.figsize'] = 12, 4

In [2]:
# Load the prepared dataset (a pickled DataFrame) and the competition test rows,
# the latter re-indexed by their 'ID' column.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles that were produced by a trusted pipeline.
data = pd.read_pickle('data-20200228.pkl')
test = (
    pd.read_csv('test.csv')
    .set_index('ID')
)

# Sanity check: print how many columns were loaded, then display their names.
print(data.shape[1])
data.columns

34


Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'shop_city',
       'shop_category', 'item_category_id', 'name_2', 'name_3', 'type_code',
       'subtype_code', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2',
       'item_cnt_month_lag_3', 'date_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_cat_avg_item_cnt_lag_1', 'date_shop_cat_avg_item_cnt_lag_1',
       'date_shop_item_avg_item_cnt_lag_1_x',
       'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1',
       'date_shop_item_avg_item_cnt_lag_1_y', 'delta_price_lag',
       'delta_revenue_lag_1', 'month', 'days', 'item_shop_first_sale',
       'item_first_sale'],
      dtype='object')

In [3]:


# Number of columns in the loaded dataset (second entry of the frame's shape).
data.shape[1]

34

In [4]:
# Time-based split on `date_block_num`:
#   blocks  < 33 -> training rows
#   block  == 33 -> validation rows
#   block  == 34 -> rows to predict (features only)
# 'item_cnt_month' is the target and is removed from every feature frame.
is_train = data['date_block_num'] < 33
is_valid = data['date_block_num'] == 33
is_test = data['date_block_num'] == 34

X_train = data[is_train].drop(columns=['item_cnt_month'])
Y_train = data.loc[is_train, 'item_cnt_month']

X_valid = data[is_valid].drop(columns=['item_cnt_month'])
Y_valid = data.loc[is_valid, 'item_cnt_month']

X_test = data[is_test].drop(columns=['item_cnt_month'])

# The masks are only needed for the split; drop them so the kernel namespace
# ends up with exactly the five split variables.
del is_train, is_valid, is_test

In [5]:
# The train/valid/test splits have been built from `data` above, so the full
# frame is no longer referenced by later cells in this notebook: remove the name
# and force a garbage-collection pass. The trailing ';' suppresses the cell's
# displayed output (gc.collect() returns a count).
del data
gc.collect();

In [13]:
# Wall-clock start; the last expression of the cell reports elapsed seconds.
ts = time.time()

# Hyper-parameters for the gradient-boosted regressor, kept together so they are
# easy to scan and tweak. ('tree_method': 'gpu_hist' was left disabled.)
xgb_params = {
    'max_depth': 10,
    'n_estimators': 1000,
    'min_child_weight': 0.5,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.1,
    'seed': 42,
}
model = XGBRegressor(**xgb_params)

# RMSE is tracked on both sets every round; training stops once the metric on
# the LAST eval set (the validation month) has not improved for 20 rounds.
eval_pairs = [(X_train, Y_train), (X_valid, Y_valid)]
model.fit(
    X_train,
    Y_train,
    eval_set=eval_pairs,
    eval_metric="rmse",
    early_stopping_rounds=20,
    verbose=True,
)

# Elapsed training time in seconds (displayed as the cell output).
time.time() - ts

[0]	validation_0-rmse:1.12843	validation_1-rmse:1.11629
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 20 rounds.
[1]	validation_0-rmse:1.08520	validation_1-rmse:1.07765
[2]	validation_0-rmse:1.05159	validation_1-rmse:1.04653
[3]	validation_0-rmse:1.01767	validation_1-rmse:1.02028
[4]	validation_0-rmse:0.98904	validation_1-rmse:0.99960
[5]	validation_0-rmse:0.96626	validation_1-rmse:0.98061
[6]	validation_0-rmse:0.94691	validation_1-rmse:0.96598
[7]	validation_0-rmse:0.92872	validation_1-rmse:0.95441
[8]	validation_0-rmse:0.91393	validation_1-rmse:0.94590
[9]	validation_0-rmse:0.90082	validation_1-rmse:0.93776
[10]	validation_0-rmse:0.88947	validation_1-rmse:0.93133
[11]	validation_0-rmse:0.87901	validation_1-rmse:0.92500
[12]	validation_0-rmse:0.87039	validation_1-rmse:0.92176
[13]	validation_0-rmse:0.86292	validation_1-rmse:0.91792
[14]	validation_0-rmse:0.85670	validation_1-rmse:0.91

6547.827740907669

In [14]:
# Predict the validation month and the test month, clipping every prediction
# into the range [0, 20].
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# Build the submission file: one row per test ID with its predicted count.
# NOTE(review): this assumes the rows of X_test are in the same order as the
# 'ID' index of test.csv — confirm against the feature-building notebook.
submission = pd.DataFrame({
    "ID": test.index,
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission_20200228.csv', index=False)

# save predictions for an ensemble
# The files are opened with context managers so that each handle is flushed and
# closed deterministically (the previous bare open() calls leaked the handles
# and relied on garbage collection to finish the write).
# NOTE(review): despite its name, 'xgb_train.pickle' stores the predictions for
# the validation month (Y_pred), not for the training rows.
with open('xgb_train.pickle', 'wb') as f:
    pickle.dump(Y_pred, f)
with open('xgb_test.pickle', 'wb') as f:
    pickle.dump(Y_test, f)

In [11]:
# Record the installed xgboost version next to the results; the bare expression
# on the last line displays it as the cell output.
# NOTE(review): this import sits at the end of the notebook and the cell's
# execution count (11) is lower than the training cells above (13, 14), so the
# cells were not run top to bottom — consider moving it to the import cell.
import xgboost
xgboost.__version__

'1.0.1'