In [38]:
import pandas as pd
import os
from tqdm import tqdm_notebook
import numpy as np

from itertools import product
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from xgboost import plot_importance

import time

In [25]:
PATH = './data'
os.listdir(PATH)

['test.csv.gz',
 'sample_submission.csv.gz',
 'shops.csv',
 'sales_train.csv.gz',
 'item_categories.csv',
 'items.csv']

In [26]:
test     = pd.read_csv(os.path.join(PATH,'test.csv.gz'))
shops    = pd.read_csv(os.path.join(PATH,'shops.csv'))
sales    = pd.read_csv(os.path.join(PATH,'sales_train.csv.gz'))
item_cat = pd.read_csv(os.path.join(PATH,'item_categories.csv'))
item     = pd.read_csv(os.path.join(PATH,'items.csv'))

few shops are duplicated as 0 and 57 , 1 and 58, 10 and 11

In [27]:
sales.loc[sales.shop_id == 0,'shop_id'] = 57
sales.loc[sales.shop_id == 1,'shop_id'] = 58
sales.loc[sales.shop_id == 10,'shop_id'] = 11

In [28]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

item_cat['split'] = item_cat['item_category_name'].str.split('-')
item_cat['type'] = item_cat['split'].map(lambda x: x[0])
item_cat['type_code'] = LabelEncoder().fit_transform(item_cat['type'])

item_cat['subtype'] = item_cat['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_cat['subtype_code'] = LabelEncoder().fit_transform(item_cat['subtype'])
item_cat.drop(['item_category_name','split','type','subtype'],axis=1,inplace=True)

item.drop(['item_name'], axis=1, inplace=True)

In [29]:
index_col = ['shop_id','item_id','date_block_num']

In [30]:
ts = time.time()

matrix = []
for i in range(34):
    train = sales[sales.date_block_num==i]
    matrix.append(np.array(list(product(train.shop_id.unique(), train.item_id.unique(), [i])), dtype='int16'))
    del train
    
matrix = pd.DataFrame(np.vstack(matrix), columns=index_col)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)

time.time() - ts

5.397815704345703

In [31]:
ts = time.time()

group = sales.groupby(index_col).agg({'item_cnt_day':'sum'})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix,group,on=index_col,how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0,20).astype(np.float16)

time.time() - ts

4.11910605430603

In [32]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
test = test[index_col]

In [33]:
ts = time.time()
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=index_col)
matrix.fillna(0, inplace=True) # 34 month
time.time() - ts

0.12503433227539062

In [34]:
matrix.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,59,22154,0,1.0
1,59,2552,0,0.0
2,59,2554,0,0.0
3,59,2555,0,0.0
4,59,2564,0,0.0


## Shops/Item/Item_cat features

In [35]:
ts = time.time()
matrix = pd.merge(matrix,shops,on=['shop_id'],how='left')
matrix = pd.merge(matrix,item,on=['item_id'],how='left')
matrix = pd.merge(matrix,item_cat,on=['item_category_id'],how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
time.time() - ts

3.3496220111846924

In [37]:
matrix.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_code,item_category_id,type_code,subtype_code
0,59,22154,0,1.0,30,37,11,1
1,59,2552,0,0.0,30,58,13,27
2,59,2554,0,0.0,30,58,13,27
3,59,2555,0,0.0,30,56,13,3
4,59,2564,0,0.0,30,59,13,40


In [44]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [45]:
lags = [6,12]
matrix = lag_feature(matrix,lags,'item_cnt_month')

In [52]:
matrix = matrix[matrix.date_block_num > 12]
data = matrix.copy()

X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

In [53]:
ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

time.time() - ts

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:1.1536	validation_1-rmse:1.13759
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.13659	validation_1-rmse:1.11883
[2]	validation_0-rmse:1.12337	validation_1-rmse:1.10268
[3]	validation_0-rmse:1.11125	validation_1-rmse:1.08866
[4]	validation_0-rmse:1.09619	validation_1-rmse:1.07667
[5]	validation_0-rmse:1.08575	validation_1-rmse:1.06624
[6]	validation_0-rmse:1.07598	validation_1-rmse:1.05767
[7]	validation_0-rmse:1.06901	validation_1-rmse:1.05001
[8]	validation_0-rmse:1.06091	validation_1-rmse:1.04464
[9]	validation_0-rmse:1.05507	validation_1-rmse:1.03921
[10]	validation_0-rmse:1.051	validation_1-rmse:1.03435
[11]	validation_0-rmse:1.04731	validation_1-rmse:1.03077
[12]	validation_0-rmse:1.04388	validation_1-rmse:1.02815
[13]	validation_0-rmse:1.04061	validation_1-rmse:1.02511
[14]	validation_0-rmse:1.03779	validation_1-rmse:1.02265

[133]	validation_0-rmse:0.969441	validation_1-rmse:0.977938
[134]	validation_0-rmse:0.969297	validation_1-rmse:0.977901
[135]	validation_0-rmse:0.968927	validation_1-rmse:0.977762
[136]	validation_0-rmse:0.968836	validation_1-rmse:0.977672
[137]	validation_0-rmse:0.968742	validation_1-rmse:0.977648
[138]	validation_0-rmse:0.968523	validation_1-rmse:0.977487
[139]	validation_0-rmse:0.968466	validation_1-rmse:0.977397
[140]	validation_0-rmse:0.968392	validation_1-rmse:0.977311
[141]	validation_0-rmse:0.968254	validation_1-rmse:0.977306
[142]	validation_0-rmse:0.96821	validation_1-rmse:0.977281
[143]	validation_0-rmse:0.968094	validation_1-rmse:0.977251
[144]	validation_0-rmse:0.967986	validation_1-rmse:0.977304
[145]	validation_0-rmse:0.967827	validation_1-rmse:0.977199
[146]	validation_0-rmse:0.967681	validation_1-rmse:0.977285
[147]	validation_0-rmse:0.967539	validation_1-rmse:0.977284
[148]	validation_0-rmse:0.967441	validation_1-rmse:0.97724
[149]	validation_0-rmse:0.967172	validatio

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

1666.4006321430206

In [55]:
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission.csv', index=False)
