In [21]:
import pandas as pd
import os
from tqdm import tqdm_notebook
import numpy as np

from itertools import product
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from xgboost import plot_importance

import time

In [22]:
PATH = './data'
os.listdir(PATH)

['test.csv.gz',
 'sample_submission.csv.gz',
 'shops.csv',
 'sales_train.csv.gz',
 'item_categories.csv',
 'items.csv']

In [23]:
test     = pd.read_csv(os.path.join(PATH,'test.csv.gz'))
shops    = pd.read_csv(os.path.join(PATH,'shops.csv'))
sales    = pd.read_csv(os.path.join(PATH,'sales_train.csv.gz'))
item_cat = pd.read_csv(os.path.join(PATH,'item_categories.csv'))
item     = pd.read_csv(os.path.join(PATH,'items.csv'))

few shops are duplicated as 0 and 57 , 1 and 58, 10 and 11

In [24]:
sales.loc[sales.shop_id == 0,'shop_id'] = 57
sales.loc[sales.shop_id == 1,'shop_id'] = 58
sales.loc[sales.shop_id == 10,'shop_id'] = 11

In [25]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

item_cat['split'] = item_cat['item_category_name'].str.split('-')
item_cat['type'] = item_cat['split'].map(lambda x: x[0])
item_cat['type_code'] = LabelEncoder().fit_transform(item_cat['type'])

item_cat['subtype'] = item_cat['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_cat['subtype_code'] = LabelEncoder().fit_transform(item_cat['subtype'])
item_cat.drop(['item_category_name','split','type','subtype'],axis=1,inplace=True)

item.drop(['item_name'], axis=1, inplace=True)

In [26]:
index_col = ['shop_id','item_id','date_block_num']

In [27]:
ts = time.time()

matrix = []
for i in range(34):
    train = sales[sales.date_block_num==i]
    matrix.append(np.array(list(product(train.shop_id.unique(), train.item_id.unique(), [i])), dtype='int16'))
    del train
    
matrix = pd.DataFrame(np.vstack(matrix), columns=index_col)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)

time.time() - ts

5.115501165390015

In [28]:
ts = time.time()

group = sales.groupby(index_col).agg({'item_cnt_day':'sum'})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix,group,on=index_col,how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).clip(0,20).astype(np.float16)

time.time() - ts

4.018275260925293

In [29]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
test = test[index_col]

In [30]:
ts = time.time()
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=index_col)
matrix.fillna(0, inplace=True) # 34 month
time.time() - ts

0.12876605987548828

## Shops/Item/Item_cat features

In [31]:
ts = time.time()
matrix = pd.merge(matrix,shops,on=['shop_id'],how='left')
matrix = pd.merge(matrix,item,on=['item_id'],how='left')
matrix = pd.merge(matrix,item_cat,on=['item_category_id'],how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
time.time() - ts

3.3303709030151367

In [32]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
        df[col+'_lag_'+str(i)] = df[col+'_lag_'+str(i)].fillna(0).astype(np.float16)
    return df

In [33]:
matrix = lag_feature(matrix,[6,12],'item_cnt_month')

## Mean Encoding

In [34]:
ts = time.time()
col = ['date_block_num']
group = matrix.groupby(col).agg({'item_cnt_month':'mean'})
group.columns = ['date_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix,group,on=col,how='left')
matrix = lag_feature(matrix,[6,12],'date_avg_item_cnt')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].fillna(0).astype(np.float16)
time.time() - ts

12.71218466758728

In [35]:
ts = time.time()
col = ['shop_id','date_block_num']
group = matrix.groupby(col).agg({'item_cnt_month':'mean'})
group.columns = ['date_shop_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix,group,on=col,how='left')
matrix = lag_feature(matrix,[6,12],'date_shop_avg_item_cnt')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].fillna(0).astype(np.float16)
time.time() - ts

13.761604309082031

In [36]:
ts = time.time()
col = ['item_id','date_block_num']
group = matrix.groupby(col).agg({'item_cnt_month':'mean'})
group.columns = ['date_item_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix,group,on=col,how='left')
matrix = lag_feature(matrix,[6,12],'date_item_avg_item_cnt')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].fillna(0).astype(np.float16)
time.time() - ts

15.634974956512451

In [37]:
matrix.drop(['date_avg_item_cnt','date_shop_avg_item_cnt','date_item_avg_item_cnt'],axis=1,inplace=True)

In [38]:
matrix.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_6,date_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12
0,59,22154,0,1.0,30,37,11,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,59,2552,0,0.0,30,58,13,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,59,2554,0,0.0,30,58,13,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59,2555,0,0.0,30,56,13,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,59,2564,0,0.0,30,59,13,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
matrix = matrix[matrix.date_block_num > 12]
data = matrix.copy()

X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

In [40]:
ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

time.time() - ts

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:1.15315	validation_1-rmse:1.13585
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.13467	validation_1-rmse:1.11595
[2]	validation_0-rmse:1.12	validation_1-rmse:1.09892
[3]	validation_0-rmse:1.09999	validation_1-rmse:1.08606
[4]	validation_0-rmse:1.08605	validation_1-rmse:1.0746
[5]	validation_0-rmse:1.07602	validation_1-rmse:1.06501
[6]	validation_0-rmse:1.06816	validation_1-rmse:1.05762
[7]	validation_0-rmse:1.05868	validation_1-rmse:1.05044
[8]	validation_0-rmse:1.05137	validation_1-rmse:1.04418
[9]	validation_0-rmse:1.04656	validation_1-rmse:1.03919
[10]	validation_0-rmse:1.04178	validation_1-rmse:1.0336
[11]	validation_0-rmse:1.03734	validation_1-rmse:1.03002
[12]	validation_0-rmse:1.03343	validation_1-rmse:1.02712
[13]	validation_0-rmse:1.03014	validation_1-rmse:1.02599
[14]	validation_0-rmse:1.02686	validation_1-rmse:1.02358
[

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

1655.7230489253998

In [41]:
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission.csv', index=False)
