# Prepare data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Load data
train_df = pd.read_csv("../sales_train.csv.gz")
test_df  = pd.read_csv("../test.csv.gz")

categories = pd.read_csv('../item_categories.csv')
items = pd.read_csv('../items.csv')
shops = pd.read_csv('../shops.csv')

In [2]:
%store -r __prepare_data_4

In [3]:
%%time
__prepare_data_4

CPU times: user 3min, sys: 1min 44s, total: 4min 45s
Wall time: 4min 53s


In [4]:
# aggregate daily sales to monthly sales
monthly = train_df.groupby(["item_id","shop_id","date_block_num"]).item_cnt_day.sum()
monthly = monthly.reset_index()
monthly.columns = ["item_id","shop_id","date_block_num","target"]

# this is still zero-suppressed data frame
monthly.head()

Unnamed: 0,item_id,shop_id,date_block_num,target
0,0,54,20,1.0
1,1,55,15,2.0
2,1,55,18,1.0
3,1,55,19,1.0
4,1,55,20,1.0


# Explore sales
* find how many items didn't sell last 1, 2, and 3 months (discontinued items)
* find how many stores didn't sell anything last 1, 2, and 3 month (closed stores)
* find how many items were selling throught at lease one year
* find how many stores were open through at least one year
* check intersections

## Explore shop sales

prepare data for shops and for items in one go below

In [5]:
# unique shops, items, and dates
shops = pd.DataFrame({'shop_id': monthly.shop_id.unique()}).sort_values(by=['shop_id'])
items = pd.DataFrame({'item_id': monthly.item_id.unique()}).sort_values(by=['item_id'])
dates = pd.DataFrame({'date_block_num': range(35)}) # month #34 will be our prediction target
# trick to get an outer join in pandas
shops['dup_key'] = 1
items['dup_key'] = 1
dates['dup_key'] = 1

# create combinations of all shops and dates as well as combinations of all items and dates
scaffold_s = pd.merge(shops, dates, how='outer', on=['dup_key'])[['shop_id','date_block_num']]
scaffold_i = pd.merge(items, dates, how='outer', on=['dup_key'])[['item_id','date_block_num']]

# trade zero suppression for NaNs at dates with no sales, aggregate sales per shop or item
gapless_shops = pd.merge(scaffold_s,
                         (monthly                         
                          .groupby(['shop_id','date_block_num'])
                          .agg({'target': [sum,len,np.mean],
                                'item_id':lambda x: len(x.unique())}) # important to calculate this before the scaffold
                          .reset_index()),
                         how='left',
                         on=['shop_id','date_block_num'])

gapless_items = pd.merge(scaffold_i,
                         (monthly
                          .groupby(['item_id','date_block_num'])
                          .agg({'target': [sum,len,np.mean],
                                'shop_id':lambda x: len(x.unique())}) # also makes sense to average over open shops only
                          .reset_index()),
                         how='left',
                         on=['item_id','date_block_num'])

gapless_shops.columns = ['shop_id', 'date_block_num', 'target_sum', 'target_len', 'target_mean', 'n_items']
gapless_items.columns = ['item_id', 'date_block_num', 'target_sum', 'target_len', 'target_mean', 'n_shops']

  new_axis = axis.drop(labels, errors=errors)


In [6]:
# construct operators (matricies) that, applied to a time series, yield their lags:
#   1) create and flatten a unit matrix
current = np.identity(35).reshape(-1)
#   2) shift left by one, two, and tree positions, zero-pad on the right and zero-out top rows
lag1 = np.concatenate(([0]*35,  current[36:], [0])).reshape(-1,35)
lag2 = np.concatenate(([0]*70,  current[72:], [0]*2)).reshape(-1,35)
lag3 = np.concatenate(([0]*105, current[108:],[0]*3)).reshape(-1,35)


def construct_lags2(sales, cols):
    """
    sales - two dimensional data frame
    """
    sales_lag1 = pd.DataFrame(np.dot(lag1, sales.values), columns=cols)
    sales_lag2 = pd.DataFrame(np.dot(lag2, sales.values), columns=cols)
    sales_lag3 = pd.DataFrame(np.dot(lag3, sales.values), columns=cols)

    no_sales = (sales_lag1 + sales_lag2 + sales_lag3 < 1).stack().reset_index()
    no_sales.columns = ['date_block_num', sales.columns.name, 'no_sales']

    lags = pd.concat([sales_lag1.stack(), sales_lag2.stack(), sales_lag3.stack()], axis=1)
    lags.reset_index(inplace=True)
    lags.columns = ['date_block_num', sales.columns.name, 'lag1', 'lag2', 'lag3']
    return pd.merge(no_sales, lags, how='left', on=['date_block_num', sales.columns.name])

In [7]:
# pivot the "melted" table into [date,shop] sales matrix and find shops closed within 3 consequetive lags
sales_shops = gapless_shops.pivot(columns='shop_id', values='target_sum', index='date_block_num')
no_sales_shops = construct_lags2(sales_shops.fillna(0), sorted(sales_shops.columns.unique().tolist()))['no_sales']

gapless_shops['target_ave'] = gapless_shops['target_sum']/gapless_shops['n_items']

ave_sales_shops = (gapless_shops
                   .pivot(columns='shop_id', values='target_ave', index='date_block_num')
                   .fillna(0))

stock_shops = (gapless_shops
               .pivot(columns='shop_id', values='n_items', index='date_block_num')
               .fillna(0))

lagged_sales_shops = construct_lags2(ave_sales_shops, sorted(ave_sales_shops.columns.unique().tolist()))
lagged_stock_shops = construct_lags2(stock_shops, sorted(stock_shops.columns.unique().tolist()))

lagged_sales_shops['no_sales'] = no_sales_shops
lagged_sales_shops['stock_lag1'] = lagged_stock_shops["lag1"]
lagged_sales_shops['stock_lag2'] = lagged_stock_shops["lag2"]
lagged_sales_shops['stock_lag3'] = lagged_stock_shops["lag3"]

lagged_sales_shops.sort_values(by=['shop_id','date_block_num']).head()

Unnamed: 0,date_block_num,shop_id,no_sales,lag1,lag2,lag3,stock_lag1,stock_lag2,stock_lag3
0,0,0,True,0.0,0.0,0.0,0.0,0.0,0.0
60,1,0,False,2.338784,0.0,0.0,2385.0,0.0,0.0
120,2,0,False,2.515189,2.338784,0.0,2436.0,2385.0,0.0
180,3,0,False,0.0,2.515189,2.338784,0.0,2436.0,2385.0
240,4,0,False,0.0,0.0,2.515189,0.0,0.0,2436.0


In [8]:
qwe = (pd.merge(lagged_sales_shops, all_data, how='inner', on=['date_block_num', 'shop_id'])
       .groupby(['date_block_num','shop_id'])
       .agg({'total_monthly_shop_sales_lag_1': lambda x: np.NA if x.empty else x.iloc[0],
             'lag1':                           lambda x: np.NA if x.empty else x.iloc[0],
             'stock_lag1':                     lambda x: np.NA if x.empty else x.iloc[0]}))

In [9]:
qwe.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_monthly_shop_sales_lag_1,lag1,stock_lag1
date_block_num,shop_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34,55,0.29928,4.195745,470.0
34,56,0.22058,1.574813,802.0
34,57,0.411047,1.904605,1216.0
34,58,0.257343,1.798507,804.0
34,59,0.14576,1.58,500.0


In [10]:
all_data[all_data.date_block_num==33].groupby(["date_block_num", "shop_id"]).target.sum().tail()

date_block_num  shop_id
33              55         1620.0
                56         1194.0
                57         2225.0
                58         1393.0
                59          789.0
Name: target, dtype: float64

In [11]:
all_data[all_data.date_block_num==33].groupby(["date_block_num", "shop_id"]).target.size().head()

date_block_num  shop_id
33              2          5413
                3          5413
                4          5413
                5          5413
                6          5413
Name: target, dtype: int64

In [12]:
print(len(items))
print(len(all_data[all_data.date_block_num==3].item_id.unique()))
gapless_shops[gapless_shops.date_block_num==33].tail()#['n_items']

21807
8145


Unnamed: 0,shop_id,date_block_num,target_sum,target_len,target_mean,n_items,target_ave
1958,55,33,1972.0,470.0,4.195745,470.0,4.195745
1993,56,33,1263.0,802.0,1.574813,802.0,1.574813
2028,57,33,2316.0,1216.0,1.904605,1216.0,1.904605
2063,58,33,1446.0,804.0,1.798507,804.0,1.798507
2098,59,33,790.0,500.0,1.58,500.0,1.58


In [13]:
gapless_shops[gapless_shops.date_block_num==33].tail() # unclipped

Unnamed: 0,shop_id,date_block_num,target_sum,target_len,target_mean,n_items,target_ave
1958,55,33,1972.0,470.0,4.195745,470.0,4.195745
1993,56,33,1263.0,802.0,1.574813,802.0,1.574813
2028,57,33,2316.0,1216.0,1.904605,1216.0,1.904605
2063,58,33,1446.0,804.0,1.798507,804.0,1.798507
2098,59,33,790.0,500.0,1.58,500.0,1.58


In [14]:
all_data = pd.merge(all_data, lagged_sales_shops, how='left', on=['date_block_num','shop_id'])

In [31]:
predictors2 = predictors + ["no_sales", "lag1", "lag2", "lag3", "stock_lag1", "stock_lag2", "stock_lag3"]

In [32]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

train = all_data[(all_data.date_block_num>11)&(all_data.date_block_num<33)].fillna(0) # reduced traning set

X_train, X_test, y_train, y_test = \
    train_test_split(train[predictors2], train.target, test_size=0.2, random_state=123)

model = RandomForestRegressor(n_estimators=64, max_features=6, n_jobs=6) #len(predictors)/2
model.fit(X_train, y_train)

ypred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.802854


In [18]:
%%time
from xgboost import XGBRegressor

xgmodel = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

xgmodel.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_test, y_test)], 
    verbose=True, 
    early_stopping_rounds = 10)


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:1.12813	validation_1-rmse:1.15951
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.09295	validation_1-rmse:1.11244
[2]	validation_0-rmse:1.0634	validation_1-rmse:1.07527
[3]	validation_0-rmse:1.02751	validation_1-rmse:1.0456
[4]	validation_0-rmse:0.998912	validation_1-rmse:1.01949
[5]	validation_0-rmse:0.97803	validation_1-rmse:0.996824
[6]	validation_0-rmse:0.960078	validation_1-rmse:0.976954
[7]	validation_0-rmse:0.943892	validation_1-rmse:0.960623
[8]	validation_0-rmse:0.930542	validation_1-rmse:0.947286
[9]	validation_0-rmse:0.920226	validation_1-rmse:0.936647
[10]	validation_0-rmse:0.910699	validation_1-rmse:0.925162
[11]	validation_0-rmse:0.90308	validation_1-rmse:0.915944
[12]	validation_0-rmse:0.896755	validation_1-rmse:0.908184
[13]	validation_0-rmse:0.892154	validation_1-rmse:0.902272
[14]	validation_0-rmse:0.886876	valida

[132]	validation_0-rmse:0.811667	validation_1-rmse:0.814003
[133]	validation_0-rmse:0.811561	validation_1-rmse:0.813912
[134]	validation_0-rmse:0.811381	validation_1-rmse:0.813748
[135]	validation_0-rmse:0.811018	validation_1-rmse:0.81339
[136]	validation_0-rmse:0.810914	validation_1-rmse:0.813277
[137]	validation_0-rmse:0.810764	validation_1-rmse:0.813151
[138]	validation_0-rmse:0.810615	validation_1-rmse:0.812984
[139]	validation_0-rmse:0.81027	validation_1-rmse:0.812659
[140]	validation_0-rmse:0.810183	validation_1-rmse:0.81257
[141]	validation_0-rmse:0.810008	validation_1-rmse:0.812367
[142]	validation_0-rmse:0.809838	validation_1-rmse:0.812198
[143]	validation_0-rmse:0.809795	validation_1-rmse:0.812152
[144]	validation_0-rmse:0.809494	validation_1-rmse:0.811835
[145]	validation_0-rmse:0.80933	validation_1-rmse:0.811647
[146]	validation_0-rmse:0.809107	validation_1-rmse:0.811429
[147]	validation_0-rmse:0.808934	validation_1-rmse:0.811251
[148]	validation_0-rmse:0.808834	validation_

[270]	validation_0-rmse:0.791483	validation_1-rmse:0.794242
[271]	validation_0-rmse:0.79145	validation_1-rmse:0.7942
[272]	validation_0-rmse:0.791377	validation_1-rmse:0.794117
[273]	validation_0-rmse:0.791276	validation_1-rmse:0.794005
[274]	validation_0-rmse:0.791237	validation_1-rmse:0.793965
[275]	validation_0-rmse:0.791176	validation_1-rmse:0.793899
[276]	validation_0-rmse:0.790971	validation_1-rmse:0.793703
[277]	validation_0-rmse:0.790715	validation_1-rmse:0.79343
[278]	validation_0-rmse:0.790672	validation_1-rmse:0.793377
[279]	validation_0-rmse:0.7906	validation_1-rmse:0.793301
[280]	validation_0-rmse:0.790542	validation_1-rmse:0.793241
[281]	validation_0-rmse:0.790424	validation_1-rmse:0.793138
[282]	validation_0-rmse:0.790381	validation_1-rmse:0.793088
[283]	validation_0-rmse:0.790338	validation_1-rmse:0.79305
[284]	validation_0-rmse:0.790181	validation_1-rmse:0.792886
[285]	validation_0-rmse:0.790054	validation_1-rmse:0.792757
[286]	validation_0-rmse:0.789996	validation_1-r

[408]	validation_0-rmse:0.780699	validation_1-rmse:0.783511
[409]	validation_0-rmse:0.780654	validation_1-rmse:0.783468
[410]	validation_0-rmse:0.7806	validation_1-rmse:0.78342
[411]	validation_0-rmse:0.780522	validation_1-rmse:0.783346
[412]	validation_0-rmse:0.780451	validation_1-rmse:0.783291
[413]	validation_0-rmse:0.780413	validation_1-rmse:0.783253
[414]	validation_0-rmse:0.780382	validation_1-rmse:0.783238
[415]	validation_0-rmse:0.780348	validation_1-rmse:0.783213
[416]	validation_0-rmse:0.780301	validation_1-rmse:0.783154
[417]	validation_0-rmse:0.780262	validation_1-rmse:0.783117
[418]	validation_0-rmse:0.780208	validation_1-rmse:0.783069
[419]	validation_0-rmse:0.780155	validation_1-rmse:0.783006
[420]	validation_0-rmse:0.780031	validation_1-rmse:0.782866
[421]	validation_0-rmse:0.779998	validation_1-rmse:0.782836
[422]	validation_0-rmse:0.779926	validation_1-rmse:0.782758
[423]	validation_0-rmse:0.779792	validation_1-rmse:0.78262
[424]	validation_0-rmse:0.779742	validation_

[545]	validation_0-rmse:0.772298	validation_1-rmse:0.775107
[546]	validation_0-rmse:0.772258	validation_1-rmse:0.775061
[547]	validation_0-rmse:0.772208	validation_1-rmse:0.775005
[548]	validation_0-rmse:0.772163	validation_1-rmse:0.774958
[549]	validation_0-rmse:0.772141	validation_1-rmse:0.774942
[550]	validation_0-rmse:0.772101	validation_1-rmse:0.774906
[551]	validation_0-rmse:0.772082	validation_1-rmse:0.774886
[552]	validation_0-rmse:0.772042	validation_1-rmse:0.774846
[553]	validation_0-rmse:0.772013	validation_1-rmse:0.77482
[554]	validation_0-rmse:0.771987	validation_1-rmse:0.774803
[555]	validation_0-rmse:0.771967	validation_1-rmse:0.774775
[556]	validation_0-rmse:0.771905	validation_1-rmse:0.774716
[557]	validation_0-rmse:0.771882	validation_1-rmse:0.774687
[558]	validation_0-rmse:0.771856	validation_1-rmse:0.774657
[559]	validation_0-rmse:0.771831	validation_1-rmse:0.774624
[560]	validation_0-rmse:0.771801	validation_1-rmse:0.774603
[561]	validation_0-rmse:0.771768	validati

[683]	validation_0-rmse:0.766436	validation_1-rmse:0.769172
[684]	validation_0-rmse:0.766424	validation_1-rmse:0.769156
[685]	validation_0-rmse:0.766396	validation_1-rmse:0.76912
[686]	validation_0-rmse:0.76635	validation_1-rmse:0.769078
[687]	validation_0-rmse:0.766319	validation_1-rmse:0.769042
[688]	validation_0-rmse:0.766297	validation_1-rmse:0.76902
[689]	validation_0-rmse:0.766269	validation_1-rmse:0.768992
[690]	validation_0-rmse:0.76625	validation_1-rmse:0.768969
[691]	validation_0-rmse:0.766219	validation_1-rmse:0.768941
[692]	validation_0-rmse:0.766195	validation_1-rmse:0.768914
[693]	validation_0-rmse:0.766142	validation_1-rmse:0.768854
[694]	validation_0-rmse:0.76608	validation_1-rmse:0.768803
[695]	validation_0-rmse:0.766061	validation_1-rmse:0.768781
[696]	validation_0-rmse:0.766038	validation_1-rmse:0.768759
[697]	validation_0-rmse:0.766015	validation_1-rmse:0.768731
[698]	validation_0-rmse:0.765934	validation_1-rmse:0.768648
[699]	validation_0-rmse:0.765901	validation_1

[821]	validation_0-rmse:0.761455	validation_1-rmse:0.764082
[822]	validation_0-rmse:0.761439	validation_1-rmse:0.764074
[823]	validation_0-rmse:0.761416	validation_1-rmse:0.76404
[824]	validation_0-rmse:0.761365	validation_1-rmse:0.763996
[825]	validation_0-rmse:0.761326	validation_1-rmse:0.763959
[826]	validation_0-rmse:0.761313	validation_1-rmse:0.763949
[827]	validation_0-rmse:0.761281	validation_1-rmse:0.763921
[828]	validation_0-rmse:0.761254	validation_1-rmse:0.763892
[829]	validation_0-rmse:0.76123	validation_1-rmse:0.76387
[830]	validation_0-rmse:0.761203	validation_1-rmse:0.763852
[831]	validation_0-rmse:0.76118	validation_1-rmse:0.763826
[832]	validation_0-rmse:0.761156	validation_1-rmse:0.763799
[833]	validation_0-rmse:0.761139	validation_1-rmse:0.763775
[834]	validation_0-rmse:0.761114	validation_1-rmse:0.763749
[835]	validation_0-rmse:0.76109	validation_1-rmse:0.763728
[836]	validation_0-rmse:0.761018	validation_1-rmse:0.763636
[837]	validation_0-rmse:0.760992	validation_1

[959]	validation_0-rmse:0.757216	validation_1-rmse:0.759732
[960]	validation_0-rmse:0.757072	validation_1-rmse:0.759599
[961]	validation_0-rmse:0.757046	validation_1-rmse:0.759576
[962]	validation_0-rmse:0.757003	validation_1-rmse:0.759528
[963]	validation_0-rmse:0.756981	validation_1-rmse:0.759506
[964]	validation_0-rmse:0.756952	validation_1-rmse:0.759473
[965]	validation_0-rmse:0.756916	validation_1-rmse:0.75943
[966]	validation_0-rmse:0.756901	validation_1-rmse:0.759414
[967]	validation_0-rmse:0.756862	validation_1-rmse:0.759371
[968]	validation_0-rmse:0.756833	validation_1-rmse:0.759353
[969]	validation_0-rmse:0.756802	validation_1-rmse:0.759312
[970]	validation_0-rmse:0.756777	validation_1-rmse:0.759283
[971]	validation_0-rmse:0.756761	validation_1-rmse:0.75926
[972]	validation_0-rmse:0.756733	validation_1-rmse:0.759226
[973]	validation_0-rmse:0.75672	validation_1-rmse:0.759213
[974]	validation_0-rmse:0.756688	validation_1-rmse:0.759189
[975]	validation_0-rmse:0.756671	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

Result of running the above on baseline set of predictors:

[100]	validation_0-rmse:0.819027	validation_1-rmse:0.821394
[999]	validation_0-rmse:0.755926	validation_1-rmse:0.758396
CPU times: user 6h 40min 50s, sys: 3min 47s, total: 6h 44min 38s
Wall time: 8h 36min 43s

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

In [28]:
from tabulate import tabulate
headers = ["name", "score"]
values = sorted(zip(X_test.columns, xgmodel.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                               score
total_monthly_city_item_sales_lag_1           0.259705
target_lag_1                                  0.224477
total_monthly_shop_subcategory_sales_lag_1    0.052103
target_lag_2                                  0.0351695
total_monthly_item_sales_lag_1                0.0344082
item_category_id                              0.0291885
target_lag_3                                  0.0282539
type_code                                     0.0249709
total_monthly_shop_category_sales_lag_1       0.0227484
total_monthly_sales_lag_1                     0.020068
subtype_code                                  0.019753
total_monthly_shop_sales_lag_12               0.0195617
target_lag_6                                  0.0192548
total_monthly_shop_sales_lag_1                0.016111
total_monthly_item_sales_lag_2                0.0158148
date_block_num                                0.0153869
total_monthly_item_sales_lag_6                0.01526

In [33]:
%%time
from xgboost import XGBRegressor

xgmodel2 = XGBRegressor(
    max_depth=8,
    n_estimators=100,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

xgmodel2.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_test, y_test)], 
    verbose=True, 
    early_stopping_rounds = 10)

  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:1.13969	validation_1-rmse:1.16152
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.10131	validation_1-rmse:1.11249
[2]	validation_0-rmse:1.07134	validation_1-rmse:1.0733
[3]	validation_0-rmse:1.0282	validation_1-rmse:1.04446
[4]	validation_0-rmse:1.00266	validation_1-rmse:1.02161
[5]	validation_0-rmse:0.980696	validation_1-rmse:0.995302
[6]	validation_0-rmse:0.963531	validation_1-rmse:0.976133
[7]	validation_0-rmse:0.946256	validation_1-rmse:0.960218
[8]	validation_0-rmse:0.932016	validation_1-rmse:0.947468
[9]	validation_0-rmse:0.920972	validation_1-rmse:0.936503
[10]	validation_0-rmse:0.912262	validation_1-rmse:0.926946
[11]	validation_0-rmse:0.903827	validation_1-rmse:0.916295
[12]	validation_0-rmse:0.896814	validation_1-rmse:0.908058
[13]	validation_0-rmse:0.891176	validation_1-rmse:0.901402
[14]	validation_0-rmse:0.885808	valid

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

In [34]:
headers = ["name", "score"]
values = sorted(zip(X_test.columns, xgmodel2.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                               score
target_lag_1                                  0.288597
total_monthly_city_item_sales_lag_1           0.220565
target_lag_2                                  0.0499196
total_monthly_shop_subcategory_sales_lag_1    0.0427704
target_lag_3                                  0.0355518
total_monthly_item_sales_lag_1                0.0271202
stock_lag3                                    0.0231447
item_category_id                              0.0215563
total_monthly_shop_sales_lag_12               0.0211006
stock_lag2                                    0.0175127
target_lag_6                                  0.0171282
subtype_code                                  0.016695
total_monthly_shop_category_sales_lag_1       0.0151175
total_monthly_shop_supercategory_sales_lag_1  0.0129055
total_monthly_shop_sales_lag_6                0.0125209
total_monthly_item_sales_lag_12               0.0123925
total_monthly_item_sales_lag_2                0.01

In [30]:
# combine
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = xgmodel.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"] #.round().astype("int64") do not round up, you are screwing RSS

test[test.item_cnt_month>20] = 20
test.to_csv("basic_predictors_xgb.csv")

In [16]:
train = all_data[(all_data.date_block_num>11)&(all_data.date_block_num<34)].fillna(0)

X_train = train[train.date_block_num <  33][predictors]
X_valid = train[train.date_block_num == 33][predictors]
y_train = train[train.date_block_num <  33].target
y_valid = train[train.date_block_num == 33].target

In [17]:
%%time
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=64, max_features=6, n_jobs=6) #len(predictors)/2
model.fit(X_train, y_train)
#from joblib import dump, load
#dump(model,"model1.joblib")

CPU times: user 1h 13min 23s, sys: 58.8 s, total: 1h 14min 22s
Wall time: 23min 11s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=6,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [26]:
from sklearn.metrics import mean_squared_error

ypred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE good: %f" % (rmse))
print("Average good: {0}".format(np.mean(ypred)))

RMSE good: 0.943238
Average good: 0.3053031265220093


In [27]:
%%time
from sklearn.ensemble import RandomForestRegressor
full_model = RandomForestRegressor(n_estimators=64, max_features=6, n_jobs=6)
full_model.fit(train[predictors], train.target)

CPU times: user 1h 6min 2s, sys: 58.1 s, total: 1h 7min
Wall time: 18min 16s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=6,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [28]:
# combine
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"] #.round().astype("int64") do not round up, you are screwing RSS

test[test.item_cnt_month>20] = 20
test.to_csv("few_more_predictors.csv")

## automatically set 0 for "bad" items
#test = pd.merge(test_df, month_34, how="left", on=["shop_id","item_id"]).fillna(0)
#
## zeroing out
#print(test[test.shop_id.isin(closed_shops)].shape)
#
#suppress = test.item_id.isin(items_never_sold).apply(lambda x: 0 if x else 1)
#test.item_cnt_month *= suppress
#
#test = test.loc[:,['ID', 'item_cnt_month']]
#test.set_index("ID", inplace=True)
#test["item_cnt_month"] = test["item_cnt_month"].round().astype("int64")
#
## clipping
#test[test.item_cnt_month>20] = 20
#
#test.to_csv("item_trends.csv")

In [40]:
# public score: 1.04069

In [50]:
%macro -q __trends_combined 3-35

In [51]:
%store __trends_combined

Stored '__trends_combined' (Macro)


## Seasonal components

First of all, it totally make sense to introduce some more surrogate variables:
* month of the year (1-12)
* cristmas sale (True/False)
* sales 12 month ago
* current trend

In [41]:
# identify shops and items with sales history of > 1 year (needed for seasonal component)

monthly.groupby(['shop_id']).agg({'date_block_num': [min,max]})


Unnamed: 0_level_0,date_block_num,date_block_num
Unnamed: 0_level_1,min,max
shop_id,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0,1
1,0,1
2,0,33
3,0,33
4,0,33
5,1,33
6,0,33
7,0,33
8,0,3
9,9,33


In [42]:
# Create data frame associating data_block_num with date (January, 2013 is date_block_num==0)
timeline = pd.DataFrame({'begin_date': pd.date_range(start='2013-01-01',end='2015-11-01', freq='MS'),
                         'date_block_num': range(35)} )

timeline.tail()

Unnamed: 0,begin_date,date_block_num
30,2015-07-01,30
31,2015-08-01,31
32,2015-09-01,32
33,2015-10-01,33
34,2015-11-01,34


In [43]:
#Lags for mean encodings too!