# Prepare data

In [1]:
import pandas as pd

train_df = pd.read_csv("./sales_train.csv.gz")
test_df  = pd.read_csv("./test.csv.gz")

categories = pd.read_csv('./item_categories.csv')
items = pd.read_csv('./items.csv')
shops = pd.read_csv('./shops.csv')

Build scafold grid indexed by all combinatinos of shops and items observed every month

In [2]:
import pandas as pd
import numpy as np
from itertools import product

# create grid of all possible shops and items remembered to be sold there
grid = []
for date_block_num in train_df["date_block_num"].unique():
    all_item_ids = train_df[train_df.date_block_num==date_block_num]["item_id"].unique()
    all_shop_ids = train_df[train_df.date_block_num==date_block_num]["shop_id"].unique()
    grid.append( np.array(list(product(*[all_item_ids, all_shop_ids, [date_block_num]]))) )

grid = pd.DataFrame(np.vstack(grid), columns = ["item_id", "shop_id", "date_block_num"], dtype="int32")

Augment the grid with counts of monthly sales from the training set and clip those to 20 as suggested

In [3]:
agg = train_df.groupby(["item_id", "shop_id", "date_block_num"], as_index=False).agg({"item_cnt_day":"sum"})

agg.columns = ["item_id", "shop_id", "date_block_num", "target"] # this is all that all_data will have for now!

all_data = pd.merge(grid, agg, how="left", on=["item_id", "shop_id", "date_block_num"])
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

all_data['date_block_num'] = all_data['date_block_num'].astype(np.int8)
all_data['shop_id'] = all_data['shop_id'].astype(np.int8)
all_data['item_id'] = all_data['item_id'].astype(np.int16)

all_data['target'] = (all_data['target']
                      .fillna(0)
                      .clip(0,20) #### this is important to not having it here, why do we have it here?
                      .astype(np.float16))

Extend the data frame to the target month usgin shop/item combinations of interest from the test set

In [4]:
test_df['date_block_num'] = 34
test_df['target'] = 0.
test_df['date_block_num'] = test_df['date_block_num'].astype(np.int8)
test_df['shop_id'] = test_df['shop_id'].astype(np.int8)
test_df['item_id'] = test_df['item_id'].astype(np.int16)
all_data = pd.concat([all_data, test_df[['date_block_num','shop_id','item_id','target']]],
                     ignore_index=True,
                     sort=False,
                     keys=['date_block_num','shop_id','item_id'])

Handle categories

In [5]:
from sklearn.preprocessing import LabelEncoder

categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x: x[0].strip())
categories['type_code'] = LabelEncoder().fit_transform(categories['type'])

categories['subtype'] = categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
categories['subtype_code'] = LabelEncoder().fit_transform(categories['subtype'])
categories = categories[['item_category_id','type_code', 'subtype_code']]

items = pd.merge(categories, items, how="right", on=["item_category_id"])

all_data = pd.merge(all_data, items, how='left', on=['item_id'])
all_data['item_category_id'] = all_data['item_category_id'].astype(np.int8)
all_data['type_code'] = all_data['type_code'].astype(np.int8)
all_data['subtype_code'] = all_data['subtype_code'].astype(np.int8)

shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

all_data = pd.merge(all_data, shops, how='left', on=['shop_id'])
all_data['city_code'] = all_data['city_code'].astype(np.int8)

# Introduce mean encodings

In [6]:
# remember, as long as month ("date_block_num") is in the groupby list, month #34 doesn't affect the means
def add_encoded_target_mean(data, groupby, name):
    aggregated_sales = data.groupby(groupby).target.mean()
    aggregated_sales.name = name
    return data.merge(aggregated_sales.reset_index(), how = "left", on = groupby)

In [7]:
# total sales for a month
all_data = add_encoded_target_mean(all_data, ["date_block_num"], "total_monthly_sales")

# total sales for a month per shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "shop_id"], "total_monthly_shop_sales")

# total sales for a month per item
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_id"], "total_monthly_item_sales")

# total sales for a month per category
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_category_id"], "total_monthly_category_sales")

# total sales for a month per supercategory
all_data = add_encoded_target_mean(all_data, ["date_block_num", "type_code"], "total_monthly_supercategory_sales")

# total sales for a month per subcategory
all_data = add_encoded_target_mean(all_data, ["date_block_num", "subtype_code"], "total_monthly_subcategory_sales")

# total sales for a month per category in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_category_id", "shop_id"], "total_monthly_shop_category_sales")

# total sales for a month per supercategory in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "type_code", "shop_id"], "total_monthly_shop_supercategory_sales")

# total sales for a month per subcategory in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "subtype_code", "shop_id"], "total_monthly_shop_subcategory_sales")

In [8]:
all_data = add_encoded_target_mean(all_data, ['date_block_num', 'city_code'], "total_monthly_city_sales")

all_data = add_encoded_target_mean(all_data, ['date_block_num', 'item_id', 'city_code'], "total_monthly_city_item_sales")

In [9]:
# of course, average item_price doesn't care for closed shops, but averaging must be done on non-exploded view!
group = train_df.groupby(['item_id']).agg({'item_price': 'mean'})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

# now merge that back into the all_data
all_data = pd.merge(all_data, group, how='left', on=['item_id'])
all_data['item_avg_item_price'] = all_data['item_avg_item_price'].astype(np.float16)

group = train_df.groupby(['date_block_num','item_id']).agg({'item_price': 'mean'})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

# merge that back into the all_data again
all_data = pd.merge(all_data, group, how='left', on=['date_block_num','item_id'])
all_data['item_avg_item_price'] = all_data['item_avg_item_price'].astype(np.float16)

# Introduce lags

In [10]:
def construct_lags(data, lags, columns):
    index = ["date_block_num", "shop_id", "item_id"]
    lagged_data = []
    for l in lags:
        lag = data[index + columns].copy()
        lag.columns = index + [col+"_lag_"+str(l) for col in columns]
        lag["date_block_num"] += l
        lagged_data.append(lag)

    if not lags:
        return data[["date_block_num"]]
    
    retval = lagged_data.pop()

    for l in lagged_data:
        retval = pd.merge(retval, l, how='outer', on=index)

    return retval.fillna(0) # non-existing lags are set to 0

In [11]:
trg_lags = construct_lags(all_data, [1,2,3,6,12], ["target"]) # item_cnt_month_lag

mean_lags = construct_lags(all_data,
                          [1],
                          ["total_monthly_sales",                     # date_avg_item_cnt      # just one lag is used
                           "total_monthly_category_sales",            # date_cat_avg_item_cnt  # only one lag used
                           "total_monthly_supercategory_sales",       # date_type_avg_item_cnt # dropped
                           "total_monthly_subcategory_sales",         # date_subtype_avg_item_cnt      # dropped
                           "total_monthly_shop_category_sales",       # date_shop_cat_avg_item_cnt     # single lag
                           "total_monthly_shop_supercategory_sales",  # date_shop_type_avg_item_cnt    # dropped
                           "total_monthly_shop_subcategory_sales",    # date_shop_subtype_avg_item_cnt # dropped
                           "total_monthly_city_sales",
                           "total_monthly_city_item_sales"
                          ])

mean_lags_2 = construct_lags(all_data,
                          [1,2,3,6,12],
                          ["total_monthly_shop_sales",
                           "total_monthly_item_sales"
                          ])

all_data = pd.merge(all_data, trg_lags,  how='left', on=['date_block_num','shop_id','item_id'])
all_data = pd.merge(all_data, mean_lags, how='left', on=['date_block_num','shop_id','item_id'])
all_data = pd.merge(all_data, mean_lags_2, how='left', on=['date_block_num','shop_id','item_id'])

In [12]:
from sklearn.model_selection import train_test_split

train = all_data[(all_data.date_block_num>11)&(all_data.date_block_num<33)].fillna(0) # reduced traning set

predictors = [
    "target_lag_1",
    "target_lag_2",
    "target_lag_3",
    "target_lag_6",
    "target_lag_12",
    "total_monthly_sales_lag_1",
    "total_monthly_shop_sales_lag_1", # all lags needed
    "total_monthly_shop_sales_lag_2", # all lags needed
    "total_monthly_shop_sales_lag_3", # all lags needed
    "total_monthly_shop_sales_lag_6", # all lags needed
    "total_monthly_shop_sales_lag_12", # all lags needed
    "total_monthly_item_sales_lag_1", # all lags needed
    "total_monthly_item_sales_lag_2", # all lags needed
    "total_monthly_item_sales_lag_3", # all lags needed
    "total_monthly_item_sales_lag_6", # all lags needed
    "total_monthly_item_sales_lag_12", # all lags needed
    "total_monthly_category_sales_lag_1",
    "total_monthly_supercategory_sales_lag_1",
    "total_monthly_subcategory_sales_lag_1",
    "total_monthly_shop_category_sales_lag_1",
    "total_monthly_shop_supercategory_sales_lag_1",    
    "total_monthly_shop_subcategory_sales_lag_1",
    "total_monthly_city_sales_lag_1",
    "total_monthly_city_item_sales_lag_1"]

X_train, X_test, y_train, y_test = \
    train_test_split(train[predictors], train.target, test_size=0.2, random_state=123)

In [13]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=6) #len(predictors)/2
model.fit(X_train, y_train)

ypred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, ypred))
print("RMSE: %f" % (rmse))

#from joblib import dump, load
#joblib.dump(model2,"model2.joblib")
del model
import gc
gc.collect()

RMSE: 0.852216


40

In [14]:
%%time
from sklearn.ensemble import RandomForestRegressor
full_model = RandomForestRegressor(n_estimators=64, max_features=6, n_jobs=6) #len(predictors)/2
full_model.fit(train[predictors], train.target)

CPU times: user 1h 10min 50s, sys: 53.6 s, total: 1h 11min 44s
Wall time: 23min 8s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=6,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [15]:
from tabulate import tabulate
headers = ["name", "score"]
values = sorted(zip(X_test.columns, full_model.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                               score
target_lag_1                                  0.161009
total_monthly_item_sales_lag_1                0.126639
total_monthly_city_item_sales_lag_1           0.11626
total_monthly_shop_subcategory_sales_lag_1    0.0581648
total_monthly_shop_category_sales_lag_1       0.0579171
target_lag_2                                  0.056999
total_monthly_item_sales_lag_2                0.0524599
total_monthly_shop_supercategory_sales_lag_1  0.0330398
target_lag_3                                  0.0300672
total_monthly_item_sales_lag_3                0.0296108
total_monthly_shop_sales_lag_1                0.0287698
total_monthly_subcategory_sales_lag_1         0.0273727
total_monthly_category_sales_lag_1            0.0264437
total_monthly_supercategory_sales_lag_1       0.0248158
total_monthly_shop_sales_lag_2                0.0235311
total_monthly_city_sales_lag_1                0.0225989
total_monthly_item_sales_lag_6                0.0203

In [16]:
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"] #.round().astype("int64") do not round up, you are screwing RSS

test[test.item_cnt_month>20] = 20
test.to_csv("macro3.csv")

In [17]:
%macro -q __prepare_data_3 2-10

In [18]:
%store __prepare_data_3

Stored '__prepare_data_3' (Macro)


In [19]:
all_data.to_pickle('all_data_3.pkl')

In [20]:
pd.set_option('display.max_columns', None)
all_data[(all_data.date_block_num==12)&(all_data.shop_id==2)&(all_data.item_id==27)].head()

Unnamed: 0,item_id,shop_id,date_block_num,target,item_category_id,type_code,subtype_code,item_name,city_code,total_monthly_sales,total_monthly_shop_sales,total_monthly_item_sales,total_monthly_category_sales,total_monthly_supercategory_sales,total_monthly_subcategory_sales,total_monthly_shop_category_sales,total_monthly_shop_supercategory_sales,total_monthly_shop_subcategory_sales,total_monthly_city_sales,total_monthly_city_item_sales,item_avg_item_price,date_item_avg_item_price,target_lag_12,target_lag_1,target_lag_2,target_lag_3,target_lag_6,total_monthly_sales_lag_1,total_monthly_category_sales_lag_1,total_monthly_supercategory_sales_lag_1,total_monthly_subcategory_sales_lag_1,total_monthly_shop_category_sales_lag_1,total_monthly_shop_supercategory_sales_lag_1,total_monthly_shop_subcategory_sales_lag_1,total_monthly_city_sales_lag_1,total_monthly_city_item_sales_lag_1,total_monthly_shop_sales_lag_12,total_monthly_item_sales_lag_12,total_monthly_shop_sales_lag_1,total_monthly_item_sales_lag_1,total_monthly_shop_sales_lag_2,total_monthly_item_sales_lag_2,total_monthly_shop_sales_lag_3,total_monthly_item_sales_lag_3,total_monthly_shop_sales_lag_6,total_monthly_item_sales_lag_6
4488756,27,2,12,0.0,19,5,10,"007 Legends [PS3, русская версия]",0,0.311352,0.114289,0.021739,0.731211,0.734114,0.727291,0.762082,0.640468,0.716923,0.114289,0.0,1461.0,998.0,1.0,0.0,0.0,0.0,0.0,0.411253,1.082474,1.034303,1.075009,0.955326,0.808824,0.934844,0.1481,0.0,0.14122,0.155556,0.1481,0.086957,0.10063,0.044444,0.089066,0.130435,0.096014,0.065217


date_block_num: 12    =
shop_id: 2            =
item_id: 27           =
item_cnt_month: 0     =
city_code: 0          =
item_category_id: 19  =
type_code: 5          =
subtype_code: 10      =
item_cnt_month_lag_1: 0   , target_lag_1 =
item_cnt_month_lag_2: 0   , target_lag_2 =
item_cnt_month_lag_3: 0   , target_lag_3 =
item_cnt_month_lag_6: 0   , target_lag_6 =
item_cnt_month_lag_12: 1  , target_lag_12 =
date_avg_item_cnt_lag_1: 0.411377  , total_monthly_sales_lag_1: 0.411253
date_item_avg_item_cnt_lag_1: 0.086975 , total_monthly_item_sales_lag_1: 0.086975
date_item_avg_item_cnt_lag_2: 0.044434 , total_monthly_item_sales_lag_2: 0.044444
date_item_avg_item_cnt_lag_3: 0.130493 , total_monthly_item_sales_lag_3: 0.130435
date_item_avg_item_cnt_lag_6: 0.065247 , total_monthly_item_sales_lag_6: 0.065217
date_item_avg_item_cnt_lag_12: 0.155518, total_monthly_item_sales_lag_12: 0.155556
date_shop_avg_item_cnt_lag_1: 0.148071,  total_monthly_shop_sales_lag_1: 0.1481
date_shop_avg_item_cnt_lag_2: 0.100647,  total_monthly_shop_sales_lag_2: 0.10063
date_shop_avg_item_cnt_lag_3: 0.08905,   total_monthly_shop_sales_lag_3: 0.089066
date_shop_avg_item_cnt_lag_6: 0.096008,  total_monthly_shop_sales_lag_6: 0.096014
date_shop_avg_item_cnt_lag_12: 0.141235, total_monthly_shop_sales_lag_12:0.14122
date_cat_avg_item_cnt_lag_1: 1.082031,   total_monthly_category_sales_lag_1: 1.082474
date_shop_cat_avg_item_cnt_lag_1: 0.955566,     total_monthly_shop_category_sales_lag_1: 0.955326
date_shop_type_avg_item_cnt_lag_1: 0.808594,    total_monthly_shop_supercategory_sales_lag_1: 0.808824
date_shop_subtype_avg_item_cnt_lag_1: 0.935059, total_monthly_shop_subcategory_sales_lag_1: 0.934844
date_city_avg_item_cnt_lag_1: 0.148071,         total_monthly_city_sales_lag_1: 0.1481
date_item_city_avg_item_cnt_lag_1: 0.0,         total_monthly_city_item_sales_lag_1: 0.0
date_type_avg_item_cnt_lag_1: 1.03418    , total_monthly_supercategory_sales_lag_1: 1.034303
date_subtype_avg_item_cnt_lag_1: 1.075195, total_monthly_subcategory_sales_lag_1: 1.075009
delta_price_lag: -0.282715
delta_revenue_lag_1: 1.211914
month:0
days: 31
item_shop_last_sale: 1
item_last_sale: 1
item_shop_first_sale: 12
item_first_sale: 12


In [21]:
# public score of 1.01206
X_test.shape

(1237385, 24)

In [22]:
# old set was:

predictors = [
    "target_lag_1",
    "target_lag_2",
    "target_lag_3",
    "target_lag_6",
    "target_lag_12",
    "total_monthly_sales_lag_1",
    "total_monthly_shop_sales_lag_1",
    "total_monthly_item_sales_lag_1",
    "total_monthly_category_sales_lag_1",
    "total_monthly_supercategory_sales_lag_1",
    "total_monthly_subcategory_sales_lag_1",
    "total_monthly_shop_category_sales_lag_1",
    "total_monthly_shop_supercategory_sales_lag_1",    
    "total_monthly_shop_subcategory_sales_lag_1"]

X_train, X_test, y_train, y_test = \
    train_test_split(train[predictors], train.target, test_size=0.2, random_state=123)


from sklearn.ensemble import RandomForestRegressor
full_model = RandomForestRegressor(n_estimators=64, max_features=6, n_jobs=6) #len(predictors)/2
full_model.fit(train[predictors], train.target)

month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"] #.round().astype("int64") do not round up, you are screwing RSS

test[test.item_cnt_month>20] = 20
test.to_csv("macro3_like_old.csv")

In [23]:
# without total_monthly_item_sales_lag_ and _city_ the public score is 1.07343