In [1]:
import pandas as pd
# Read the four raw tables from the working directory, in this order:
# daily sales history, rows to predict, category names, item catalogue.
train_df, test_df, categories, items = map(
    pd.read_csv,
    (
        "./sales_train.csv.gz",
        "./test.csv.gz",
        "./item_categories.csv",
        "./items.csv",
    ),
)

In [2]:
import pandas as pd
import numpy as np
from itertools import product

# Data preparation: builds `all_data`, one row per (shop, item, month) grid
# cell with the monthly sales target, lagged sales and mean-encoded features,
# plus `all_data_33`, the slice for the last recorded month.
# Requires the notebook-level `train_df` loaded earlier.

# Key columns identifying one grid row: an (item, shop, month) triple.
index_cols = ["item_id", "shop_id", "date_block_num"]
# Distances, in months, of the lagged sales features (seven lags in total).
lag_months = [1, 2, 3, 5, 6, 11, 12]


def _add_target_encoding(data, keys, name):
    """Append a monthly mean-target column and its one-month lag to `data`.

    Args:
        data: frame containing `target`, `date_block_num` and every column
            listed in `keys`.
        keys: list of extra grouping columns (may be empty for a month-only
            encoding).
        name: name of the new mean-target column; the lagged copy is named
            `<name>_lag_1`.

    Returns:
        A new frame with the two columns appended. Rows that have no match
        for the previous month get NaN in `<name>_lag_1`.
    """
    group_cols = list(keys) + ["date_block_num"]
    means = (
        data.groupby(group_cols, as_index=False)["target"].mean()
        .rename(columns={"target": name})
    )
    # Relabel month t as t + 1 so that a merge on the month attaches the
    # previous month's mean to each row.
    previous = means.rename(columns={name: name + "_lag_1"})
    previous["date_block_num"] = previous["date_block_num"] + 1
    # Plain columns are used throughout (no set_index/reset_index round trip),
    # so no stray `level_0` column can end up in the result.
    data = data.merge(means, how="left", on=group_cols)
    return data.merge(previous, how="left", on=group_cols)


# Grid: for every month, the cartesian product of the items and the shops
# that have at least one sales record in that month.
grid = []
for date_block_num in train_df["date_block_num"].unique():
    month_sales = train_df[train_df.date_block_num == date_block_num]
    all_item_ids = month_sales["item_id"].unique()
    all_shop_ids = month_sales["shop_id"].unique()
    grid.append(np.array(list(product(all_item_ids, all_shop_ids, [date_block_num]))))

grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype="int32")

# Monthly target: daily counts summed per (item, shop, month).
agg = train_df.groupby(index_cols, as_index=False).agg({"item_cnt_day": "sum"})
agg.columns = index_cols + ["target"]

# Grid cells without any sales record get a target of 0.
all_data = pd.merge(grid, agg, how="left", on=index_cols).fillna(0)
all_data.sort_values(["date_block_num", "shop_id", "item_id"], inplace=True)

# Lagged monthly sales per (shop, item).
monthly_sales = pd.DataFrame(
    train_df.groupby(["shop_id", "item_id", "date_block_num"]).item_cnt_day.sum()
)

lags = monthly_sales
for months in lag_months:
    shifted = monthly_sales.reset_index(level="date_block_num")
    shifted["date_block_num"] = shifted["date_block_num"] + months
    shifted = shifted.set_index("date_block_num", append=True)
    shifted.columns = ["target_lag_%d" % months]
    # Outer join: a lag value must survive even when the (shop, item) pair has
    # no sales record in the month it is attached to. A left join onto the
    # current-month sales would silently drop those rows, leaving lag 0 for
    # every grid cell whose target is 0.
    lags = lags.join(shifted, how="outer")

all_data.set_index(["shop_id", "item_id", "date_block_num"], inplace=True)
# NOTE(review): this join also brings in `item_cnt_day`, which after the
# fillna below duplicates `target`. It is kept so the column set stays
# compatible, but it must not be used as a predictor.
all_data = all_data.join(lags, how="left")
all_data.reset_index(inplace=True)

# non-existing lags are set to 0 (makes little sense for the beginning)
all_data.fillna(0, inplace=True)

# Mean encodings, each with a single one-month lag. These merges run after the
# fillna above, so unmatched `*_lag_1` values stay NaN.
# mean encoding per month
all_data = _add_target_encoding(all_data, [], "target_enc")
# mean encoding for items
all_data = _add_target_encoding(all_data, ["item_id"], "item_target_enc")
# mean encoding for shops
all_data = _add_target_encoding(all_data, ["shop_id"], "shop_target_enc")

# overall mean encoding for items and shops (averaged over all months)
trg_means = all_data.groupby("item_id").target.mean()
all_data["item_target_enc_ave"] = all_data["item_id"].map(trg_means)
trg_means = all_data.groupby("shop_id").target.mean()
all_data["shop_target_enc_ave"] = all_data["shop_id"].map(trg_means)

# last recorded month (date_block_num 33), without the month column
all_data_33 = all_data[all_data.date_block_num == 33]
all_data_33 = all_data_33.drop(["date_block_num"], axis=1)

In [3]:
# Capture the source of input cell 2 (the data-preparation cell) as the macro
# `__prepare_data`; `-q` suppresses the printout of the macro body.
# NOTE: the trailing 2 is an execution count, so this only captures the right
# code when that cell was the second one executed in the session.
%macro -q __prepare_data 2

In [4]:
# Persist the macro in IPython's store so that another session can restore it
# with `%store -r __prepare_data`.
%store __prepare_data

Stored '__prepare_data' (Macro)


In [5]:
# introduce categories
from sklearn.preprocessing import LabelEncoder

# Split each category name on '-': the first piece is the type, the second
# piece (or the first, when there is no second) is the subtype. Both are
# label-encoded into integer codes.
name_parts = categories["item_category_name"].str.split("-")
type_names = name_parts.map(lambda parts: parts[0].strip())
subtype_names = name_parts.map(
    lambda parts: (parts[0] if len(parts) <= 1 else parts[1]).strip()
)
categories = pd.DataFrame(
    {
        "item_category_id": categories["item_category_id"],
        "type_code": LabelEncoder().fit_transform(type_names),
        "subtype_code": LabelEncoder().fit_transform(subtype_names),
    }
)

# Attach the codes to every item, then the item attributes to every grid row.
items = categories.merge(items, how="right", on=["item_category_id"])
all_data = all_data.merge(items, how="left", on=["item_id"])
all_data = all_data.astype(
    {"item_category_id": np.int8, "type_code": np.int8, "subtype_code": np.int8}
)

# Monthly mean encodings, each followed by its one-month lag. Every entry is
# (grouping columns besides the month, name of the encoded column); the order
# fixes the order of the appended columns.
encoding_specs = [
    # item category
    (["item_category_id"], "item_cat_target_enc"),
    # shop and item category
    (["shop_id", "item_category_id"], "shop_item_cat_target_enc"),
    # shop and type_code (super category)
    (["shop_id", "type_code"], "shop_super_cat_target_enc"),
    # type_code (super category)
    (["type_code"], "super_cat_target_enc"),
]
for group_keys, enc_name in encoding_specs:
    merge_keys = group_keys + ["date_block_num"]
    month_means = (
        all_data.groupby(merge_keys, as_index=False)["target"].mean()
        .rename(columns={"target": enc_name})
    )
    # Relabel month t as t + 1 so the merge attaches the previous month's mean.
    prev_means = month_means.rename(columns={enc_name: enc_name + "_lag_1"})
    prev_means["date_block_num"] = prev_means["date_block_num"] + 1
    all_data = all_data.merge(month_means, how="left", on=merge_keys)
    all_data = all_data.merge(prev_means, how="left", on=merge_keys)

# last recorded month (date_block_num 33), without the month column
all_data_33 = all_data.loc[all_data["date_block_num"] == 33].drop(
    columns=["date_block_num"]
)

In [6]:
# Capture the source of input cell 5 (the category-features cell) as the macro
# `__add_categories`; `-q` suppresses the printout of the macro body.
# NOTE: the trailing 5 is an execution count, so this only captures the right
# code when that cell was the fifth one executed in the session.
%macro -q __add_categories 5

In [7]:
# Persist the macro in IPython's store so that another session can restore it
# with `%store -r __add_categories`.
%store __add_categories

Stored '__add_categories' (Macro)


In [8]:
def prepare_submission(model, predictors, aliases, output, clip_min=0, clip_max=20):
    """Predict monthly item counts and write them to a submission CSV.

    Features are read from the notebook-level frame ``all_data_33`` and the
    rows to predict (``ID``, ``shop_id``, ``item_id``) from the notebook-level
    ``test_df``. Neither frame is modified.

    Args:
        model: anything implementing ``predict(X)``.
        predictors: ordered list of the predictor names used by the model.
        aliases: dictionary mapping each predictor name to the name of the
            corresponding column in ``all_data_33``.
        output: csv file name.
        clip_min: lower bound applied to the rounded predictions, or ``None``
            for no lower bound.
        clip_max: upper bound applied to the rounded predictions, or ``None``
            for no upper bound.

    Raises:
        KeyError: if a predictor has no entry in ``aliases`` or the aliased
            column is missing from ``all_data_33``.

    The written file has the columns ``ID`` and ``item_cnt_month``. Test rows
    whose (shop_id, item_id) pair is absent from ``all_data_33`` get 0.
    """
    # Select the model inputs and rename them to the names the model expects.
    X = all_data_33.loc[:, [aliases[colname] for colname in predictors]]
    X.columns = predictors
    y = model.predict(X)

    # Keep the predictions in a local frame holding only the join keys, so
    # the shared feature table is not mutated and only one column is joined.
    predictions = all_data_33.loc[:, ["shop_id", "item_id"]].copy()
    predictions["item_cnt_month"] = y

    test = test_df.join(
        predictions.set_index(["shop_id", "item_id"]), on=["shop_id", "item_id"]
    )

    submission = test.loc[:, ["ID", "item_cnt_month"]].set_index("ID")
    # NOTE(review): the lower bound of 0 assumes the target is a non-negative
    # count evaluated on the range [0, 20] - confirm against the competition
    # rules.
    submission["item_cnt_month"] = (
        submission["item_cnt_month"]
        .fillna(0)
        .round()
        .astype("int64")
        .clip(lower=clip_min, upper=clip_max)
    )
    submission.to_csv(output)

In [9]:
# Capture the source of input cell 8 (the `prepare_submission` definition) as
# the macro `__prepare_submission`; `-q` suppresses the printout of the body.
# NOTE: the trailing 8 is an execution count, so this only captures the right
# code when that cell was the eighth one executed in the session.
%macro -q __prepare_submission 8

In [10]:
# Persist the macro in IPython's store so that another session can restore it
# with `%store -r __prepare_submission`.
%store __prepare_submission

Stored '__prepare_submission' (Macro)


In [11]:
# With no arguments, %store lists everything currently held in the store
# (here: the three macros saved above).
%store

Stored variables and their in-db values:
__add_categories                 -> IPython.macro.Macro('# introduce categories\nfrom 
__prepare_data                   -> IPython.macro.Macro('import pandas as pd\nimport n
__prepare_submission             -> IPython.macro.Macro('def prepare_submission(model,
