# Prepare data

In [1]:
import pandas as pd

train_df = pd.read_csv("./sales_train.csv.gz")
test_df  = pd.read_csv("./test.csv.gz")

categories = pd.read_csv('./item_categories.csv')
items = pd.read_csv('./items.csv')

Build scafold grid indexed by all combinatinos of shops and items observed every month

In [2]:
import pandas as pd
import numpy as np
from itertools import product

# create grid of all possible shops and items remembered to be sold there
grid = []
for date_block_num in train_df["date_block_num"].unique():
    all_item_ids = train_df[train_df.date_block_num==date_block_num]["item_id"].unique()
    all_shop_ids = train_df[train_df.date_block_num==date_block_num]["shop_id"].unique()
    grid.append( np.array(list(product(*[all_item_ids, all_shop_ids, [date_block_num]]))) )

grid = pd.DataFrame(np.vstack(grid), columns = ["item_id", "shop_id", "date_block_num"], dtype="int32")

Augment the grid with counts of monthly sales from the training set and clip those to 20 as suggested

In [3]:
agg = train_df.groupby(["item_id", "shop_id", "date_block_num"], as_index=False).agg({"item_cnt_day":"sum"})

agg.columns = ["item_id", "shop_id", "date_block_num", "target"] # this is all that all_data will have for now!

all_data = pd.merge(grid, agg, how="left", on=["item_id", "shop_id", "date_block_num"])
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

all_data['date_block_num'] = all_data['date_block_num'].astype(np.int8)
all_data['shop_id'] = all_data['shop_id'].astype(np.int8)
all_data['item_id'] = all_data['item_id'].astype(np.int16)

all_data['target'] = (all_data['target']
                      .fillna(0)
                      .clip(0,20)
                      .astype(np.float16))

Extend the data frame to the target month usgin shop/item combinations of interest from the test set

In [4]:
test_df['date_block_num'] = 34
test_df['target'] = 0.
test_df['date_block_num'] = test_df['date_block_num'].astype(np.int8)
test_df['shop_id'] = test_df['shop_id'].astype(np.int8)
test_df['item_id'] = test_df['item_id'].astype(np.int16)
all_data = pd.concat([all_data, test_df[['date_block_num','shop_id','item_id','target']]],
                     ignore_index=True,
                     sort=False,
                     keys=['date_block_num','shop_id','item_id'])

Handle categories

In [5]:
from sklearn.preprocessing import LabelEncoder

categories['split'] = categories['item_category_name'].str.split('-')
categories['type'] = categories['split'].map(lambda x: x[0].strip())
categories['type_code'] = LabelEncoder().fit_transform(categories['type'])

categories['subtype'] = categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
categories['subtype_code'] = LabelEncoder().fit_transform(categories['subtype'])
categories = categories[['item_category_id','type_code', 'subtype_code']]

items = pd.merge(categories, items, how="right", on=["item_category_id"])

all_data = pd.merge(all_data, items, how='left', on=['item_id'])
all_data['item_category_id'] = all_data['item_category_id'].astype(np.int8)
all_data['type_code'] = all_data['type_code'].astype(np.int8)
all_data['subtype_code'] = all_data['subtype_code'].astype(np.int8)

# Introduce mean encodings

In [6]:
# remember, as long as month ("date_block_num") is in the groupby list, month #34 doesn't affect the means
def add_encoded_target_mean(data, groupby, name):
    aggregated_sales = data.groupby(groupby).target.mean()
    aggregated_sales.name = name
    return data.merge(aggregated_sales.reset_index(), how = "left", on = groupby)

In [7]:
# total sales for a month
all_data = add_encoded_target_mean(all_data, ["date_block_num"], "total_monthly_sales")

# total sales for a month per shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "shop_id"], "total_monthly_shop_sales")

# total sales for a month per item
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_id"], "total_monthly_item_sales")

# total sales for a month per category
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_category_id"], "total_monthly_category_sales")

# total sales for a month per supercategory
all_data = add_encoded_target_mean(all_data, ["date_block_num", "type_code"], "total_monthly_supercategory_sales")

# total sales for a month per subcategory
all_data = add_encoded_target_mean(all_data, ["date_block_num", "subtype_code"], "total_monthly_subcategory_sales")

# total sales for a month per category in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "item_category_id", "shop_id"], "total_monthly_shop_category_sales")

# total sales for a month per supercategory in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "type_code", "shop_id"], "total_monthly_shop_supercategory_sales")

# total sales for a month per subcategory in a shop
all_data = add_encoded_target_mean(all_data, ["date_block_num", "subtype_code", "shop_id"], "total_monthly_shop_subcategory_sales")

# Introduce lags

In [8]:
def construct_lags(data, lags, columns):
    index = ["date_block_num", "shop_id", "item_id"]
    lagged_data = []
    for l in lags:
        lag = data[index + columns].copy()
        lag.columns = index + [col+"_lag_"+str(l) for col in columns]
        lag["date_block_num"] += l
        lagged_data.append(lag)

    if not lags:
        return data[["date_block_num"]]
    
    retval = lagged_data.pop()

    for l in lagged_data:
        retval = pd.merge(retval, l, how='left', on=index)

    return retval.fillna(0) # non-existing lags are set to 0

In [9]:
trg_lags = construct_lags(all_data, [1,2,3,6,12], ["target"])

mean_lags = construct_lags(all_data,
                          [1],
                          ["total_monthly_sales",
                           "total_monthly_shop_sales",
                           "total_monthly_item_sales",
                           "total_monthly_category_sales",
                           "total_monthly_supercategory_sales",
                           "total_monthly_subcategory_sales",
                           "total_monthly_shop_category_sales",
                           "total_monthly_shop_supercategory_sales",
                           "total_monthly_shop_subcategory_sales"
                          ])

all_data = pd.merge(all_data, trg_lags,  how='left', on=['date_block_num','shop_id','item_id'])
all_data = pd.merge(all_data, mean_lags, how='left', on=['date_block_num','shop_id','item_id'])

In [10]:
from sklearn.model_selection import train_test_split

train = all_data[(all_data.date_block_num>12)&(all_data.date_block_num<34)].fillna(0)

predictors = [
    "target_lag_1",
    "target_lag_2",
    "target_lag_3",
    "target_lag_6",
    "target_lag_12",
    "total_monthly_sales_lag_1",
    "total_monthly_shop_sales_lag_1",
    "total_monthly_item_sales_lag_1",
    "total_monthly_category_sales_lag_1",
    "total_monthly_supercategory_sales_lag_1",
    "total_monthly_subcategory_sales_lag_1",
    "total_monthly_shop_category_sales_lag_1",
    "total_monthly_shop_supercategory_sales_lag_1",    
    "total_monthly_shop_subcategory_sales_lag_1"]

X_train, X_test, y_train, y_test = \
    train_test_split(train[predictors], train.target, test_size=0.2, random_state=123)

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

#model2 = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=6) #len(predictors)/2
#model2.fit(X_train, y_train)

from joblib import dump, load
joblib.dump(model2,"model2.joblib")

ypred = model2.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, ypred))
print("RMSE: %f" % (rmse))

NameError: name 'joblib' is not defined

In [12]:
%%time
from sklearn.ensemble import RandomForestRegressor
full_model = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=6) #len(predictors)/2
full_model.fit(train[predictors], train.target)

CPU times: user 31min 43s, sys: 23.4 s, total: 32min 6s
Wall time: 8min 25s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=6,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [13]:
from tabulate import tabulate
headers = ["name", "score"]
values = sorted(zip(X_test.columns, full_model.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                              score
total_monthly_item_sales_lag_1                0.381233
total_monthly_shop_category_sales_lag_1       0.0994527
total_monthly_shop_subcategory_sales_lag_1    0.0849444
target_lag_1                                  0.0674132
total_monthly_shop_sales_lag_1                0.0553516
total_monthly_shop_supercategory_sales_lag_1  0.0519963
total_monthly_category_sales_lag_1            0.0509722
target_lag_2                                  0.0423134
total_monthly_subcategory_sales_lag_1         0.0386334
total_monthly_supercategory_sales_lag_1       0.031973
target_lag_3                                  0.0281671
target_lag_6                                  0.0248578
total_monthly_sales_lag_1                     0.0233411
target_lag_12                                 0.0193503


In [14]:
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = full_model.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"].round().astype("int64")

test[test.item_cnt_month>20] = 20
test.to_csv("macro2.csv")

In [11]:
%macro -q __prepare_data 2-10

In [12]:
%store __prepare_data

Stored '__prepare_data' (Macro)
