In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from freetextmethods import word_counts
from downcast import downcast_df

In [2]:
# Notebook-wide configuration and raw data loading.
# Treat +/-inf as missing values in pandas NA handling.
# NOTE(review): this option is deprecated in newer pandas releases — confirm the pinned version.
pd.options.mode.use_inf_as_na = True
# NOTE(review): absolute, user-specific paths; the notebook runs unchanged only on this machine.
DATA_FOLDER = '/home/roger/code/kaggle/predict-future-sales/data/'
# Not referenced anywhere else in the visible notebook.
PRED_FOLDER = '/home/roger/code/kaggle/predict-future-sales/predictions/'
# Relative folder (resolved against the working directory) the submission CSV is written to.
SUBMISSION_FOLDER = "predictions/"
# Key columns identifying one (month, shop, item) row; used for grouping and joins below.
index_vars = ["date_block_num", "shop_id", "item_id"]

# NOTE(review): item_categories / item_categories_en are both read from
# 'item_categories_en.csv', and shops / shops_en both from 'shops_en.csv', so the
# "*_name_en" columns merged in below duplicate the "*_name" columns. Confirm
# whether the non-"_en" source files were intended for the first of each pair.
transactions       = pd.read_csv(DATA_FOLDER + 'sales_train.csv')
items              = pd.read_csv(DATA_FOLDER + 'items.csv')
item_categories    = pd.read_csv(DATA_FOLDER + 'item_categories_en.csv')
item_categories_en = pd.read_csv(DATA_FOLDER + 'item_categories_en.csv')
shops              = pd.read_csv(DATA_FOLDER + 'shops_en.csv')
shops_en           = pd.read_csv(DATA_FOLDER + 'shops_en.csv')
test_ids           = pd.read_csv(DATA_FOLDER + 'test.csv')
sample             = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')

# Adding English translations to item_categories and shops
# The "_en" frame's name column is renamed with an "_en" suffix and joined on the id column.
item_categories = item_categories.merge(
    item_categories_en[["item_category_id", "item_category_name"]]
        .rename(columns={"item_category_name":"item_category_name_en"}),
    on="item_category_id")
# Not the last expression of the cell, so this preview is evaluated but never displayed.
item_categories.head()

shops = shops.merge(
    shops_en[["shop_id", "shop_name"]]
        .rename(columns={"shop_name":"shop_name_en"}),
    on="shop_id")

In [3]:
# Columns that must not reach the model matrix: join helpers (first_sold_month,
# month-1 .. month-12), raw text fields and bookkeeping columns.
ignore_vars = ["first_sold_month"]
ignore_vars.extend(f"month-{lag}" for lag in range(1, 13))
ignore_vars.extend([
    "item_cnt_month_na",
    "item_cnt_month_orig",
    "item_name",
    "item_category_name",
    "item_category_name_en",
    "shop_name",
    "shop_name_en",
    "sales_cnt_month",
])

# Also exclude the "<name>_na" variant of every column listed above.
ignore_vars = ignore_vars + [f"{name}_na" for name in ignore_vars]

# Columns passed to downcast_df and CatBoost as categorical features.
cat_vars = [
    "shop_id", "item_id",
    "month", "season", "quarter",
    "item_category_id",
]
# cat_vars += [c for c in items_bow.columns if c.startswith("item_bow_")]
# cat_vars += [c for c in items_cat_bow.columns if c.startswith("cat_bow_")]
# cat_vars += [c for c in shops_bow.columns if c.startswith("shop_bow_")]

In [4]:
def see(shop_id, item_id):
    """Return the rows of the global ``sales_by_month`` for one shop/item pair."""
    same_item = sales_by_month["item_id"] == item_id
    same_shop = sales_by_month["shop_id"] == shop_id
    return sales_by_month[same_item & same_shop]

In [5]:
# Print the column dtypes of each raw table, each followed by a blank line.
for frame in (transactions, items, item_categories, shops):
    print(frame.dtypes, "\n")

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object 

item_name           object
item_id              int64
item_category_id     int64
dtype: object 

item_category_name       object
item_category_id          int64
item_category_name_en    object
dtype: object 

shop_name       object
shop_id          int64
shop_name_en    object
dtype: object 



## Contents

1. Features before split
2. Validation split
3. Features after split
4. CatBoost
    1. Feature importance
5. Model prediction analysis

9. Submission

## 1. Feature Creation

In [6]:
# item_name_length: number of characters in the item name after removing
# leading/trailing whitespace. The vectorised .str accessor replaces the
# per-row Python lambda and yields NaN for a missing name instead of raising.
items["item_name_length"] = items["item_name"].str.strip().str.len()
# items["item_name_length"].value_counts().sort_index().plot()
# plt.title("Item name length frequencies")
# plt.plot()

In [7]:
# Build word-count features from the free-text name columns with the project
# helper word_counts (imported from freetextmethods). Each call is unpacked into
# a frame and a second "ref" object.
# NOTE(review): the third argument looks like a prefix for the generated column
# names and words_to_keep like a vocabulary cap — confirm in freetextmethods.
# NOTE(review): items_bow, items_cat_bow and shops_bow are reassigned to the
# plain lookup tables in the following cell, so these results are not used downstream.
items_bow, item_bow_ref = word_counts(items, "item_name", "item_bow_", words_to_keep=100)
items_cat_bow, items_cat_bow_ref = word_counts(item_categories, "item_category_name", "cat_bow_")
shops_bow, shops_bow_ref = word_counts(shops, "shop_name", "shop_bow_")

In [8]:
def int_to_bool(df, search_str="_bow_"):
    """Cast every column whose name contains ``search_str`` to ``bool``.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.
    """
    for name in df.columns:
        if name.find(search_str) == -1:
            # Name does not contain the marker: leave the column untouched.
            continue
        df[name] = df[name].astype(bool)

    return df

# Cast the columns whose names contain "_bow_" to bool in each feature frame.
items_bow = int_to_bool(items_bow)
items_cat_bow = int_to_bool(items_cat_bow)
shops_bow = int_to_bool(shops_bow)

# testing effect without BOW
# NOTE(review): these assignments replace the three frames produced above with
# the plain lookup tables, so the word_counts / int_to_bool results are discarded
# for the rest of the notebook. Remove them to re-enable the bag-of-words features.
items_bow = items
items_cat_bow = item_categories
shops_bow = shops

In [9]:
# Aggregate by sale month
# One row per (month, shop, item): item_cnt_month is the summed daily count,
# sales_cnt_month the number of transaction rows that went into it.
sales_by_month = (
    transactions
    .groupby(index_vars, as_index=False)
    .agg(item_cnt_month=("item_cnt_day", "sum"),
         sales_cnt_month=("item_cnt_day", "count"))
)

# Keep the raw monthly total, cap the target to [0, 20], then keep only the
# rows whose capped count is still positive.
sales_by_month["item_cnt_month_orig"] = sales_by_month["item_cnt_month"]
sales_by_month["item_cnt_month"] = sales_by_month["item_cnt_month"].clip(lower=0, upper=20)
sales_by_month = sales_by_month.loc[sales_by_month["item_cnt_month"] > 0]

In [10]:
# Adding synthetic 0's
# For every month up to VAL_MONTH, build the full grid of (shops with sales that
# month) x (items sold that month). Grid cells without a recorded sale become
# explicit zero-count rows.
VAL_MONTH = 33

# Collect the monthly grids in a list and concatenate once: growing a frame with
# concat inside the loop copies all previous rows on every iteration.
monthly_grids = []
for m in range(0, VAL_MONTH + 1):
    month_sales = sales_by_month[sales_by_month["date_block_num"] == m]
    #   items sold cross product shops with sales during month m
    cross_join = pd.merge(left=pd.Series(month_sales["shop_id"].unique(), name="shop_id"),
                          right=pd.Series(month_sales["item_id"].unique(), name="item_id"),
                          how="cross")
    cross_join["date_block_num"] = m
    monthly_grids.append(cross_join)

# ignore_index gives the skeleton a clean 0..n-1 index; the index is not used by
# the key-based merge below.
sales_by_month_skeleton = pd.concat(monthly_grids, axis="rows", ignore_index=True)
sales_by_month_skeleton["item_cnt_month_zeros"] = 0.0

# Outer join keeps every observed row and every grid cell. The row-wise max then
# yields the observed count where one exists and 0.0 for grid-only rows
# (max skips the missing side).
sales_by_month = sales_by_month.merge(
            sales_by_month_skeleton,
            on=index_vars,
            how="outer")
sales_by_month["item_cnt_month"] = sales_by_month[["item_cnt_month", "item_cnt_month_zeros"]].max(axis="columns")
sales_by_month = sales_by_month.drop(columns="item_cnt_month_zeros")

# Free the large temporaries; kernel memory persists across cells.
del cross_join, month_sales, monthly_grids, sales_by_month_skeleton

In [11]:
# Appending test set
TEST_MONTH = 34

# Test rows: drop the submission ID, label them with TEST_MONTH and give them
# an unknown (NaN) target.
test_x = (
    test_ids
    .drop(columns="ID")
    .assign(date_block_num=TEST_MONTH, item_cnt_month=np.nan)
)
# Move the test index labels past the current maximum label of sales_by_month.
test_x.index = test_x.index + (sales_by_month.index.max() + 1)

In [12]:
# Joining training and test sets
# Stack the test rows underneath the existing monthly rows.
sales_by_month = pd.concat([sales_by_month, test_x], axis="rows")

In [13]:
# Months since first sold
# first_sold_month: earliest date_block_num with a positive item_cnt_month for
# each (shop_id, item_id) pair; pairs with no positive row get NaN after the left join.
first_sold_month = (sales_by_month[sales_by_month["item_cnt_month"] > 0]
                    .groupby(["shop_id", "item_id"])
                    .agg({"date_block_num":"min"})
                    .rename(columns={"date_block_num":"first_sold_month"})
                    .reset_index())
# NOTE(review): the reason for the cast to object is not visible here (possibly
# to steer the later downcast_df step) — confirm before changing it.
first_sold_month = first_sold_month.astype({"first_sold_month":object})
sales_by_month = sales_by_month.merge(first_sold_month, on=["shop_id", "item_id"], how="left")


# 20210904 - See if model performs better if it thinks month 0 is first month of sale
# sales_by_month["months_since_first_sale"] = sales_by_month["date_block_num"]-sales_by_month["first_sold_month"]
# Cannot tell how many months item was sold before dataset started. Impute with nan.
# The feature is NaN when first_sold_month is 0 or missing, otherwise the row's
# month minus the pair's first sale month (negative before the first sale).
# NOTE(review): first_sold_month is derived from item_cnt_month of all months,
# including the row's own month. A value of 0 therefore only occurs on a row
# whose own count is positive, and negative values only on rows before any sale.
# Test rows have a NaN target, so they can never take the value 0. This looks
# like target leakage into training/validation — verify.
sales_by_month["months_since_first_sale"] = np.where(
    sales_by_month["first_sold_month"] > 0,
    sales_by_month["date_block_num"]-sales_by_month["first_sold_month"],
    np.nan)

# (sales_by_month[sales_by_month["first_sold_month"]!=0]["months_since_first_sale"]
# .value_counts()
# .sort_index()
# .plot(title="Sales by month after first sale"))
# plt.show()

In [14]:
# Months since last sale excluding current month
# For every month m, look up each (shop_id, item_id) pair's most recent month
# strictly before m that has a positive count.

# The positive-count filter does not depend on m, so it is applied once up front.
sold_rows = sales_by_month.loc[sales_by_month["item_cnt_month"] > 0,
                               ["shop_id", "item_id", "date_block_num"]]

# Collect the per-month lookups and concatenate once, instead of growing a frame
# with concat (and manual index shifting) inside the loop.
last_sold_frames = []
for m in range(TEST_MONTH+1):
    last_sold = (sold_rows[sold_rows["date_block_num"] < m]
                 .groupby(["shop_id", "item_id"])
                 .agg({"date_block_num":"max"})
                 .rename(columns={"date_block_num":"last_sold_month"})
                 .reset_index())
    if last_sold.empty:
        # No earlier sales exist (always the case for m == 0); nothing to record.
        continue
    last_sold["date_block_num"] = m
    last_sold_frames.append(last_sold)

last_sold_month = pd.concat(last_sold_frames, axis="rows", ignore_index=True)

# Rows without any earlier sale keep NaN for both new columns.
sales_by_month = sales_by_month.merge(last_sold_month, on=index_vars, how="left")
sales_by_month["months_since_last_sale"] = sales_by_month["date_block_num"]-sales_by_month["last_sold_month"]
del sold_rows, last_sold, last_sold_frames, last_sold_month

In [15]:
# Adding lagged items sold
# item_cnt_month-i counts lagged by i months.
# First pass: helper columns month-1 .. month-12 holding the date_block_num that
# lies i months before the row's own month.
for i in range(1,13):
    sales_by_month[f"month-{i}"] = sales_by_month["date_block_num"]-i

# Second pass: self-join on (month-i, shop_id, item_id) to pull that earlier
# month's item_cnt_month in as item_cnt_month-i. The frame is re-bound on every
# iteration, so each join runs against the result of the previous one.
for i in range(1,13):
    left_vars = [f"month-{i}", "shop_id", "item_id"]

    sales_by_month = sales_by_month.merge(
                sales_by_month[index_vars+["item_cnt_month"]].rename(columns={"item_cnt_month":f"item_cnt_month-{i}"}),
                left_on=left_vars,
                right_on=index_vars,
                how="left",
                suffixes=("","_y"))

#   Replaces nan with 0 if item has sold before month-i
    # Where the lagged month is at or after first_sold_month, keep the lag with
    # NaN turned into 0 (nan_to_num); every other row gets NaN.
    sales_by_month[f"item_cnt_month-{i}"] = np.select(
        condlist=[sales_by_month[f"month-{i}"] >= sales_by_month[f"first_sold_month"]],
        choicelist=[np.nan_to_num(sales_by_month[f"item_cnt_month-{i}"])],
        default=np.nan
    )

    # The right-hand date_block_num arrives suffixed as date_block_num_y and is not needed.
    sales_by_month = sales_by_month.drop(columns="date_block_num_y")

In [16]:
# Adding 1 month lagged number of sales

# Self-join on (month-1, shop_id, item_id): last_mth_sale_cnt is the previous
# month's sales_cnt_month for the same shop/item. The suffixed copy of the
# right-hand month key is dropped straight away.
sales_by_month = (
    sales_by_month
    .merge(
        sales_by_month[[*index_vars, "sales_cnt_month"]]
            .rename(columns={"sales_cnt_month": "last_mth_sale_cnt"}),
        left_on=["month-1", "shop_id", "item_id"],
        right_on=index_vars,
        how="left",
        suffixes=("", "_y"))
    .drop(columns=["date_block_num_y"])
)
# "sales_cnt_month", 

In [17]:
def encode_mean(df, agg_levels, target_var:str ,time_var:str ,var_name:str, max_month=None):
    """Add an expanding mean encoding of ``target_var`` per ``agg_levels`` group.

    For each period ``m`` in ``range(max_month)`` the group means are computed
    from the rows of ``df`` whose ``time_var`` is ``<= m`` and attached to the
    rows of period ``m + 1``, so a row's encoding only uses earlier periods.

    Parameters
    ----------
    df : DataFrame holding ``agg_levels``, ``target_var`` and ``time_var``.
    agg_levels : column name, or list/tuple of column names, to group by.
    target_var : column whose mean is encoded.
    time_var : integer period column (e.g. ``date_block_num``).
    var_name : name of the new encoded column.
    max_month : number of leading periods to encode; defaults to the
        notebook-level ``TEST_MONTH`` when omitted.

    Returns
    -------
    A new DataFrame: ``df`` left-joined with the encoded column. Rows without
    history for their group/period get NaN.
    """
    if isinstance(agg_levels, str):
        agg_levels = [agg_levels]
    else:
        agg_levels = list(agg_levels)

    if max_month is None:
        max_month = TEST_MONTH

    monthly_means = []
    for m in range(max_month):
        # Mask on df itself (not on a notebook global) so the function works for
        # any frame passed in.
        month_means = (df[df[time_var] <= m]
                       .groupby(agg_levels)
                       .agg({target_var:"mean"})
                       .reset_index())
        if month_means.empty:
            continue
        month_means[time_var] = m + 1
        monthly_means.append(month_means)

    if not monthly_means:
        # No history at all: every row gets a missing encoding.
        return df.assign(**{var_name: np.nan})

    # Single concat instead of growing a frame inside the loop.
    means = (pd.concat(monthly_means, axis="rows", ignore_index=True)
             .rename(columns={target_var:var_name}))

    return df.merge(means, on=agg_levels+[time_var], how="left")

In [18]:
# Not referenced anywhere else in the visible notebook.
start_month = "01.01.2013"
# Season label per zero-padded calendar month.
# NOTE(review): December-February map to "summer", i.e. southern-hemisphere
# labels — confirm this is intended for the data.
seasons={
    "01":"summer", "02":"summer",
    "03":"autumn", "04":"autumn", "05":"autumn",
    "06":"winter", "07":"winter", "08":"winter",
    "09":"spring", "10":"spring", "11":"spring",
    "12":"summer",
}

# date_block_num counts months from 0, so block % 12 + 1 is the calendar month
# ("01".."12"); quarter is derived from the month number and stored as a string.
sales_by_month["month"] = sales_by_month["date_block_num"].map(lambda block: f"{block%12 + 1:02}")
sales_by_month["season"] = sales_by_month["month"].map(seasons)
sales_by_month["quarter"] = ((sales_by_month["month"].map(int) + 2) // 3).map(str)

# fig, ax = plt.subplots(1, 3)
# fig.set_size_inches(12,6)
# ax[0].hist(sales_by_month["month"].sort_values())
# ax[0].set_title(label="Frequencies by month")
# ax[1].hist(sales_by_month["season"].sort_values())
# ax[1].set_title(label="Frequencies by season")
# ax[2].hist(sales_by_month["quarter"].sort_values())
# ax[2].set_title(label="Frequencies by quarter")
# plt.plot()

## 2. Validation Split

In [19]:
# import sys
# import importlib
# importlib.reload(sys.modules['validationsplit'])
from validationsplit import split_train_val, split_x_y, val_score

In [20]:
# Run every frame through the project helper downcast_df (imported from
# downcast), passing cat_vars as the categorical columns. Each call returns the
# frame plus a second object stored under a *_na name.
# NOTE(review): judging by the helper's printed message, the second return value
# presumably describes integer columns containing NA values — confirm in downcast.
sales_by_month, sales_by_month_na  = downcast_df(sales_by_month, categorical_cols=cat_vars)
items_bow, items_na = downcast_df(items_bow, categorical_cols=cat_vars)
items_cat_bow , items_cat_na= downcast_df(items_cat_bow, categorical_cols=cat_vars)
shops_bow, shops_na = downcast_df(shops_bow, categorical_cols=cat_vars)

Number of integer columns with NA values:20
Number of integer columns with NA values:0
Number of integer columns with NA values:0
Number of integer columns with NA values:0


In [21]:
# Attach item, item-category and shop attributes to every monthly row.
ds = sales_by_month.merge(items_bow, on="item_id", how="left")
ds = ds.merge(items_cat_bow, on="item_category_id", how="left")
ds = ds.merge(shops_bow, on="shop_id", how="left")
# Keep only the columns that are not listed in ignore_vars.
ds = ds.loc[:, [col for col in ds.columns if col not in ignore_vars]]

In [22]:
# Split the column names by whether they carry the "_bow_" marker.
bow_cols = [name for name in ds.columns if "_bow_" in name]
not_bow_cols = [name for name in ds.columns if "_bow_" not in name]

In [23]:
# Split with the project helpers from validationsplit. The split months come
# from the notebook constants VAL_MONTH (33) and TEST_MONTH (34) instead of
# repeating the literals, so changing a constant moves the split with it.
train, val = split_train_val(ds, VAL_MONTH)
# Only the second part of the TEST_MONTH split is needed; `_` is deleted in the next cell.
_, test = split_train_val(ds, TEST_MONTH)
train_x, train_y = split_x_y(train, "item_cnt_month")
val_x, val_y = split_x_y(val, "item_cnt_month")
# From here on test_x is the test feature matrix; the target part is discarded.
test_x, _ = split_x_y(test, "item_cnt_month")

In [24]:
# Release the intermediate split frames; only the x/y parts are used below.
del _, train, val

## 4. Fitting catboost

In [25]:
import catboost as cb
# Relative folder (resolved against the working directory) for saved CatBoost
# models and the feature-importance CSV.
MODEL_FOLDER = "catboost_models/"

In [26]:
# CatBoost regressor with an RMSE objective, configured to train on GPU.
model = cb.CatBoostRegressor(
    iterations=1000,
    learning_rate=0.04,
    loss_function="RMSE",
    task_type="GPU",
    boosting_type="Plain",
    max_ctr_complexity=2,
)

In [27]:
# The monthly counts were clipped to [0, 20] when they were built, so the
# targets fit in an unsigned 8-bit integer.
train_y = train_y.astype("uint8")
val_y = val_y.astype("uint8")

In [28]:
# Wrap features and labels in CatBoost pools, declaring cat_vars as categorical.
train_pool = cb.Pool(data=train_x, label=train_y, cat_features=cat_vars)
val_pool = cb.Pool(data=val_x, label=val_y, cat_features=cat_vars)

In [29]:
MODEL_NAME = "cb_20210905_mean_encode"
# Train with the validation pool as evaluation set, then store the model in
# CatBoost's binary format under MODEL_FOLDER.
model.fit(train_pool, eval_set=val_pool)
model.save_model(f"{MODEL_FOLDER}{MODEL_NAME}", format="cbm")
# model.load_model(MODEL_FOLDER+MODEL_NAME, format="cbm")

0:	learn: 1.2175229	test: 1.1283243	best: 1.1283243 (0)	total: 1.11s	remaining: 18m 32s
1:	learn: 1.2094371	test: 1.1207274	best: 1.1207274 (1)	total: 2.23s	remaining: 18m 31s
2:	learn: 1.1960949	test: 1.1121497	best: 1.1121497 (2)	total: 2.94s	remaining: 16m 18s
3:	learn: 1.1833953	test: 1.1043023	best: 1.1043023 (3)	total: 3.75s	remaining: 15m 35s
4:	learn: 1.1715260	test: 1.0971872	best: 1.0971872 (4)	total: 4.56s	remaining: 15m 7s
5:	learn: 1.1601537	test: 1.0907342	best: 1.0907342 (5)	total: 5.27s	remaining: 14m 32s
6:	learn: 1.1498803	test: 1.0850927	best: 1.0850927 (6)	total: 6.08s	remaining: 14m 21s
7:	learn: 1.1397310	test: 1.0798344	best: 1.0798344 (7)	total: 6.88s	remaining: 14m 12s
8:	learn: 1.1305435	test: 1.0754234	best: 1.0754234 (8)	total: 7.68s	remaining: 14m 6s
9:	learn: 1.1216290	test: 1.0713979	best: 1.0713979 (9)	total: 8.44s	remaining: 13m 56s
10:	learn: 1.1136130	test: 1.0674154	best: 1.0674154 (10)	total: 9.32s	remaining: 13m 58s
11:	learn: 1.1055525	test: 1.064

92:	learn: 0.9183654	test: 1.0323223	best: 1.0315189 (91)	total: 1m 22s	remaining: 13m 24s
93:	learn: 0.9174674	test: 1.0309040	best: 1.0309040 (93)	total: 1m 23s	remaining: 13m 25s
94:	learn: 0.9170221	test: 1.0312406	best: 1.0309040 (93)	total: 1m 24s	remaining: 13m 25s
95:	learn: 0.9147539	test: 1.0290364	best: 1.0290364 (95)	total: 1m 25s	remaining: 13m 26s
96:	learn: 0.9145335	test: 1.0289767	best: 1.0289767 (96)	total: 1m 26s	remaining: 13m 24s
97:	learn: 0.9142709	test: 1.0296101	best: 1.0289767 (96)	total: 1m 27s	remaining: 13m 23s
98:	learn: 0.9138937	test: 1.0299531	best: 1.0289767 (96)	total: 1m 28s	remaining: 13m 23s
99:	learn: 0.9117930	test: 1.0277086	best: 1.0277086 (99)	total: 1m 29s	remaining: 13m 24s
100:	learn: 0.9111985	test: 1.0259387	best: 1.0259387 (100)	total: 1m 30s	remaining: 13m 22s
101:	learn: 0.9109965	test: 1.0256922	best: 1.0256922 (101)	total: 1m 30s	remaining: 13m 19s
102:	learn: 0.9108137	test: 1.0256304	best: 1.0256304 (102)	total: 1m 31s	remaining: 1

181:	learn: 0.8468596	test: 0.9771113	best: 0.9771113 (181)	total: 2m 52s	remaining: 12m 55s
182:	learn: 0.8462159	test: 0.9761260	best: 0.9761260 (182)	total: 2m 53s	remaining: 12m 54s
183:	learn: 0.8460896	test: 0.9754528	best: 0.9754528 (183)	total: 2m 54s	remaining: 12m 54s
184:	learn: 0.8452561	test: 0.9737883	best: 0.9737883 (184)	total: 2m 55s	remaining: 12m 54s
185:	learn: 0.8445773	test: 0.9742319	best: 0.9737883 (184)	total: 2m 56s	remaining: 12m 54s
186:	learn: 0.8443074	test: 0.9729268	best: 0.9729268 (186)	total: 2m 57s	remaining: 12m 53s
187:	learn: 0.8442409	test: 0.9730738	best: 0.9729268 (186)	total: 2m 58s	remaining: 12m 52s
188:	learn: 0.8434611	test: 0.9714233	best: 0.9714233 (188)	total: 2m 59s	remaining: 12m 52s
189:	learn: 0.8432259	test: 0.9703676	best: 0.9703676 (189)	total: 3m	remaining: 12m 50s
190:	learn: 0.8426067	test: 0.9702302	best: 0.9702302 (190)	total: 3m 1s	remaining: 12m 49s
191:	learn: 0.8423975	test: 0.9697875	best: 0.9697875 (191)	total: 3m 2s	re

270:	learn: 0.8218308	test: 0.9357632	best: 0.9356626 (269)	total: 4m 20s	remaining: 11m 41s
271:	learn: 0.8217435	test: 0.9353138	best: 0.9353138 (271)	total: 4m 21s	remaining: 11m 40s
272:	learn: 0.8213070	test: 0.9345442	best: 0.9345442 (272)	total: 4m 22s	remaining: 11m 39s
273:	learn: 0.8212262	test: 0.9340826	best: 0.9340826 (273)	total: 4m 23s	remaining: 11m 38s
274:	learn: 0.8211998	test: 0.9341731	best: 0.9340826 (273)	total: 4m 24s	remaining: 11m 37s
275:	learn: 0.8211585	test: 0.9339271	best: 0.9339271 (275)	total: 4m 25s	remaining: 11m 35s
276:	learn: 0.8210630	test: 0.9337779	best: 0.9337779 (276)	total: 4m 26s	remaining: 11m 35s
277:	learn: 0.8206989	test: 0.9343646	best: 0.9337779 (276)	total: 4m 27s	remaining: 11m 34s
278:	learn: 0.8203878	test: 0.9344710	best: 0.9337779 (276)	total: 4m 28s	remaining: 11m 33s
279:	learn: 0.8201585	test: 0.9341614	best: 0.9337779 (276)	total: 4m 29s	remaining: 11m 32s
280:	learn: 0.8200420	test: 0.9337118	best: 0.9337118 (280)	total: 4m 

359:	learn: 0.8098920	test: 0.9166247	best: 0.9166247 (359)	total: 5m 46s	remaining: 10m 15s
360:	learn: 0.8098259	test: 0.9165599	best: 0.9165599 (360)	total: 5m 46s	remaining: 10m 14s
361:	learn: 0.8097891	test: 0.9163685	best: 0.9163685 (361)	total: 5m 47s	remaining: 10m 12s
362:	learn: 0.8097489	test: 0.9162614	best: 0.9162614 (362)	total: 5m 48s	remaining: 10m 11s
363:	learn: 0.8097262	test: 0.9162090	best: 0.9162090 (363)	total: 5m 49s	remaining: 10m 10s
364:	learn: 0.8096223	test: 0.9158420	best: 0.9158420 (364)	total: 5m 50s	remaining: 10m 9s
365:	learn: 0.8095991	test: 0.9156243	best: 0.9156243 (365)	total: 5m 50s	remaining: 10m 7s
366:	learn: 0.8094831	test: 0.9156445	best: 0.9156243 (365)	total: 5m 52s	remaining: 10m 7s
367:	learn: 0.8094554	test: 0.9156047	best: 0.9156047 (367)	total: 5m 52s	remaining: 10m 6s
368:	learn: 0.8091040	test: 0.9157946	best: 0.9156047 (367)	total: 5m 54s	remaining: 10m 5s
369:	learn: 0.8088843	test: 0.9158678	best: 0.9156047 (367)	total: 5m 55s	r

449:	learn: 0.8046322	test: 0.9113007	best: 0.9113007 (449)	total: 7m 4s	remaining: 8m 39s
450:	learn: 0.8043425	test: 0.9112280	best: 0.9112280 (450)	total: 7m 5s	remaining: 8m 38s
451:	learn: 0.8043294	test: 0.9111180	best: 0.9111180 (451)	total: 7m 6s	remaining: 8m 37s
452:	learn: 0.8043030	test: 0.9110743	best: 0.9110743 (452)	total: 7m 7s	remaining: 8m 35s
453:	learn: 0.8038500	test: 0.9113404	best: 0.9110743 (452)	total: 7m 8s	remaining: 8m 35s
454:	learn: 0.8038171	test: 0.9112692	best: 0.9110743 (452)	total: 7m 9s	remaining: 8m 34s
455:	learn: 0.8037551	test: 0.9112229	best: 0.9110743 (452)	total: 7m 10s	remaining: 8m 33s
456:	learn: 0.8036953	test: 0.9111709	best: 0.9110743 (452)	total: 7m 10s	remaining: 8m 32s
457:	learn: 0.8036827	test: 0.9110652	best: 0.9110652 (457)	total: 7m 11s	remaining: 8m 31s
458:	learn: 0.8036715	test: 0.9109431	best: 0.9109431 (458)	total: 7m 12s	remaining: 8m 29s
459:	learn: 0.8035568	test: 0.9104220	best: 0.9104220 (459)	total: 7m 13s	remaining: 8

539:	learn: 0.7984998	test: 0.9084572	best: 0.9077958 (507)	total: 8m 26s	remaining: 7m 11s
540:	learn: 0.7984544	test: 0.9084371	best: 0.9077958 (507)	total: 8m 27s	remaining: 7m 10s
541:	learn: 0.7984380	test: 0.9084026	best: 0.9077958 (507)	total: 8m 28s	remaining: 7m 9s
542:	learn: 0.7984055	test: 0.9083571	best: 0.9077958 (507)	total: 8m 29s	remaining: 7m 8s
543:	learn: 0.7983626	test: 0.9082775	best: 0.9077958 (507)	total: 8m 30s	remaining: 7m 7s
544:	learn: 0.7983230	test: 0.9082496	best: 0.9077958 (507)	total: 8m 31s	remaining: 7m 6s
545:	learn: 0.7983134	test: 0.9082329	best: 0.9077958 (507)	total: 8m 32s	remaining: 7m 5s
546:	learn: 0.7981043	test: 0.9081568	best: 0.9077958 (507)	total: 8m 33s	remaining: 7m 4s
547:	learn: 0.7980894	test: 0.9081843	best: 0.9077958 (507)	total: 8m 33s	remaining: 7m 3s
548:	learn: 0.7980835	test: 0.9081959	best: 0.9077958 (507)	total: 8m 34s	remaining: 7m 2s
549:	learn: 0.7978920	test: 0.9081129	best: 0.9077958 (507)	total: 8m 36s	remaining: 7m 

629:	learn: 0.7939984	test: 0.9051307	best: 0.9050841 (628)	total: 9m 49s	remaining: 5m 46s
630:	learn: 0.7939546	test: 0.9050943	best: 0.9050841 (628)	total: 9m 50s	remaining: 5m 45s
631:	learn: 0.7938963	test: 0.9051121	best: 0.9050841 (628)	total: 9m 51s	remaining: 5m 44s
632:	learn: 0.7938578	test: 0.9050864	best: 0.9050841 (628)	total: 9m 52s	remaining: 5m 43s
633:	learn: 0.7938303	test: 0.9050270	best: 0.9050270 (633)	total: 9m 52s	remaining: 5m 42s
634:	learn: 0.7934337	test: 0.9059095	best: 0.9050270 (633)	total: 9m 53s	remaining: 5m 41s
635:	learn: 0.7930580	test: 0.9065271	best: 0.9050270 (633)	total: 9m 55s	remaining: 5m 40s
636:	learn: 0.7930501	test: 0.9064607	best: 0.9050270 (633)	total: 9m 55s	remaining: 5m 39s
637:	learn: 0.7930444	test: 0.9064734	best: 0.9050270 (633)	total: 9m 56s	remaining: 5m 38s
638:	learn: 0.7929130	test: 0.9067473	best: 0.9050270 (633)	total: 9m 57s	remaining: 5m 37s
639:	learn: 0.7928881	test: 0.9066487	best: 0.9050270 (633)	total: 9m 58s	remain

718:	learn: 0.7892767	test: 0.9059810	best: 0.9050270 (633)	total: 11m 11s	remaining: 4m 22s
719:	learn: 0.7892361	test: 0.9059456	best: 0.9050270 (633)	total: 11m 12s	remaining: 4m 21s
720:	learn: 0.7892193	test: 0.9059779	best: 0.9050270 (633)	total: 11m 13s	remaining: 4m 20s
721:	learn: 0.7892125	test: 0.9059900	best: 0.9050270 (633)	total: 11m 14s	remaining: 4m 19s
722:	learn: 0.7890649	test: 0.9059258	best: 0.9050270 (633)	total: 11m 15s	remaining: 4m 18s
723:	learn: 0.7890389	test: 0.9058881	best: 0.9050270 (633)	total: 11m 16s	remaining: 4m 17s
724:	learn: 0.7890126	test: 0.9057802	best: 0.9050270 (633)	total: 11m 17s	remaining: 4m 16s
725:	learn: 0.7890076	test: 0.9057969	best: 0.9050270 (633)	total: 11m 18s	remaining: 4m 15s
726:	learn: 0.7889993	test: 0.9058214	best: 0.9050270 (633)	total: 11m 18s	remaining: 4m 14s
727:	learn: 0.7889900	test: 0.9058368	best: 0.9050270 (633)	total: 11m 19s	remaining: 4m 13s
728:	learn: 0.7889549	test: 0.9057546	best: 0.9050270 (633)	total: 11m

807:	learn: 0.7861239	test: 0.9041605	best: 0.9035224 (750)	total: 12m 33s	remaining: 2m 58s
808:	learn: 0.7860105	test: 0.9040767	best: 0.9035224 (750)	total: 12m 33s	remaining: 2m 58s
809:	learn: 0.7859736	test: 0.9040600	best: 0.9035224 (750)	total: 12m 34s	remaining: 2m 57s
810:	learn: 0.7859260	test: 0.9040108	best: 0.9035224 (750)	total: 12m 35s	remaining: 2m 56s
811:	learn: 0.7859203	test: 0.9039540	best: 0.9035224 (750)	total: 12m 36s	remaining: 2m 55s
812:	learn: 0.7859145	test: 0.9040004	best: 0.9035224 (750)	total: 12m 37s	remaining: 2m 54s
813:	learn: 0.7858865	test: 0.9039111	best: 0.9035224 (750)	total: 12m 38s	remaining: 2m 53s
814:	learn: 0.7858645	test: 0.9038910	best: 0.9035224 (750)	total: 12m 39s	remaining: 2m 52s
815:	learn: 0.7858620	test: 0.9038712	best: 0.9035224 (750)	total: 12m 40s	remaining: 2m 51s
816:	learn: 0.7857884	test: 0.9035936	best: 0.9035224 (750)	total: 12m 41s	remaining: 2m 50s
817:	learn: 0.7857857	test: 0.9035670	best: 0.9035224 (750)	total: 12m

896:	learn: 0.7833723	test: 0.9055437	best: 0.9035224 (750)	total: 13m 53s	remaining: 1m 35s
897:	learn: 0.7833709	test: 0.9055303	best: 0.9035224 (750)	total: 13m 53s	remaining: 1m 34s
898:	learn: 0.7833652	test: 0.9054500	best: 0.9035224 (750)	total: 13m 54s	remaining: 1m 33s
899:	learn: 0.7833581	test: 0.9053319	best: 0.9035224 (750)	total: 13m 55s	remaining: 1m 32s
900:	learn: 0.7833531	test: 0.9053672	best: 0.9035224 (750)	total: 13m 56s	remaining: 1m 31s
901:	learn: 0.7833497	test: 0.9053660	best: 0.9035224 (750)	total: 13m 57s	remaining: 1m 30s
902:	learn: 0.7832885	test: 0.9055197	best: 0.9035224 (750)	total: 13m 58s	remaining: 1m 30s
903:	learn: 0.7832827	test: 0.9055297	best: 0.9035224 (750)	total: 13m 59s	remaining: 1m 29s
904:	learn: 0.7832798	test: 0.9055142	best: 0.9035224 (750)	total: 13m 59s	remaining: 1m 28s
905:	learn: 0.7832748	test: 0.9055809	best: 0.9035224 (750)	total: 14m	remaining: 1m 27s
906:	learn: 0.7832705	test: 0.9055319	best: 0.9035224 (750)	total: 14m 1s	

986:	learn: 0.7810596	test: 0.9080377	best: 0.9035224 (750)	total: 15m 14s	remaining: 12s
987:	learn: 0.7810510	test: 0.9079415	best: 0.9035224 (750)	total: 15m 15s	remaining: 11.1s
988:	learn: 0.7809476	test: 0.9079365	best: 0.9035224 (750)	total: 15m 16s	remaining: 10.2s
989:	learn: 0.7809135	test: 0.9081263	best: 0.9035224 (750)	total: 15m 17s	remaining: 9.26s
990:	learn: 0.7808829	test: 0.9081467	best: 0.9035224 (750)	total: 15m 18s	remaining: 8.34s
991:	learn: 0.7808228	test: 0.9082894	best: 0.9035224 (750)	total: 15m 19s	remaining: 7.41s
992:	learn: 0.7808128	test: 0.9083051	best: 0.9035224 (750)	total: 15m 20s	remaining: 6.49s
993:	learn: 0.7808004	test: 0.9082615	best: 0.9035224 (750)	total: 15m 21s	remaining: 5.56s
994:	learn: 0.7807807	test: 0.9082568	best: 0.9035224 (750)	total: 15m 22s	remaining: 4.63s
995:	learn: 0.7807784	test: 0.9082625	best: 0.9035224 (750)	total: 15m 23s	remaining: 3.71s
996:	learn: 0.7807752	test: 0.9082341	best: 0.9035224 (750)	total: 15m 23s	remaini

In [30]:
# Predict the validation month and print the score returned by the project's
# val_score helper with five decimals.
pred = pd.Series(model.predict(val_pool))
print(format(val_score(pred, val_y), ".5f"))

0.90123


## 4A Feature Importance

In [None]:
# Per-feature importance of type "LossFunctionChange", computed on the training
# pool and paired with the training column names, highest score first.
fi = model.get_feature_importance(train_pool, type="LossFunctionChange")
feature_scores = (
    pd.DataFrame({"feature": train_x.columns, "score": fi})
    .sort_values("score", ascending=False)
)

In [None]:
# Show the first 50 rows of the feature-importance table.
feature_scores.head(50)

In [None]:
# feature_scores.to_csv(MODEL_FOLDER+"basic_cb_model_LossFunctionChange.csv")
# NOTE(review): this replaces any feature_scores computed in this session with a
# previously saved CSV. The file name does not contain MODEL_NAME, so the scores
# may belong to a different model — confirm which table the plots should show.
feature_scores = pd.read_csv(MODEL_FOLDER+"basic_cb_model_LossFunctionChange.csv", index_col=0)

In [None]:
# Bar chart of the first 50 rows of feature_scores (feature name vs. score),
# with the x tick labels rotated so the names stay readable.
plt.figure(figsize=(12,6))
plt.bar(feature_scores["feature"].head(50), feature_scores["score"].head(50))
plt.xticks(rotation=90)
plt.show()

In [None]:
# Importance rows for features whose name does not contain "_bow_".
feature_scores[feature_scores["feature"].str.find("_bow_")==-1]

In [None]:
# First 20 importance rows for features whose name contains "_bow_".
feature_scores[feature_scores["feature"].str.find("_bow_")>-1].head(20)

In [None]:
# Pairwise feature interaction strengths computed on the training pool. The
# loop in the next cell reads each entry as
# [first_feature_index, second_feature_index, score].
interaction = model.get_feature_importance(
    train_pool, type="Interaction")
# Preview the first-feature index of every pair. The cell previously ended in an
# unfinished attribute access, which is a syntax error. np.asarray makes the
# column slice work whether a list or an array is returned.
np.asarray(interaction)[:, 0]

In [None]:
# Translate the index-based interaction entries into "feature/feature" labels.
# Iterate over the interaction entries themselves: the number of pairs is
# unrelated to the number of per-feature scores in `fi`, so looping over `fi`
# could either truncate the result or index past the end of `interaction`.
feature_names = test_x.columns
fi_new = []
for first_idx, second_idx, pair_score in interaction:
    # Indices may arrive as floats when the result is a numeric array.
    first = feature_names[int(first_idx)]
    second = feature_names[int(second_idx)]
    if first != second:
        fi_new.append([first + "/" + second, pair_score])
feature_score = pd.DataFrame(fi_new,columns=['Feature-Pair','Score'])

In [None]:
# Order the feature pairs by descending interaction score and plot the top 50.
feature_score = feature_score.sort_values(by='Score', ascending=False, inplace=False, kind='quicksort', na_position='last')
# Changes the default figure size for every later plot in this session as well.
plt.rcParams["figure.figsize"] = (16,7)
ax = feature_score.head(50).plot('Feature-Pair', 'Score', kind='bar', color='c')
ax.set_title("Pairwise Feature Importance", fontsize = 14)
ax.set_xlabel("features Pair")
plt.show()

## 5. Model prediction analysis

In [None]:
# Line the validation predictions ("y_hat") up with the actual targets ("y") by
# giving them the target's index, then place both next to the features.
val_y = val_y.rename("y")
pred = pred.rename("y_hat")
pred.index = val_y.index
results = pd.concat([val_x, pred, val_y], axis=1)

In [None]:
# Flag rows whose months_since_first_sale equals 1.
# NOTE(review): months_since_first_sale is the row's month minus the first sale
# month, so 1 means "one month after the first recorded sale"; the first-sale
# month itself would be 0 — confirm which definition of "new" is intended.
results["new_items"] = results["months_since_first_sale"]==1

In [None]:
# Scatter of predicted vs. actual values for a random 5% sample of the
# validation rows, both clipped to [0, 20] and coloured by the new_items flag.
# NOTE(review): neither the sample nor the jitter is seeded, so the figure
# changes between runs.
subset = results.sample(frac=0.05)
plt.scatter(
    x=subset["y_hat"].clip(0,20), 
    # Uniform jitter in [-0.5, 0.5) spreads the integer-valued actuals vertically.
    y=subset["y"].clip(0,20)+np.random.rand(subset.shape[0])-0.5,
    c=subset["new_items"]
)
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

In [None]:
# Signed error: clipped prediction minus clipped actual (positive = over-prediction).
results["diff"]=results["y_hat"].clip(0,20)-results["y"].clip(0,20)

In [None]:
# Order rows from the largest under-prediction to the largest over-prediction.
results=results.sort_values("diff")
# NOTE(review): displays the whole frame (truncated by pandas display settings);
# consider .head()/.tail() to inspect the extremes.
results

In [None]:
# Error summary, with tail percentiles, for the rows whose actual count is zero.
results[results['y']==0]["diff"].describe(percentiles=[0.01, 0.05, 0.25, 0.75, 0.95, 0.99])

In [None]:
# Histogram (100 bins) of the error on rows whose actual count is zero.
plt.hist(results[results['y']==0]["diff"],bins=100)
plt.show()

## 9. Submission

In [None]:
# Finding values not in training data

# Row counts per item_id in the test ids and in the raw transactions.
# rename_axis + reset_index(name=...) names both output columns explicitly, so
# the result is the same whether value_counts labels its output after the
# source column (older pandas) or as "count" (pandas 2.x). Renaming an "index"
# column only works on the older behaviour.
test_items = (test_ids["item_id"]
              .value_counts()
              .rename_axis("item_id")
              .reset_index(name="test"))
train_items = (transactions["item_id"]
               .value_counts()
               .rename_axis("item_id")
               .reset_index(name="train"))

# Outer join: a missing "train" count marks an item seen only in the test set,
# a missing "test" count an item seen only in the training data.
item_overlap = test_items.merge(train_items, on="item_id", how="outer")

print("Items only in test set:", item_overlap["train"].isna().sum())
print("Items only in train set:", item_overlap["test"].isna().sum())
print("Items in both test and train set:", (item_overlap["train"].notna()&item_overlap["test"].notna()).sum())
# NOTE(review): hard-coded figure, not computed from the data in this cell.
print("Items in test set and items ds: 5100")

In [31]:
# Predict the test month and clip the predictions to [0, 20], keeping the index
# of the test feature matrix.
test_y = pd.Series(
    np.clip(model.predict(test_x), 0, 20),
    index=test_x.index,
    name="item_cnt_month")

In [32]:
# Put the shop/item keys next to the predictions (aligned on the shared index).
test_y=pd.concat([test_x[["shop_id","item_id"]],test_y], axis="columns")

In [33]:
# Attach the predictions to the submission IDs by (shop_id, item_id).
# NOTE(review): how="outer" keeps keys present on only one side, which would
# show up as rows with a missing ID or a missing prediction — consider checking
# that the row count equals len(test_ids) before writing the submission.
test_y=test_ids.merge(test_y, on=["shop_id", "item_id"], how="outer")

In [34]:
# Write the submission (ID and predicted monthly count) to
# SUBMISSION_FOLDER/<MODEL_NAME>.csv and echo the file name.
test_y[["ID", "item_cnt_month"]].to_csv(f"{SUBMISSION_FOLDER}{MODEL_NAME}.csv", index=False)
print(f"{MODEL_NAME}.csv")

cb_20210905_mean_encode.csv
