In [21]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from freetextmethods import word_counts
from downcast import downcast_df

In [22]:
# Treat +/-inf as missing values in pandas NA handling.
# NOTE(review): this option is deprecated in recent pandas releases - confirm
# it is still available in the installed version.
pd.options.mode.use_inf_as_na = True

# Data locations. The defaults are the original machine-specific paths; set the
# PFS_DATA_FOLDER / PFS_PRED_FOLDER environment variables to run elsewhere.
# os.path.join(path, '') guarantees the trailing separator that the plain
# string concatenation below relies on.
DATA_FOLDER = os.path.join(
    os.environ.get('PFS_DATA_FOLDER',
                   '/home/roger/code/kaggle/predict-future-sales/data/'),
    '')
PRED_FOLDER = os.path.join(
    os.environ.get('PFS_PRED_FOLDER',
                   '/home/roger/code/kaggle/predict-future-sales/predictions/'),
    '')
SUBMISSION_FOLDER = "comp_submissions/"
# Key columns identifying one (month, shop, item) observation.
index_vars = ["date_block_num", "shop_id", "item_id"]

# NOTE(review): item_categories / item_categories_en (and shops / shops_en)
# are read from the same "*_en.csv" file, so the "_en" columns added below are
# copies of the base name columns - confirm this is intended.
transactions       = pd.read_csv(DATA_FOLDER + 'sales_train.csv')
items              = pd.read_csv(DATA_FOLDER + 'items.csv')
item_categories    = pd.read_csv(DATA_FOLDER + 'item_categories_en.csv')
item_categories_en = pd.read_csv(DATA_FOLDER + 'item_categories_en.csv')
shops              = pd.read_csv(DATA_FOLDER + 'shops_en.csv')
shops_en           = pd.read_csv(DATA_FOLDER + 'shops_en.csv')
test_ids           = pd.read_csv(DATA_FOLDER + 'test.csv')
sample             = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')

# Adding English translations to item_categories and shops
# (a mid-cell `item_categories.head()` was removed here: it displayed nothing
# because it was not the last expression of the cell).
item_categories = item_categories.merge(
    item_categories_en[["item_category_id", "item_category_name"]]
        .rename(columns={"item_category_name":"item_category_name_en"}),
    on="item_category_id")

shops = shops.merge(
    shops_en[["shop_id", "shop_name"]]
        .rename(columns={"shop_name":"shop_name_en"}),
    on="shop_id")

In [23]:
# Columns removed from the modelling frame before training: helper columns
# used only to build other features, the unclipped target, and free-text names.
ignore_vars = (
    ["first_sold_month"]
    # month-1 ... month-12: join keys used to look up lagged values.
    + [f"month-{lag}" for lag in range(1, 13)]
    + [
        "item_cnt_month_orig",
        "item_cnt_month_orig_na",
        "item_name",
        "item_category_name",
        "item_category_name_en",
        "shop_name",
        "shop_name_en",
    ]
)

# Columns handed to the model as categorical features.
cat_vars = ["shop_id", "item_id", "month", "season", "quarter", "item_category_id"]
# cat_vars += [c for c in items_bow.columns if c.startswith("item_bow_")]
# cat_vars += [c for c in items_cat_bow.columns if c.startswith("cat_bow_")]
# cat_vars += [c for c in shops_bow.columns if c.startswith("shop_bow_")]

In [24]:
def see(shop_id,item_id):
    """Return the rows of the global ``sales_by_month`` for one shop/item pair.

    The frame is looked up when the function is called, so it reflects
    whatever ``sales_by_month`` holds at that point in the notebook.
    """
    is_item = sales_by_month["item_id"] == item_id
    is_shop = sales_by_month["shop_id"] == shop_id
    return sales_by_month[is_item & is_shop]

In [25]:
# Print the column dtypes of every raw table, each followed by a blank line.
for frame in (transactions, items, item_categories, shops):
    print(frame.dtypes, "\n")

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object 

item_name           object
item_id              int64
item_category_id     int64
dtype: object 

item_category_name       object
item_category_id          int64
item_category_name_en    object
dtype: object 

shop_name       object
shop_id          int64
shop_name_en    object
dtype: object 



## Contents

1. Features before split
2. Validation split
3. Features after split
4. Catboost
    1. Feature importance
5. Model prediction analysis


9. Submission

## 1. Features before split

In [26]:
# item_name_length: number of characters in each item name once leading and
# trailing whitespace is removed.
items["item_name_length"] = [len(name.strip()) for name in items["item_name"]]
# items["item_name_length"].value_counts().sort_index().plot()
# plt.title("Item name length frequencies")
# plt.plot()

In [27]:
# Bag-of-words features built with the project helper word_counts (imported
# from freetextmethods, not shown here), one call per text column, each with
# its own column prefix.
# NOTE(review): presumably each call returns (frame with prefixed word
# columns, reference table for those columns) - confirm against word_counts.
items_bow, item_bow_ref = word_counts(items, "item_name", "item_bow_", words_to_keep=100)
items_cat_bow, items_cat_bow_ref = word_counts(item_categories, "item_category_name", "cat_bow_")
shops_bow, shops_bow_ref = word_counts(shops, "shop_name", "shop_bow_")

In [28]:
def int_to_bool(df, search_str="_bow_"):
    """Cast every column whose name contains ``search_str`` to ``bool``.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.
    """
    matching = [name for name in df.columns if search_str in name]
    for name in matching:
        df[name] = df[name].astype(bool)
    return df

# Convert the "_bow_" columns of each bag-of-words frame to booleans.
items_bow = int_to_bool(items_bow)
items_cat_bow = int_to_bool(items_cat_bow)
shops_bow = int_to_bool(shops_bow)

# testing effect without BOW
# The three assignments below rebind the names to the plain lookup tables, so
# the converted bag-of-words frames above are discarded for the rest of the
# notebook. Remove these lines to use the bag-of-words features again.
items_bow = items
items_cat_bow = item_categories
shops_bow = shops

In [29]:
# Aggregate the daily transactions per (month, shop, item):
#   item_cnt_month  - total units sold in the month
#   sales_cnt_month - number of transaction rows in the month
sales_by_month = (
    transactions
    .groupby(index_vars, as_index=False)
    .agg(item_cnt_month=("item_cnt_day", "sum"),
         sales_cnt_month=("item_cnt_day", "count"))
)

# Keep the raw total, then clip the modelling target to [0, 20].
sales_by_month["item_cnt_month_orig"] = sales_by_month["item_cnt_month"]
sales_by_month["item_cnt_month"] = sales_by_month["item_cnt_month"].clip(lower=0, upper=20)
# Only rows with a positive clipped total are kept, so months without sales
# (or with net returns) are absent from the frame.
sales_by_month = sales_by_month.loc[sales_by_month["item_cnt_month"].gt(0)]

In [30]:
# Creating test set
TEST_MONTH = 34

# Test rows get the test month and an unknown (NaN) target. Their index is
# shifted past the training index so labels stay unique once both frames are
# stacked.
first_free_index = sales_by_month.index.max() + 1
test_x = (test_ids
          .drop(columns="ID")
          .assign(date_block_num=TEST_MONTH, item_cnt_month=np.nan))
test_x.index = test_x.index + first_free_index

In [31]:
# Stack the test rows underneath the training rows.
# Re-running this cell on its own appends the test rows a second time.
sales_by_month = pd.concat(objs=[sales_by_month, test_x], axis=0)

In [32]:
# Months since first sold
# Earliest month with a positive count for each (shop, item) pair.
pair_keys = ["shop_id", "item_id"]
rows_with_sales = sales_by_month.loc[sales_by_month["item_cnt_month"].gt(0)]
first_sold_month = (
    rows_with_sales
    .groupby(pair_keys, as_index=False)["date_block_num"]
    .min()
    .rename(columns={"date_block_num": "first_sold_month"})
    .astype({"first_sold_month": object})
)
# Pairs that never sold (e.g. test-only pairs) get a missing first_sold_month.
sales_by_month = sales_by_month.merge(first_sold_month, how="left", on=pair_keys)


# 20210904 - See if model performs better if it thinks month 0 is first month of sale
months_elapsed = sales_by_month["date_block_num"] - sales_by_month["first_sold_month"]
sales_by_month["months_since_first_sale"] = months_elapsed
# Cannot tell how many months item was sold before dataset started. Impute with nan.
# sales_by_month["months_since_first_sale"] = np.where(
#     sales_by_month["first_sold_month"] > 0,
#     sales_by_month["date_block_num"]-sales_by_month["first_sold_month"],
#     np.nan)

# (sales_by_month[sales_by_month["first_sold_month"]!=0]["months_since_first_sale"]
# .value_counts()
# .sort_index()
# .plot(title="Sales by month after first sale"))
# plt.show()

In [33]:
# Months since last sale excluding current month
# For every month m, find per (shop, item) the latest month strictly before m
# that had a positive count, then attach it to the rows of month m.
# The positive-count filter does not depend on m, so it is applied once.
sold_rows = sales_by_month.loc[sales_by_month["item_cnt_month"] > 0, index_vars]

# Month 0 has no earlier month, so the loop starts at 1. The per-month frames
# are collected in a list and concatenated once instead of growing a frame
# inside the loop.
monthly_frames = []
for m in range(1, TEST_MONTH + 1):
    prior_sales = (sold_rows[sold_rows["date_block_num"] < m]
                   .groupby(["shop_id", "item_id"])
                   .agg({"date_block_num": "max"})
                   .rename(columns={"date_block_num": "last_sold_month"})
                   .reset_index())
    prior_sales["date_block_num"] = m
    monthly_frames.append(prior_sales)
last_sold_month = pd.concat(monthly_frames, ignore_index=True)

sales_by_month = sales_by_month.merge(last_sold_month, on=index_vars, how="left")
sales_by_month["months_since_last_sale"] = sales_by_month["date_block_num"]-sales_by_month["last_sold_month"]
# Release the intermediates; only the merged columns are needed from here on.
del sold_rows, monthly_frames, prior_sales, last_sold_month

In [34]:
# Adding lagged sales
# item_cnt_month-i counts lagged by i months.
LAGS = range(1, 13)

# month-i holds the month number i months before each row; it is the key used
# to look up that earlier month's count.
sales_by_month = sales_by_month.assign(
    **{f"month-{lag}": sales_by_month["date_block_num"] - lag for lag in LAGS})

for lag in LAGS:
    month_col = f"month-{lag}"
    lag_col = f"item_cnt_month-{lag}"

    # Re-key a copy of the monthly counts so its month column lines up with
    # month-i, then left-join it back onto the full table.
    lagged_counts = (sales_by_month[index_vars + ["item_cnt_month"]]
                     .rename(columns={"date_block_num": month_col,
                                      "item_cnt_month": lag_col}))
    sales_by_month = sales_by_month.merge(
        lagged_counts, on=[month_col, "shop_id", "item_id"], how="left")

    # A missing lag becomes 0 when the pair had already been sold by month-i;
    # before the first sale (or with no first sale on record) it stays NaN.
    already_on_sale = sales_by_month[month_col] >= sales_by_month["first_sold_month"]
    sales_by_month[lag_col] = np.where(
        already_on_sale,
        np.nan_to_num(sales_by_month[lag_col]),
        np.nan)

In [35]:
# Adding 1 month lagged number of sales

# Re-key the per-month transaction counts to month-1 and left-join them back,
# giving each row the previous month's count as last_mth_sale_cnt.
previous_month_sales = (
    sales_by_month[index_vars + ["sales_cnt_month"]]
    .rename(columns={"date_block_num": "month-1",
                     "sales_cnt_month": "last_mth_sale_cnt"}))
sales_by_month = sales_by_month.merge(
    previous_month_sales, on=["month-1", "shop_id", "item_id"], how="left")
# The current-month column sales_cnt_month is still present after this step.

In [36]:
def encode_mean(df, agg_levels, target_var:str ,time_var:str ,var_name:str, n_months=None):
    """Add an expanding mean of ``target_var`` per group, using past months only.

    For each month ``m`` the mean of ``target_var`` over all rows of ``df``
    with ``time_var <= m`` is computed per ``agg_levels`` group and attached to
    the rows of month ``m + 1``. A row therefore never sees its own month.

    Parameters
    ----------
    df : DataFrame holding ``agg_levels``, ``target_var`` and ``time_var``.
    agg_levels : column name, or list of column names, to group by.
    target_var : column whose mean is encoded.
    time_var : integer month column.
    var_name : name of the new column.
    n_months : number of months to encode (months ``1 .. n_months`` receive a
        value). Defaults to the largest ``time_var`` found in ``df``, which
        covers every row that can have a preceding month.

    Returns
    -------
    A new DataFrame: ``df`` left-joined with the encoded means. Rows with no
    earlier data for their group get NaN in ``var_name``.
    """
    if isinstance(agg_levels, str):
        agg_levels = [agg_levels]
    else:
        agg_levels = list(agg_levels)

    if n_months is None:
        n_months = int(df[time_var].max()) if len(df) else 0

    # Filter on the frame that was passed in (the original filtered with the
    # global sales_by_month, which only worked when df was that same frame).
    monthly_means = []
    for m in range(n_months):
        month_means = (df[df[time_var] <= m]
                       .groupby(agg_levels)
                       .agg({target_var: "mean"})
                       .reset_index())
        month_means[time_var] = m + 1
        monthly_means.append(month_means)

    if not monthly_means:
        # No month has a predecessor: the encoded column is entirely missing.
        return df.assign(**{var_name: np.nan})

    # One concat at the end instead of growing a frame inside the loop.
    means = (pd.concat(monthly_means, ignore_index=True)
             .rename(columns={target_var: var_name}))

    return df.merge(means, on=agg_levels + [time_var], how="left")

In [37]:
sales_by_month = downcast_df(sales_by_month, categorical_cols=cat_vars, verbose="dataframe")


# Attach each item's category so category-level encodings can be built later.
item_to_category = items[["item_id", "item_category_id"]]
sales_by_month = sales_by_month.merge(item_to_category, how="left", on="item_id")

# Past-only mean of the target per item and per shop.
for levels, feature_name in (("item_id", "item_rolling_mean"),
                             ("shop_id", "shop_rolling_mean")):
    sales_by_month = encode_mean(sales_by_month, agg_levels=levels,
                                 target_var="item_cnt_month",
                                 time_var="date_block_num",
                                 var_name=feature_name)

Memory usage of df before: 593 MB
Memory usage of df after: 115 MB


In [38]:
# Past-only mean of the target per category, per item/shop pair and per
# category/shop pair.
for levels, feature_name in (
        ("item_category_id", "cat_rolling_mean"),
        (["item_id", "shop_id"], "item_shop_rolling_mean"),
        (["item_category_id", "shop_id"], "cat_shop_rolling_mean")):
    sales_by_month = encode_mean(sales_by_month, agg_levels=levels,
                                 target_var="item_cnt_month",
                                 time_var="date_block_num",
                                 var_name=feature_name)

# The category id was only needed for the encodings above.
sales_by_month = sales_by_month.drop(columns="item_category_id")

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,sales_cnt_month,item_cnt_month_orig,first_sold_month,months_since_first_sale,last_sold_month,months_since_last_sale,...,item_cnt_month-9,item_cnt_month-10,item_cnt_month-11,item_cnt_month-12,last_mth_sale_cnt,item_rolling_mean,shop_rolling_mean,cat_rolling_mean,item_shop_rolling_mean,cat_shop_rolling_mean
0,0,0,32,6.0,4.0,6.0,0.0,0.0,,,...,,,,,,,,,,
1,0,0,33,3.0,3.0,3.0,0.0,0.0,,,...,,,,,,,,,,
2,0,0,35,1.0,1.0,1.0,0.0,0.0,,,...,,,,,,,,,,
3,0,0,43,1.0,1.0,1.0,0.0,0.0,,,...,,,,,,,,,,
4,0,0,51,2.0,2.0,2.0,0.0,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1819821,34,45,18454,,,,23.0,11.0,33.0,1.0,...,1.0,2.0,4.0,,1.0,2.238281,1.71875,1.403320,1.799805,1.272461
1819822,34,45,16188,,,,,,,,...,,,,,,1.000000,1.71875,1.617188,,1.260742
1819823,34,45,15757,,,,0.0,34.0,25.0,9.0,...,1.0,1.0,0.0,0.0,,1.163086,1.71875,1.403320,1.166992,1.272461
1819824,34,45,19648,,,,,,,,...,,,,,,1.302734,1.71875,1.813477,,1.474609


In [None]:
start_month = "01.01.2013"
# Two-digit month label -> season name.
# NOTE(review): the labels follow a southern-hemisphere calendar (December to
# February is "summer") - confirm that is intended for this dataset.
seasons = {
    f"{month_number:02}": season_name
    for season_name, month_numbers in (
        ("summer", (1, 2)),
        ("autumn", (3, 4, 5)),
        ("winter", (6, 7, 8)),
        ("spring", (9, 10, 11)),
        ("summer", (12,)),
    )
    for month_number in month_numbers
}

# date_block_num counts months from 0, so block % 12 + 1 is the calendar month
# ("01" .. "12") when block 0 is a January.
sales_by_month["month"] = sales_by_month["date_block_num"].map(
    lambda block: f"{block % 12 + 1:02}")
sales_by_month["season"] = sales_by_month["month"].map(seasons)
# Quarter as a string "1" .. "4".
sales_by_month["quarter"] = sales_by_month["month"].map(
    lambda label: str((int(label) + 2) // 3))

# fig, ax = plt.subplots(1, 3)
# fig.set_size_inches(12,6)
# ax[0].hist(sales_by_month["month"].sort_values())
# ax[0].set_title(label="Frequencies by month")
# ax[1].hist(sales_by_month["season"].sort_values())
# ax[1].set_title(label="Frequencies by season")
# ax[2].hist(sales_by_month["quarter"].sort_values())
# ax[2].set_title(label="Frequencies by quarter")
# plt.plot()

## 2. Validation Split

In [None]:
# import sys
# import importlib
# importlib.reload(sys.modules['validationsplit'])
from validationsplit import split_train_val, split_x_y, val_score

In [None]:
# Downcast every frame that feeds the modelling dataset, in the same order as
# before, treating the cat_vars columns as categorical.
sales_by_month, items_bow, items_cat_bow, shops_bow = [
    downcast_df(frame, categorical_cols=cat_vars)
    for frame in (sales_by_month, items_bow, items_cat_bow, shops_bow)
]

In [None]:
# Columns excluded from the model in addition to ignore_vars.
# sales_cnt_month counts the transactions of the row's own month: it is only
# known once the target is known (and is missing for the test month), so
# keeping it as a feature leaks the target. The lagged last_mth_sale_cnt is
# the usable version of the same signal.
leaky_vars = ["sales_cnt_month"]

# Assemble the modelling dataset: monthly sales joined with the item,
# category and shop tables, minus helper / leaky columns.
# errors="ignore" lets the drop tolerate listed columns that are not present
# in this run (NOTE(review): e.g. "item_cnt_month_orig_na" is not created
# anywhere in the visible notebook).
ds = (sales_by_month
    .merge(items_bow, on="item_id", how="left")
    .merge(items_cat_bow, on="item_category_id", how="left")
    .merge(shops_bow, on="shop_id", how="left")
    .drop(columns=ignore_vars + leaky_vars, errors="ignore"))

In [None]:
# Partition the dataset columns into bag-of-words columns (name contains
# "_bow_") and everything else, preserving column order in both lists.
bow_cols, not_bow_cols = [], []
for column in ds.columns:
    (bow_cols if "_bow_" in column else not_bow_cols).append(column)

In [None]:
# Split at the last training month (TEST_MONTH - 1 == 33) for validation and
# at TEST_MONTH (34) for the test rows, then separate features from target.
# NOTE(review): the exact split semantics live in validationsplit - confirm.
target_col = "item_cnt_month"
train, val = split_train_val(ds, TEST_MONTH - 1)
_, test = split_train_val(ds, TEST_MONTH)
(train_x, train_y), (val_x, val_y) = (
    split_x_y(part, target_col) for part in (train, val))
test_x, _ = split_x_y(test, target_col)

In [None]:
# Drop the throwaway `_` and the train/val frames now that they have been
# split into feature and target pieces.
del _, train, val

## 3. Features after split

## 3A. EDA

## 4. Fitting catboost

In [None]:
import catboost as cb
# Folder, relative to the working directory, where fitted models are saved.
MODEL_FOLDER = "catboost_models/"

In [None]:
# CatBoost regressor settings, kept in one dict so they are easy to tune.
model_params = dict(
    learning_rate=0.03,
    loss_function="RMSE",
    iterations=2000,
    task_type="GPU",
    boosting_type="Plain",
    max_ctr_complexity=2,
)
model = cb.CatBoostRegressor(**model_params)

In [None]:
# Wrap the training and validation sets as CatBoost pools, declaring the same
# categorical columns for both.
train_pool, val_pool = (
    cb.Pool(features, target, cat_features=cat_vars)
    for features, target in ((train_x, train_y), (val_x, val_y))
)

In [None]:
MODEL_NAME = "cb_20210904_0"
# Create the output folder before the long training run, so saving cannot
# fail afterwards on a missing directory.
os.makedirs(MODEL_FOLDER, exist_ok=True)
model.fit(train_pool, eval_set=val_pool)
model.save_model(MODEL_FOLDER+MODEL_NAME, format="cbm")
# model.load_model(MODEL_FOLDER+MODEL_NAME, format="cbm")

In [None]:
# Give the predictions the same index as val_y. Without it pred carries a
# fresh 0..n-1 index while val_y keeps the dataset index, so any index-aligned
# pandas arithmetic on the two would pair up the wrong rows.
pred = pd.Series(model.predict(val_pool), index=val_y.index)
print(val_score(pred,val_y))

## 4A Feature Importance

In [None]:
# Per-feature importance measured as the change in the loss function, paired
# with the training column names and sorted from most to least important.
fi = model.get_feature_importance(train_pool, type="LossFunctionChange")
feature_scores = (
    pd.DataFrame({"feature": train_x.columns, "score": fi})
    .sort_values("score", ascending=False)
)

In [None]:
# Show the 50 highest-scoring features (feature_scores is sorted descending).
feature_scores.head(50)

In [None]:
# Saved importances of the basic model. When the file exists it is loaded and
# replaces the freshly computed scores (as before); when it does not exist
# yet, the current scores are written so the notebook still runs end-to-end.
FEATURE_SCORES_PATH = MODEL_FOLDER+"basic_cb_model_LossFunctionChange.csv"
if os.path.exists(FEATURE_SCORES_PATH):
    feature_scores = pd.read_csv(FEATURE_SCORES_PATH, index_col=0)
else:
    os.makedirs(MODEL_FOLDER, exist_ok=True)
    feature_scores.to_csv(FEATURE_SCORES_PATH)

In [None]:
# Bar chart of the first 50 rows of feature_scores.
top_features = feature_scores.head(50)
plt.figure(figsize=(12,6))
plt.bar(top_features["feature"], top_features["score"])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Scores of the features whose name does not contain "_bow_".
feature_scores.loc[feature_scores["feature"].str.find("_bow_").eq(-1)]

In [None]:
# First 20 rows among the features whose name contains "_bow_".
feature_scores.loc[feature_scores["feature"].str.find("_bow_").gt(-1)].head(20)

In [None]:
# Pairwise feature-interaction strengths from the fitted model. Each entry is
# used further down as (first feature index, second feature index, score).
interaction = model.get_feature_importance(
    train_pool, type="Interaction")
# Preview the first entries. The original cell ended in the incomplete
# expression `interaction[:,0].`, which is a SyntaxError.
interaction[:5]

In [None]:
# Turn the (first index, second index, score) interaction entries into a
# table of "featureA/featureB" pairs.
# The loop runs over the interaction entries themselves; the original ran over
# the per-feature importances `fi`, whose length is unrelated to the number of
# pairs. Names come from train_x, the frame train_pool was built from.
feature_names = train_x.columns
fi_new = []
for first_idx, second_idx, score in interaction:
    # int() guards against indices that come back as floats.
    first = feature_names[int(first_idx)]
    second = feature_names[int(second_idx)]
    if first != second:
        fi_new.append([first + "/" + second, score])
feature_score = pd.DataFrame(fi_new,columns=['Feature-Pair','Score'])

In [None]:
# Sort the pairs from strongest to weakest interaction and chart the top 50.
feature_score = feature_score.sort_values(by='Score', ascending=False)
# Sets the default figure size for this and all later figures.
plt.rcParams["figure.figsize"] = (16,7)
top_pairs = feature_score.head(50)
ax = top_pairs.plot.bar(x='Feature-Pair', y='Score', color='c')
ax.set_title("Pairwise Feature Importance", fontsize = 14)
ax.set_xlabel("features Pair")
plt.show()

## 5. Model prediction analysis

In [None]:
# Side-by-side frame of validation features, prediction (y_hat) and actual
# value (y); the predictions are given val_y's index so the rows line up.
val_y = val_y.rename("y")
pred = pd.Series(pred.to_numpy(), index=val_y.index, name="y_hat")
results = pd.concat([val_x, pred, val_y], axis=1)

In [None]:
# Flag rows whose shop/item pair was first sold exactly one month before the
# row's month (months_since_first_sale is date_block_num - first_sold_month).
# NOTE(review): pairs first sold in the row's own month have value 0, not 1 -
# confirm that == 1 is the intended definition of "new".
results["new_items"] = results["months_since_first_sale"]==1

In [None]:
# Predicted vs actual values on a 5% sample of the validation rows.
# Both the sample and the vertical jitter are seeded so the figure is the
# same on every run.
PLOT_SEED = 0
subset = results.sample(frac=0.05, random_state=PLOT_SEED)
# Uniform jitter in [-0.5, 0.5) spreads the integer-valued actuals apart.
jitter = np.random.default_rng(PLOT_SEED).random(subset.shape[0]) - 0.5
plt.scatter(
    x=subset["y_hat"].clip(0,20),
    y=subset["y"].clip(0,20) + jitter,
    c=subset["new_items"]
)
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

In [None]:
# Prediction error after clipping both sides to [0, 20]: predicted - actual.
results["diff"] = results["y_hat"].clip(0,20).sub(results["y"].clip(0,20))

In [None]:
# Order the rows from largest under-prediction to largest over-prediction.
results = results.sort_values(by="diff", ascending=True)
results

In [None]:
# Error distribution for the rows whose actual value is 0.
results.loc[results["y"].eq(0), "diff"].describe(percentiles=[0.01, 0.05, 0.25, 0.75, 0.95, 0.99])

In [None]:
# Histogram of the error for the rows whose actual value is 0.
zero_sales_diff = results.loc[results["y"].eq(0), "diff"]
plt.hist(zero_sales_diff, bins=100)
plt.show()

## 9. Submission

In [None]:
# Finding values not in training data

# Occurrences of each item id in the test ids and in the raw transactions.
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result does not depend on how the installed pandas version labels the
# output of value_counts().reset_index().
test_items = (test_ids["item_id"]
              .value_counts()
              .rename_axis("item_id")
              .reset_index(name="test"))
train_items = (transactions["item_id"]
               .value_counts()
               .rename_axis("item_id")
               .reset_index(name="train"))

# Outer join: an item missing on one side has NaN in that side's count.
item_overlap = test_items.merge(train_items, on="item_id", how="outer")
in_train = item_overlap["train"].notna()
in_test = item_overlap["test"].notna()

print("Items only in test set:", (~in_train).sum())
print("Items only in train set:", (~in_test).sum())
print("Items in both test and train set:", (in_train & in_test).sum())
# Computed from the data instead of the previously hard-coded figure.
print("Items in test set and items ds:",
      test_items["item_id"].isin(items["item_id"]).sum())

In [None]:
# Predict the test month and clip to [0, 20], the same range the training
# target was clipped to; keep the test feature index on the result.
test_predictions = model.predict(test_x)
test_y = pd.Series(test_predictions.clip(0,20),
                   index=test_x.index,
                   name="item_cnt_month")

In [None]:
# Put the shop/item keys next to each prediction (the rows share one index).
test_y = test_x[["shop_id", "item_id"]].join(test_y)

In [None]:
# Attach the submission IDs by matching on the shop/item keys.
test_y = pd.merge(test_ids, test_y, how="outer", on=["shop_id", "item_id"])

In [None]:
# Write the submission file (ID, item_cnt_month). The folder is created first
# so the write cannot fail on a missing directory.
os.makedirs(SUBMISSION_FOLDER, exist_ok=True)
test_y[["ID", "item_cnt_month"]].to_csv(SUBMISSION_FOLDER+"catboost20210902synthetictrainval.csv",index=False)
# Score 2.71894