In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import sklearn as sk
import os

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Treat +/-inf like missing values in pandas' null handling for the whole notebook.
# NOTE(review): this option is deprecated in recent pandas releases — confirm the
# installed version still accepts it before upgrading.
pd.options.mode.use_inf_as_na = True

# Folder holding the competition CSV files. Set the PREDICT_FUTURE_SALES_DATA
# environment variable to point elsewhere; the default is the original local path.
DATA_FOLDER = os.environ.get(
    "PREDICT_FUTURE_SALES_DATA",
    '/home/roger/code/kaggle/predict-future-sales/data')

transactions       = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'))
items              = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories    = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
item_categories_en = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories_en.csv'))
shops              = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
shops_en           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops_en.csv'))
test               = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))
sample             = pd.read_csv(os.path.join(DATA_FOLDER, 'sample_submission.csv'))

# Adding English translations to item_categories and shops.
# merge() defaults to an inner join, so an id with no row in the *_en table
# is dropped from the result.
item_categories = item_categories.merge(
    item_categories_en[["item_category_id", "item_category_name"]]
        .rename(columns={"item_category_name":"item_category_name_en"}),
    on="item_category_id")

shops = shops.merge(
    shops_en[["shop_id", "shop_name"]]
        .rename(columns={"shop_name":"shop_name_en"}),
    on="shop_id")

# Key columns identifying one (month block, shop, item) observation.
index_vars = ["date_block_num", "shop_id", "item_id"]

In [None]:
# Print the first rows of each table, transposed so that columns read as rows.
for frame, n_rows in ((transactions, 2), (items, 1), (item_categories, 2), (shops, 2)):
    print(frame.head(n_rows).T)

In [None]:
# Parse the day.month.year strings in "date" into datetime values.
transactions["date"] = pd.to_datetime(transactions["date"], format="%d.%m.%Y")

# Season label for each two-digit month string. The i-th character of the
# code string is the label of month i (January first):
#   "2" for 12, 01, 02 · "3" for 03-05 · "4" for 06-08 · "1" for 09-11.
seasons = {
    f"{month_number:02}": label
    for month_number, label in zip(range(1, 13), "223334441112")
}

# "month": zero-padded month string; "season": label looked up from it;
# "quarter": calendar quarter (1-4) stored as a string.
transactions["month"] = transactions["date"].dt.month.map("{:02}".format)
transactions["season"] = transactions["month"].map(seasons)
quarter_number = (transactions["month"].map(int) + 2) // 3
transactions["quarter"] = quarter_number.map(str)

In [None]:
# Months_on_sale
# TODO: Cannot tell how many months item was sold before dataset started.

# Monthly total per (date_block_num, shop_id, item_id): sum of the daily counts.
sales_by_month = (
    transactions
    .groupby(index_vars, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)

# Earliest month block in which each shop/item pair has a positive monthly total.
has_positive_sales = sales_by_month["item_cnt_month"] > 0
first_month = (
    sales_by_month.loc[has_positive_sales]
    .groupby(["shop_id", "item_id"], as_index=False)["date_block_num"]
    .min()
    .rename(columns={"date_block_num": "first_sold_month"})
    # NOTE(review): the object dtype carries over to the subtraction below —
    # confirm this is intended.
    .astype({"first_sold_month": object})
)

# Left join keeps pairs that never had a positive month; they get a missing
# first_sold_month and therefore a missing months_since_first_sale.
sales_by_month = sales_by_month.merge(first_month, how="left", on=["shop_id", "item_id"])
sales_by_month["months_since_first_sale"] = sales_by_month["date_block_num"].sub(
    sales_by_month["first_sold_month"])

In [None]:
# item_cnt_month-i: the same shop/item's item_cnt_month from i months earlier
# (missing when there is no row for that earlier month).
MAX_LAG_MONTHS = 12
lag_range = range(1, MAX_LAG_MONTHS + 1)

# Make the cell safe to re-run: without this, a second run would merge against
# the lag columns created by the first run and produce "_y"-suffixed duplicates.
stale_columns = [
    f"{prefix}{i}" for prefix in ("month-", "item_cnt_month-") for i in lag_range
]
sales_by_month = sales_by_month.drop(columns=stale_columns, errors="ignore")

# Helper key columns: month-i is the month block i months before this row's.
# They stay in the frame after the merges, as before.
for i in lag_range:
    sales_by_month[f"month-{i}"] = sales_by_month["date_block_num"] - i

# The lookup side of every merge is the same, so it is built once.
monthly_counts = sales_by_month[index_vars + ["item_cnt_month"]]

for i in lag_range:
    left_vars = [f"month-{i}", "shop_id", "item_id"]

    sales_by_month = sales_by_month.merge(
        monthly_counts.rename(columns={"item_cnt_month": f"item_cnt_month-{i}"}),
        left_on=left_vars,
        right_on=index_vars,
        how="left",
        suffixes=("", "_y"))
    # The lookup's own date_block_num comes back suffixed; it only duplicates month-i.
    sales_by_month = sales_by_month.drop(columns="date_block_num_y")

In [None]:
# item_name_length: number of characters in each item name once leading and
# trailing whitespace is stripped.
items["item_name_length"] = [len(name.strip()) for name in items["item_name"]]

# Line plot of how many items (y) have each name length (x).
name_length_counts = items["item_name_length"].value_counts().sort_index()
name_length_counts.plot()

In [None]:
# Converting item_name, item_category_name, and shop_name to bag-of-words (bow) count columns

def word_counts(df, name_col, bow_name_prefix, top_n_words=None):
    """Append bag-of-words count columns for a text column of ``df``.

    A ``CountVectorizer`` (accents stripped, lower-cased) is fitted on
    ``df[name_col]``. The most frequent vocabulary words are added to a new
    frame as one count column per word, named ``f"{bow_name_prefix}{bow_key}"``
    where ``bow_key`` is the word's column position in the vectorizer output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is not modified. Its index is reused
        for the count columns (assumed unique — a duplicated index would
        multiply rows in the index merge).
    name_col : str
        Name of the text column to vectorize.
    bow_name_prefix : str
        Prefix of the generated count columns.
    top_n_words : int, optional
        Keep the words whose total count is strictly greater than the count of
        the word at 0-based rank ``top_n_words`` (most frequent first): at most
        ``top_n_words`` words, fewer when counts tie at the cut-off. When the
        vocabulary has ``top_n_words`` words or fewer, every word is kept.
        ``None`` (default) keeps the words that occur more than once.

    Returns
    -------
    out : pandas.DataFrame
        ``df`` with the selected count columns appended, ordered from the most
        to the least frequent word.
    bow_reference : pandas.DataFrame
        One row per vocabulary word with columns ``bow_key``, ``words``,
        ``counts`` and the boolean ``top_n``, sorted by ``counts`` descending.
    """
    vectorizer = CountVectorizer(strip_accents="unicode", lowercase=True)
    x = vectorizer.fit_transform(df[name_col])

    # Total occurrences of each vocabulary word; asarray + ravel flattens the
    # result of the column sum whether it comes back 1-D or as a (1, n) matrix.
    counts = np.asarray(x.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back only on releases that predate get_feature_names_out().
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    words = list(get_names())

    # Reference table: bow_key (column position in x), word and total count.
    bow_reference = (pd.DataFrame({"words": words, "counts": counts})
        .reset_index()
        .rename(columns={"index":"bow_key"})
        .sort_values("counts",ascending=False))

    if top_n_words is None:
        min_count = 1
    elif top_n_words < len(bow_reference):
        min_count = bow_reference["counts"].iloc[top_n_words]
    else:
        # Fewer words than requested: every word has a count of at least 1,
        # so a threshold of 0 keeps the whole vocabulary.
        min_count = 0

    bow_reference["top_n"] = bow_reference["counts"] > min_count

    # Column positions of the selected words, most frequent first.
    keys = bow_reference.loc[bow_reference["top_n"], "bow_key"]

    # Reuse df's index so the index merge lines rows up even when df does not
    # carry a default 0..n-1 index.
    bow_counts = pd.DataFrame(
        x[:, keys.to_numpy()].toarray(),
        columns=[f"{bow_name_prefix}{key}" for key in keys.tolist()],
        index=df.index)

    out = df.merge(bow_counts, left_index=True, right_index=True)

    return out, bow_reference

In [None]:
# Item names: keep at most the 100 most frequent words; `items` itself is
# replaced by the widened frame.
items, item_bow_ref = word_counts(
    items, name_col="item_name", bow_name_prefix="item_bow_", top_n_words=100)

# Category and shop names: default word selection, results kept in new frames.
items_cat_bow, items_cat_bow_ref = word_counts(
    item_categories, name_col="item_category_name", bow_name_prefix="cat_bow_")
shops_bow, shops_bow_ref = word_counts(
    shops, name_col="shop_name", bow_name_prefix="shop_bow_")