# Imports

In [2]:
import os
import time
import numpy as np
import pandas as pd

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

# Submission Util

In [4]:
def submission_df(shop_id_item_id_item_cnt_month_df):
    """Build the submission frame for the test set.

    Reads ``data/test.csv.gz`` and left-joins the given predictions onto it
    by ``shop_id`` / ``item_id``, so every row of the test file is kept.

    Args:
        shop_id_item_id_item_cnt_month_df: DataFrame with ``shop_id``,
            ``item_id`` and ``item_cnt_month`` columns.

    Returns:
        DataFrame with columns ``ID`` and ``item_cnt_month``. Test pairs
        without a matching prediction get 0, and all values are clipped
        into [0, 20].
    """
    join_keys = ["shop_id", "item_id"]
    test_pairs = pd.read_csv("data/test.csv.gz")
    merged = test_pairs.merge(
        shop_id_item_id_item_cnt_month_df,
        how="left",
        on=join_keys,
    )
    # Pairs absent from the predictions come out of the left join as NaN.
    monthly_counts = merged["item_cnt_month"].fillna(0).clip(0, 20)
    return merged[["ID"]].assign(item_cnt_month=monthly_counts)

def create_submission_file(shop_id_item_id_item_cnt_day_df, submission_name="submission", add_timestamp=True):
    """Write a submission CSV under ``submissions/``.

    Args:
        shop_id_item_id_item_cnt_day_df: Predictions per shop/item pair.
            Despite the parameter name, the frame is passed straight to
            ``submission_df``, which reads an ``item_cnt_month`` column.
        submission_name: File name without extension, relative to
            ``submissions/``. May contain sub-directories
            (e.g. ``"benchmarks/foo"``); they are created as needed.
        add_timestamp: When true, append ``_<unix seconds>`` to the name so
            repeated runs do not overwrite each other.

    Returns:
        None. The file is written to ``submissions/<submission_name>.csv``.
    """
    if add_timestamp:
        submission_name = "%s_%d" % (submission_name, time.time())

    path = os.path.abspath("submissions/%s.csv" % submission_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    sub_df = submission_df(shop_id_item_id_item_cnt_day_df)
    sub_df.to_csv(path, sep=",", index=False)

# Preprocessing Util

In [5]:
def parse_date(df):
    """Return a copy of *df* whose ``date`` column is parsed to datetimes.

    The column is expected to hold ``DD.MM.YYYY`` strings. The input frame
    is left unmodified.
    """
    parsed_dates = pd.to_datetime(df["date"], format="%d.%m.%Y")
    return df.assign(date=parsed_dates)

def day_month_year(df):
    """Return a copy of *df* with ``day``, ``month`` and ``year`` columns.

    The values are extracted from the ``date`` column, which must already be
    datetime-typed (e.g. the output of ``parse_date``). The input frame is
    left unmodified.

    Uses the vectorized ``.dt`` accessor instead of a per-row ``apply``: it
    avoids a Python-level call per row and yields integer columns even when
    the frame is empty.
    """
    dates = df["date"]
    return df.assign(
        day=dates.dt.day,
        month=dates.dt.month,
        year=dates.dt.year,
    )

# Transformers: wrap the DataFrame helpers so they can serve as pipeline steps.
# validate=False is passed so the DataFrame reaches the helper functions
# without array validation.
parse_date_transformer = FunctionTransformer(func=parse_date, validate=False)
day_month_year_transformer = FunctionTransformer(func=day_month_year, validate=False)

# Pipelines: dates are parsed first, because day_month_year reads the
# day/month/year attributes of the values in the "date" column.
date_pipeline = make_pipeline(
    parse_date_transformer,
    day_month_year_transformer,
)

# Main Pipeline: currently wraps only the date pipeline.
preprocessing_pipeline = make_pipeline(date_pipeline)


# Load Data

In [6]:
# Load the daily sales and run them through the preprocessing pipeline.
# fit_transform is used instead of transform: the pipeline was never fitted,
# and recent scikit-learn releases warn (and are slated to raise) when
# transform is called on an unfitted Pipeline.
sales_df = preprocessing_pipeline.fit_transform(pd.read_csv("data/sales_train.csv.gz"))
# Shop/item pairs (with their submission IDs) to predict for.
test_df = pd.read_csv("data/test.csv.gz")

# Previous Value Benchmark 

A good exercise is to reproduce the previous_value_benchmark. As the name suggests, in this benchmark the prediction for each shop/item pair is simply its monthly sales from the previous month, i.e. October 2015.

The most important step in reproducing this score is correctly aggregating the daily data and constructing a monthly sales data frame. You need to get the lagged values, fill NaNs with zeros and clip the values into the [0,20] range. If you do it correctly, you'll get precisely 1.16777 on the public leaderboard.

Generating features like this is a necessary basis for more complex models. Also, if you decide to fit a model, don't forget to clip the target into the [0,20] range; it makes a big difference.

In [7]:
# Boolean mask selecting the sales rows from October 2015.
ind_previous_month = (sales_df["year"] == 2015) & (sales_df["month"] == 10)

# Sum the daily counts per shop/item pair to get that month's totals.
sales_previous_month_df = (
    sales_df.loc[ind_previous_month]
    .groupby(["shop_id", "item_id"], as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)

In [8]:
# Preview the first rows of the aggregated monthly sales frame.
sales_previous_month_df.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,31,1.0
1,2,486,3.0
2,2,787,1.0
3,2,794,1.0
4,2,968,1.0


# Submission

In [9]:
# Write the benchmark to submissions/benchmarks/ under a fixed name
# (no timestamp suffix), so re-running the cell overwrites the same file.
submission_name = "benchmarks/previous_value_benchmark"
create_submission_file(
    sales_previous_month_df,
    submission_name=submission_name,
    add_timestamp=False,
)

In [None]:
ls submissions/benchmarks/

> Score: 1.16777