# Imports

In [47]:
from copy import deepcopy

# Imports
import pandas as pd

# Metrics
from sklearn.metrics import mean_squared_error

# Data transforms
from itertools import product

# Building model
from sklearn.base import BaseEstimator

# Import data

In [2]:
# Load the three input files: the daily sales history, the shop/item pairs
# to predict, and the sample submission.
train = pd.read_csv("./data/sales_train.csv")
test = pd.read_csv("./data/test.csv")
sample_submission = pd.read_csv("./data/sample_submission.csv")

In [3]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [5]:
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [7]:
# Helper function to fill missing data in the train set
def fill_data(train):
    """Aggregate daily sales to monthly totals on a dense shop/item/month grid.

    Daily rows are summed into ``item_cnt_month`` per
    (``date_block_num``, ``shop_id``, ``item_id``). Every combination of the
    shops, items and periods that occur in ``train`` is then materialised, and
    combinations without any sales record get ``item_cnt_month = 0``.

    Parameters:
    train (pandas.DataFrame): Daily sales with at least the columns
        ``date_block_num``, ``shop_id``, ``item_id`` and ``item_cnt_day``

    Returns:
    pandas.DataFrame: Columns ``shop_id``, ``item_id``, ``date_block_num``,
        ``item_cnt_month``; one row per grid cell, with ``date_block_num``
        varying fastest and ``shop_id`` slowest
    """
    keys = ["shop_id", "item_id", "date_block_num"]

    monthly = (
        train.groupby(["date_block_num", "shop_id", "item_id"])["item_cnt_day"]
        .sum()
        .reset_index(name="item_cnt_month")
    )

    # Build the Cartesian grid inside pandas instead of a Python-level list of
    # tuples: same rows in the same order, without one tuple object per cell.
    grid = pd.MultiIndex.from_product(
        [monthly[key].unique() for key in keys], names=keys
    ).to_frame(index=False)

    grid = grid.merge(monthly, on=keys, how="left")
    # Cells absent from the aggregated data had no recorded sales.
    grid["item_cnt_month"] = grid["item_cnt_month"].fillna(0)

    return grid

# Build the dense monthly grid; it is the frame used for validation and for
# fitting the models below.
X = fill_data(train)
# Preview the first rows of the grid.
X.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,0,32,0,6.0
1,0,32,1,10.0
2,0,32,2,0.0
3,0,32,3,0.0
4,0,32,4,0.0


# Validation approach

In [83]:
def cv(model):
    """Run a rolling time-based validation of ``model`` and print fold RMSEs.

    Three folds are evaluated; each fits on six consecutive
    ``date_block_num`` periods of the module-level frame ``X`` and scores on
    the period that immediately follows (31, 32 and 33). The model is
    deep-copied per fold, so the instance passed in is never fitted here.
    Predictions and targets are clipped to [0, 20] before scoring.

    Parameters:
    model: Estimator exposing ``fit(frame)`` and ``predict(frame)``; ``fit``
        receives the fold's rows including ``item_cnt_month``, ``predict``
        receives them without that column
    """
    from math import sqrt

    results = []

    # (training periods, validation periods); the window slides by one period.
    folds = [(list(range(25, 31)), [31]), (list(range(26, 32)), [32]), (list(range(27, 33)), [33])]

    for train_range, test_range in folds:
        train_data = X.loc[X["date_block_num"].isin(train_range)]
        test_data = X.loc[X["date_block_num"].isin(test_range)]

        local_model = deepcopy(model)

        local_model.fit(train_data)
        y_pred = local_model.predict(test_data.drop("item_cnt_month", axis=1))
        y_true = test_data["item_cnt_month"]

        # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``,
        # which newer scikit-learn releases deprecated and later removed.
        mse = mean_squared_error(y_true.clip(0, 20), y_pred.clip(0, 20))
        results.append(sqrt(mse))

    print(results)

# Make submission helper

In [65]:
def make_submission(name, model):
    """Helper function to generate a submission file.

    Predicts on the module-level ``test`` frame, clips the predictions to
    [0, 20] and writes them to ``./submissions/submission_{name}.csv``.

    Parameters:
    name (string): Name of the submission; becomes part of the file name
    model (sklearn.base.BaseEstimator): Model to make prediction; its
        ``predict`` must return one value per row of ``test``

    """
    import os

    y_test = model.predict(test).clip(0, 20)

    # Prefer the explicit ID column: the row index equals the ID only while
    # ``test`` keeps its default index and original row order.
    ids = test["ID"].to_numpy() if "ID" in test.columns else test.index

    submission = pd.DataFrame({
        "ID": ids,
        "item_cnt_month": y_test,
    })

    # ``to_csv`` does not create missing directories.
    os.makedirs("./submissions", exist_ok=True)
    submission.to_csv(f"./submissions/submission_{name}.csv", index=False)

    print(f"Submission {name} was generated.")

## Предсказание на основании константы

Создадим модель, которая предсказывает константу. Будем использовать её как бейслайн.

In [84]:
class ConstantRegressor(BaseEstimator):
    """Baseline estimator that predicts one fixed value for every row.

    Parameters:
    value: The constant returned for each prediction
    """

    def __init__(self, value):
        self.value = value

    def fit(self, X):
        """Do nothing and return the estimator; no state is learned from ``X``."""
        return self

    def predict(self, X):
        """Return a Series of ``self.value`` with one entry per row of ``X``.

        The result is indexed 0..n-1 regardless of the index of ``X``.
        """
        n_rows = X.shape[0]
        return pd.Series(self.value, index=range(n_rows))

# Baseline model: predict 0 for every row
model = ConstantRegressor(0)

# Validation: prints the RMSE of each time-based fold
cv(model)

# Submission: the constant model learns nothing in fit, so it is used as is
make_submission("constant_0", model)

[0.44581002663061814, 0.48399744540120154, 0.49713491044374875]
Submission constant_0 was generated.


Результат модели на Kaggle `1.25011`.

## Предсказание на основании предыдущего периода

In [85]:
class LastPeriodRegressor(BaseEstimator):
    """Estimator based on data from the previous period.

    For each (``shop_id``, ``item_id``) pair the prediction is the
    ``item_cnt_month`` observed in the latest ``date_block_num`` seen during
    ``fit``; pairs that were not seen in that period are predicted as 0.
    """

    def fit(self, X):
        """Store the rows of the most recent period of ``X``.

        Parameters:
        X (pandas.DataFrame): Frame with the columns ``date_block_num``,
            ``shop_id``, ``item_id`` and ``item_cnt_month``

        Returns:
        LastPeriodRegressor: The fitted estimator itself
        """
        latest_period = X["date_block_num"].max()
        # Keep only the join keys and the target, so nothing else can clash
        # with columns of the frame passed to predict. Boolean ``.loc``
        # selection already yields a new frame, so no explicit copy is needed.
        self.last = X.loc[
            X["date_block_num"] == latest_period,
            ["shop_id", "item_id", "item_cnt_month"],
        ]
        return self

    def predict(self, X):
        """Look up the stored last-period value for every row of ``X``.

        Parameters:
        X (pandas.DataFrame): Frame with at least ``shop_id`` and ``item_id``

        Returns:
        pandas.Series: Predictions named ``item_cnt_month``, indexed 0..n-1
            in the row order of ``X``
        """
        keys = ["shop_id", "item_id"]

        # Join on the key columns only: extra columns of X (for instance a
        # leftover ``item_cnt_month``) would otherwise be suffixed by merge
        # and make the lookup of the prediction column fail.
        merged = X[keys].merge(self.last, how="left", on=keys)

        # Pairs without a record in the last period are predicted as 0.
        return merged["item_cnt_month"].fillna(0)

# Model that repeats the value of the last training period
model = LastPeriodRegressor()

# Validation (cv fits deep copies, so `model` itself is still unfitted here)
cv(model)

# Train on the full grid
model.fit(X)

# Create submission
make_submission("last_period", model)

[0.3959110258919294, 0.4370807413379953, 0.5037204333030488]
Submission last_period was generated.


Result on Kaggle: `1.16777`.

## Предсказание средним за n последних периодов

In [100]:
class LastNRegressor(BaseEstimator):
    """Estimator based on an aggregate over the last n training periods.

    For each (``shop_id``, ``item_id``) pair the prediction is the mean or
    median of ``item_cnt_month`` over the ``n`` highest ``date_block_num``
    values seen during ``fit``; pairs absent from that window are predicted
    as 0.

    Parameters:
    type (string): Aggregation to apply, either "mean" or "median"
    n (int): Number of most recent periods to aggregate over
    """

    def __init__(self, type="mean", n=1):
        self.type = type
        self.n = n

    def fit(self, X):
        """Aggregate the last ``n`` periods of ``X`` per shop/item pair.

        Parameters:
        X (pandas.DataFrame): Frame with the columns ``date_block_num``,
            ``shop_id``, ``item_id`` and ``item_cnt_month``

        Returns:
        LastNRegressor: The fitted estimator itself

        Raises:
        ValueError: If ``type`` is neither "mean" nor "median"
        NameError: If ``X`` contains fewer than ``n`` distinct periods
        """
        # Validate here rather than in __init__ so the constructor only stores
        # its arguments. An unknown type used to fall back to the median
        # silently, which hid typos such as "meen".
        if self.type not in ("mean", "median"):
            raise ValueError(f"type should be 'mean' or 'median', got {self.type!r}.")

        if X["date_block_num"].nunique() < self.n:
            # NOTE(review): ValueError would be the conventional type; NameError
            # is kept so that existing callers catching it keep working.
            raise NameError(f"Train data should contain at least {self.n} periods.")

        last_period = X["date_block_num"].max()

        # The window is selected by period number, so it covers n periods
        # only when the period numbers are consecutive.
        recent = X[X["date_block_num"] > last_period - self.n]

        self.last = (
            recent.groupby(["shop_id", "item_id"])["item_cnt_month"]
            .agg(self.type)
            .reset_index(name="value")
        )

        return self

    def predict(self, X):
        """Look up the stored aggregate for every row of ``X``.

        Parameters:
        X (pandas.DataFrame): Frame with at least ``shop_id`` and ``item_id``

        Returns:
        pandas.Series: Predictions named ``value``, indexed 0..n-1 in the row
            order of ``X``
        """
        keys = ["shop_id", "item_id"]

        # Join on the key columns only, so that other columns of X cannot
        # clash with the stored ``value`` column.
        merged = X[keys].merge(self.last, how="left", on=keys)

        # Pairs without data in the aggregation window are predicted as 0.
        return merged["value"].fillna(0)

# Validation: compare the default (mean) aggregation for window sizes n = 1..6
model1 = LastNRegressor(n=1)
model2 = LastNRegressor(n=2)
model3 = LastNRegressor(n=3)
model4 = LastNRegressor(n=4)
model5 = LastNRegressor(n=5)
model6 = LastNRegressor(n=6)

# Each call prints one list holding the RMSE of the three folds
cv(model1)
cv(model2)
cv(model3)
cv(model4)
cv(model5)
cv(model6)

[0.3959110258919294, 0.4370807413379953, 0.5037204333030488]
[0.3755607537581353, 0.4260474707331795, 0.47103188352220626]
[0.3876995966945143, 0.43262718982465254, 0.4581371201942133]
[0.3968842771370006, 0.44841803518026235, 0.45805007985104557]
[0.3983392097762329, 0.45947577698541997, 0.468807984801644]
[0.39651072256006165, 0.4608998910887789, 0.47665548220661663]


In [102]:
# Mean over the last 32 periods (default aggregation type is "mean")
model = LastNRegressor(n = 32)

# Train on the full grid
model.fit(X)

# Create submission
make_submission("last_mean", model)

Submission last_mean was generated.


Results on Kaggle:
n=2 - `1.16777`
n=32 - `1.18864`

In [101]:
# Validation: compare the median aggregation for window sizes n = 1..6
model1 = LastNRegressor(type="median", n=1)
model2 = LastNRegressor(type="median", n=2)
model3 = LastNRegressor(type="median", n=3)
model4 = LastNRegressor(type="median", n=4)
model5 = LastNRegressor(type="median", n=5)
model6 = LastNRegressor(type="median", n=6)

# Each call prints one list holding the RMSE of the three folds
cv(model1)
cv(model2)
cv(model3)
cv(model4)
cv(model5)
cv(model6)

[0.3959110258919294, 0.4370807413379953, 0.5037204333030488]
[0.3755607537581353, 0.4260474707331795, 0.47103188352220626]
[0.3791766536246795, 0.4452696745538588, 0.4649119642120707]
[0.3695921265159397, 0.439545442082874, 0.45796377627360163]
[0.37507992249413263, 0.4409516188878128, 0.4612736946525961]
[0.3723243252300437, 0.4362108200737118, 0.45702114802404337]


In [103]:
# Median over the last 32 periods
model = LastNRegressor(type="median", n = 32)

# Train on the full grid
model.fit(X)

# Create submission
make_submission("last_median", model)

Submission last_median was generated.


Results on Kaggle:
n=2 - `1.16777`
n=32 - `1.19240`