In [1]:
from itertools import product
from typing import Any

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

from src.ToyModel import *
from src.utilities import run_cv
from src.FeatureGenerator import *
from src.settings import RAW_PATH, PROCESSED_PATH, SHIFTS, WINS, grouppin_cols

# Initial merge of `sales_train` and `items`

In [23]:
# Raw sales records, read in full.
sales_train = pd.read_csv(f'{RAW_PATH}sales_train.csv')

# From the item table only the join key and the category id are kept.
item_cols = ['item_id', 'item_category_id']
items = pd.read_csv(f'{RAW_PATH}items.csv')[item_cols]

In [24]:
# Preview the first rows of the raw sales table to check its columns.
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [25]:
# Preview the item -> category lookup that will be joined onto the sales rows.
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [26]:
# Attach each item's category to every sales row.
# `how='left'` keeps all sales rows. `validate='many_to_one'` makes pandas raise
# a MergeError if `items` ever contains a duplicated `item_id`; without it such
# duplicates would silently multiply sales rows and inflate every aggregate
# computed from `merged_df`.
merged_df = sales_train.merge(items, how='left', on='item_id', validate='many_to_one')

In [27]:
# Count missing values per column. After the left merge, a non-zero count for
# `item_category_id` would mean some sold items had no match in `items`.
merged_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [29]:
# Persist the merged frame (without its index) so later sections can reload it
# with `pd.read_parquet` instead of repeating the CSV load and merge.
merged_df.to_parquet(PROCESSED_PATH + 'merged_train_df.parquet', index=False)

# Constructing target

This will be a simpler version of the target: instead of a rolling window of 30 days, we simply use the month id to sum the number of items sold. We will calculate this for each shop and item, add missing months with 0 sales, and shift the aggregate by 1 month.

This functionality has been moved to src.TestGenerator

## Building features

## Features describing sales for `shop_id` and `date_block_num`

For all feature construction we have to use the expanded dataset to preserve the sequence of months. If some months are missing, then shifts and window aggregates would not be correct, since we are not working with a datetime column here.

This functionality has been implemented and moved to src.FeatureGenerator

### Counts of deals per month and per shop, lags, rolling aggregates

This functionality has been implemented and moved to src.FeatureGenerator

### Aggregates over prices per month and per shop, lags, window aggregates

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `category_id` and `date_block_num`

In [2]:
# `grouppin_cols` is imported with the other settings in the notebook's top
# import cell, so this cell no longer carries its own import.

# Reload the merged sales data and keep only the columns the aggregates need.
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
local_df = merged_df[['date_block_num', 'item_id', 'shop_id', 'item_cnt_day', 'item_price']].reset_index(drop=True).copy()

# Column roles used by the following cells:
index_cols = ['shop_id', 'item_id', 'date_block_num']  # keys identifying one row of the feature frame
base_cols = ['item_price', 'item_cnt_day']             # raw columns that get aggregated
target_col = ['target']                                # kept as a list so it concatenates with the lists above

# Frame that all aggregates are merged onto.
# NOTE(review): `generate_backbone` is defined outside this notebook; presumably it
# returns the full shop/item/month grid — confirm in src.
res_df = generate_backbone()

In [4]:
# Every raw sales column is aggregated with the same set of functions.
agg_di = dict.fromkeys(base_cols, ROLL_FUNCS)

for k, group in grouppin_cols.items():
    agg_df = local_df.groupby(group, as_index=False).agg(agg_di)
    # The aggregation yields (column, function) header pairs; key columns carry an
    # empty function slot and keep their plain name, the rest become
    # "<column>_<function>_per_<grouping name>".
    agg_df.columns = [
        name if not func else f'{name}_{func}_per_{k}'
        for name, func in agg_df.columns
    ]
    # Left-join onto the backbone; combinations without sales get 0 instead of NaN.
    res_df = res_df.merge(agg_df, how='left', on=group).fillna(0)

# The per shop/item sum of sold units is the quantity we want to predict.
res_df = res_df.rename(columns={'item_cnt_day_sum_per_shop_item': 'target'})

# Everything that is neither a key, a raw column nor the target is a base feature.
base_feat_cols = [
    name for name in res_df.columns
    if name not in {*index_cols, *base_cols, *target_col}
]

In [5]:
cols_to_shift = base_feat_cols + target_col
shifted_cols = []
for shift in SHIFTS:
    # Names of the lagged copies produced in this pass.
    lagged_names = [f'{col}_lag_{shift}' for col in cols_to_shift]
    shifted_cols.extend(lagged_names)
    rename_dict = dict(zip(cols_to_shift, lagged_names))

    # Moving the month id forward by `shift` means that, after the join on the
    # index columns, each row receives the values observed `shift` months earlier.
    preshift_df = (
        res_df[index_cols + cols_to_shift]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + shift)
        .rename(columns=rename_dict)
    )

    # Rows without a counterpart `shift` months back get 0.
    res_df = res_df.merge(preshift_df, how='left', on=index_cols).fillna(0)

In [8]:
# Rolling means of the target per shop/item over each window length in WINS.
roll_cols = []
col = target_col[0]
# The target is aggregated at the 'shop_item' grouping, so its keys are used for
# de-duplication, grouping and the final merge.
group = grouppin_cols['shop_item']
for win_len in WINS:
    roll_df = res_df[group + [col]].drop_duplicates().sort_values(group)
    new_name = f'{col}_roll_mean_{win_len}'
    # Fix: the original grouped by `grouppin_cols[groupping_k][:-1]`, but `groupping_k`
    # was only assigned in a commented-out line, so the cell raised NameError on a
    # fresh kernel. For the target column that key was 'shop_item', i.e. `group`.
    # `group[:-1]` drops the last key so the window runs over months within each
    # remaining key combination (assumes `date_block_num` is the last entry of
    # grouppin_cols['shop_item'] — confirm in src.settings).
    # NOTE(review): with `closed='right'` each window includes the current month, so
    # this mean contains the same-month target — confirm it is lagged before it is
    # used as a model feature.
    roll_df = roll_df.groupby(group[:-1], as_index=False)\
                        .rolling(win_len, on='date_block_num', closed='right')[col].mean().fillna(0).reset_index()\
                        .rename(columns={col: new_name})
    res_df = res_df.merge(roll_df, how='left', on=group)
    roll_cols.append(new_name)

   shop_id  item_id  date_block_num  target_roll_mean_3
0       26        0               0                 0.0
1       26        0               1                 0.0
2       26        0               2                 0.0
3       26        0               3                 0.0
4       26        0               4                 0.0
------------------------------
   shop_id  item_id  date_block_num  target_roll_mean_9
0       26        0               0                 0.0
1       26        0               1                 0.0
2       26        0               2                 0.0
3       26        0               3                 0.0
4       26        0               4                 0.0
------------------------------


In [9]:
# Display the assembled frame: index columns, per-group aggregates, lags and rolling means.
res_df

Unnamed: 0,shop_id,item_id,date_block_num,item_price_sum_per_shop,item_cnt_day_sum_per_shop,item_price_sum_per_item,item_cnt_day_sum_per_item,item_price_sum_per_shop_item,target,item_price_sum_per_shop_lag_1,...,item_price_sum_per_shop_item_lag_6,target_lag_6,item_price_sum_per_shop_lag_12,item_cnt_day_sum_per_shop_lag_12,item_price_sum_per_item_lag_12,item_cnt_day_sum_per_item_lag_12,item_price_sum_per_shop_item_lag_12,target_lag_12,target_roll_mean_3,target_roll_mean_9
0,26,0,0,1.457946e+06,2331.0,0.0,0.0,0.0,0.0,0.000000e+00,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26,0,1,1.525165e+06,2597.0,0.0,0.0,0.0,0.0,1.457946e+06,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26,0,2,1.876700e+06,3036.0,0.0,0.0,0.0,0.0,1.525165e+06,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26,0,3,1.328178e+06,2381.0,0.0,0.0,0.0,0.0,1.876700e+06,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26,0,4,1.193884e+06,2148.0,0.0,0.0,0.0,0.0,1.328178e+06,...,0.0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2261335,28,22169,29,3.293298e+06,3921.0,0.0,0.0,0.0,0.0,3.236544e+06,...,0.0,0.0,3.891872e+06,5701.0,0.0,0.0,0.0,0.0,0.0,0.0
2261336,28,22169,30,3.092312e+06,3612.0,0.0,0.0,0.0,0.0,3.293298e+06,...,0.0,0.0,3.321702e+06,4536.0,0.0,0.0,0.0,0.0,0.0,0.0
2261337,28,22169,31,2.979148e+06,3749.0,0.0,0.0,0.0,0.0,3.092312e+06,...,0.0,0.0,3.771254e+06,5802.0,0.0,0.0,0.0,0.0,0.0,0.0
2261338,28,22169,32,2.896085e+06,2979.0,0.0,0.0,0.0,0.0,2.979148e+06,...,0.0,0.0,3.500124e+06,4403.0,0.0,0.0,0.0,0.0,0.0,0.0


This functionality has been moved to src.FeatureGenerator

## Feature describing sales for `shop_id`, `item_id` and `date_block_num`

This functionality has been implemented and moved to src.FeatureGenerator

## Autoregression features

This functionality has been implemented and moved to src.FeatureGenerator

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predict the number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create a feature dataset to predict the target for a given test.csv.

The FeatureGenerator and TestGenerator classes have been created and moved to src. We've imported them above and will demonstrate their usage here for cross-validation and for inference on a provided test index backbone.

In [3]:
# Helper objects from src, built with their default arguments.
feat_generator = FeatureGenerator()
test_generator = TestGenerator()

# Backbone restricted to the shop and item id columns (no month yet).
shop_item_backbone_back = generate_backbone(cols_for_backbone=['shop_id', 'item_id'])

# Suppose month 11 is the target month, i.e. the month we predict for.
backbone = test_generator.add_month_to_backbone(
    shop_item_backbone=shop_item_backbone_back,
    month_num=11,
)

# Attach the generated features to that backbone and preview the result.
test_features_df = test_generator.add_features_to_backbone(
    test_backbone=backbone,
    feat_generator=feat_generator,
)
test_features_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price_sum,item_price_mean,item_price_std,item_price_min,item_price_max,deals_count,item_price_sum_shift_2,...,deals_count_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,deals_count_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,deals_count_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12,deals_count_roll_max_12
0,0,0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3007530.0,593.904028,5064.0
1,0,1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3007530.0,593.904028,5064.0
2,0,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3007530.0,593.904028,5064.0
3,0,3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3007530.0,593.904028,5064.0
4,0,4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3007530.0,593.904028,5064.0


So, features for a generated shop-item backbone and a given month are generated correctly. Let's now use this functionality
to generate features for the provided test index backbone.

In [3]:
# The test month is November, i.e. the month right after the last train month (33), hence 34.
# NOTE(review): `test_month_num` is assigned but never used in this cell — confirm whether it
# should be passed on (e.g. through `add_month_to_backbone`) before the features are built.
test_month_num = 34 

# test_backbone in this case is the provided test dataset itself
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')

# NOTE(review): `test_generator` is not created in this cell; the call below relies on the
# instance left over from the previous demo cell.
feat_generator = FeatureGenerator()
test_features_df = test_generator.add_features_to_backbone(test_backbone=test_backbone, 
                                                           feat_generator=feat_generator)

In [4]:
# Preview the features generated for the provided test backbone.
test_features_df.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,0,5,5037,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5,5037,1,845.0,0.0,0.0,0.0,845.0,845.0,...,0.0,0.0,0.0,0.0,499735.96,591.403503,499735.96,591.403503,499735.96,591.403503
2,0,5,5037,2,1262.0,0.0,0.0,0.0,2107.0,2107.0,...,0.0,0.0,0.0,0.0,756773.413333,599.661976,756773.413333,599.661976,756773.413333,599.661976
3,0,5,5037,3,946.0,845.0,0.0,0.0,2208.0,3053.0,...,0.0,0.0,0.0,0.0,756773.413333,644.232294,756773.413333,644.232294,756773.413333,644.232294
4,0,5,5037,4,1060.0,1262.0,0.0,0.0,2006.0,4113.0,...,0.0,0.0,0.0,0.0,609443.75,644.232294,756773.413333,644.232294,756773.413333,644.232294


# Cross validation

Here we will check whether the existing functionality is enough to run a round of cross-validation.
Overall idea:
- data for all folds is generated using TestGenerator and FeatureGenerator classes
- data is split into train/test by the ordinal number of the month using sklearn's TimeSeriesSplit

In [53]:
# TODO: consider introducing 1-2 months gap between train and test
# TODO: it should be possible to use sklearn cross validation functionality: 
# 1) build df with all features and offload it to data/processed 
# 2) build custom layer to sample from the dataset using sklearn.TimeSeriesSplit based on months

In [4]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
# `unique()` returns values in order of first appearance, while TimeSeriesSplit
# splits purely by position; sort explicitly so that "earlier index" always
# means "earlier month", whatever the row order of the parquet file.
all_months = np.sort(merged_df['date_block_num'].unique())
# NOTE(review): WINS_SHIFTS is not among the names imported explicitly at the top of the
# notebook; presumably it arrives through one of the star imports — confirm.
all_months = all_months[all_months > max(WINS_SHIFTS)] # leaving enough months for longest shift/window calculation
print(f'all months: {all_months}\n len: {len(all_months)}')

# Each fold tests on 2 consecutive months and trains on at most the 11 months before them.
tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    # The indices are positions within `all_months`, not month numbers.
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

all months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33]
 len: 21
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10]
  Test:  index=[11 12]
Fold 1:
  Train: index=[ 2  3  4  5  6  7  8  9 10 11 12]
  Test:  index=[13 14]
Fold 2:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16]
Fold 3:
  Train: index=[ 6  7  8  9 10 11 12 13 14 15 16]
  Test:  index=[17 18]
Fold 4:
  Train: index=[ 8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[19 20]


Let's try the implemented functionality to run a simple time series cross validation. We will use a simple mean predictor as a model for now.

In [3]:
# initiating data generators
feat_generator = FeatureGenerator()
test_generator = TestGenerator(train=True)

# initiating toy model
model = ToyModel()

# initiating cv splitter
tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)

# generating all of the data which we will iterate over during CV
target_df = test_generator.generate_target_for_month(month_nums=list(range(13, 34)))
features_df = test_generator.add_features_to_backbone(test_backbone=target_df, 
                                                      feat_generator=feat_generator)

# creating col lists for training
cols_di={
    'index': ['shop_id', 'item_id', 'date_block_num'],
    'target': ['target'],
    'feats': feat_generator.base_feat_cols + feat_generator.lag_cols + feat_generator.roll_cols + ['sum_sales_cnt']
}
model = ToyModel()
# iterating over CV folds
cv_res = run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di, verbose=2)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23]
  Test months: [24 25]
  NRMSE:  1.2
  RMSE :  3.9


------------------------------
Fold 1:
  Train months: [15 16 17 18 19 20 21 22 23 24 25]
  Test months: [26 27]
  NRMSE:  0.96
  RMSE :  7.4


------------------------------
Fold 2:
  Train months: [17 18 19 20 21 22 23 24 25 26 27]
  Test months: [28 29]
  NRMSE:  0.82
  RMSE :  3.2


------------------------------
Fold 3:
  Train months: [19 20 21 22 23 24 25 26 27 28 29]
  Test months: [30 31]
  NRMSE:  0.9
  RMSE :  7.9


------------------------------
Fold 4:
  Train months: [21 22 23 24 25 26 27 28 29 30 31]
  Test months: [32 33]
  NRMSE:  0.89
  RMSE :  8.5


------------------------------
RMSE mean: 6.2
NRMSE mean: 0.95


We see that the functionality built so far is sufficient, so we can now move on to trying more complex models.

In [2]:
# `LinearRegression` and `XGBRegressor` are imported in the notebook's top import
# cell together with the other dependencies.

# initiating data generators
feat_generator = FeatureGenerator()
test_generator = TestGenerator(train=True)

# model under evaluation: XGBoost regressor with default hyper-parameters
# (alternatives kept for quick switching)
model = XGBRegressor()
# model = LinearRegression()
# model = ToyModel()

# generating all of the data which we will iterate over during CV
target_df = test_generator.generate_target_for_month(month_nums=list(range(13, 34)))
features_df = test_generator.add_features_to_backbone(test_backbone=target_df, 
                                                      feat_generator=feat_generator)


In [3]:
# Inspect which lag columns the feature generator produced.
# NOTE(review): the list shown below contains `*_shift_0` entries, i.e. a shift of
# zero months — confirm these are not same-month values that leak the target.
feat_generator.lag_cols

['item_price_sum_shift_1',
 'item_cnt_day_sum_shift_1',
 'item_price_sum_shift_2',
 'item_cnt_day_sum_shift_2',
 'item_price_sum_shift_6',
 'item_cnt_day_sum_shift_6',
 'item_price_sum_shift_12',
 'item_cnt_day_sum_shift_12',
 'item_price_sum_shift_0',
 'item_cnt_day_sum_shift_0']

In [4]:
# One held-out month per fold, at most 16 training months before it.
tscv = TimeSeriesSplit(test_size=1, max_train_size=16)

# Feature set for this run: base, lag and rolling columns
# ('sum_sales_cnt' is deliberately left out here).
feature_cols = (
    feat_generator.base_feat_cols
    + feat_generator.lag_cols
    + feat_generator.roll_cols
)

# Column roles handed to run_cv.
cols_di = {
    'index': ['shop_id', 'item_id', 'date_block_num'],
    'target': ['target'],
    'feats': feature_cols,
}

# Run the folds with the model configured in the previous cell.
cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=model,
    cols_di=cols_di,
    verbose=2,
)

7215 125002
Fold 0:
  Train months: [12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27]
  Test months: [28]
  NRMSE:  1.0
  RMSE :  5.1


------------------------------
6987 124021
Fold 1:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28]
  Test months: [29]
  NRMSE:  1.0
  RMSE :  4.4


------------------------------
6018 122774
Fold 2:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29]
  Test months: [30]
  NRMSE:  1.0
  RMSE :  4.2


------------------------------
5659 121206
Fold 3:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]
  Test months: [31]
  NRMSE:  1.0
  RMSE :  4.8


------------------------------
5971 118886
Fold 4:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31]
  Test months: [32]
  NRMSE:  1.0
  RMSE :  3.8


------------------------------
RMSE mean: 4.5
NRMSE mean: 1.0


In [None]:
# Scratch notes, not executable code: summary metrics recorded by hand,
# presumably from earlier CV runs (the settings that produced them are not
# recorded here). Kept as comments so this cell does not raise a SyntaxError
# under Restart & Run All.
#
# RMSE mean: 2.3
# NRMSE mean: 0.53
#
# RMSE mean: 2.3
# NRMSE mean: 0.52
#
# RMSE mean: 2.3
# NRMSE mean: 0.52

In [None]:

    # NOTE(review): everything below is commented-out code that ends in `return back_df`,
    # so it looks like the body of a helper whose `def` line is not present in this cell.
    # It is never executed; presumably a leftover draft for sampling a random shop/item
    # backbone — confirm it is no longer needed before deleting it.
    # num_shops = int(np.sqrt(test_size))
    # ideal_num_items = int(test_size / num_shops)
    # test_shops = np.random.choice(range(shop_id_min_max[0], 
    #                                     shop_id_min_max[1]+1), 
    #                               num_shops, replace=False)
    # back = []
    # for val in test_shops:
    #     num_items_to_pick = np.random.choice(range(int(ideal_num_items * (1-num_items_variety)),
    #                                             int(ideal_num_items * (1+num_items_variety))))
    #     test_items = np.random.choice(range(item_id_min_max[0], 
    #                                         item_id_min_max[1]+1), 
    #                                 num_items_to_pick, replace=False)
    #     back.append(pd.DataFrame(product([val], test_items), columns = ['shop_id', 'item_id']))
    # back_df = pd.concat(back, ignore_index=True).drop_duplicates()
    # return back_df