In [4]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from itertools import product
from typing import Any
import pandas as pd
import numpy as np

from src.ToyModel import *
from src.TestGenerator import *
from src.utilities import run_cv
from src.FeatureGenerator import *
from src.settings import RAW_PATH, PROCESSED_PATH, WINS_SHIFTS, ROLL_FUNCS, COLS_MIN_MAX

# Initial merge of `sales_train` and `items`

In [23]:
sales_train = pd.read_csv(RAW_PATH + 'sales_train.csv')
items = pd.read_csv(RAW_PATH + 'items.csv')[['item_id', 'item_category_id']]

In [24]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [25]:
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [26]:
merged_df = sales_train.merge(items, how='left', on='item_id')

In [27]:
merged_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [29]:
merged_df.to_parquet(PROCESSED_PATH + 'merged_train_df.parquet', index=False)

# Constructing target

This will be a simpler version of target where instead of rolling window of 30 days we will simply use the month id to sum aggregate number of items sold. We will calculate this for each shop and item, add missing months with 0 sales and shift the aggregate by 1 month.

This functionality has been moved to src.TestGenerator

## Building features

## Features describing sales for `shop_id` and `date_block_num`

For all feature construction we have to use expanded dataset to preserve the sequence of months. If some months are missing, than shifts and window aggregates would not be correct since we are not working with the datetime column here.

This functionality has been implemented and moved to src.FeatureGenerator

### Counts of deals per month and per shop, lags, rolling aggregates

This functionality has been implemented and moved to src.FeatureGenerator

### Aggregates over prices per month and per shop, lags, window aggregates

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `category_id` and `date_block_num`

In [7]:
# TODO: build

## Feature describing sales for `shop_id`, `item_id` and `date_block_num`

In [None]:
# TODO: build

## Autoregression features

In [8]:
# TODO: build

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predic number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create feature dataset to predict target for a given test.csv.

FeatureGenerator and TestGenerator classes have been created and moved to src. We've imported it above and will demonstrate usage for the cases of crossvalidation and inference for a provided test index backbone here.

In [2]:
feat_generator = FeatureGenerator()
test_generator = TestGenerator()
shop_item_backbone_back = test_generator.generate_shop_item_backbone(test_size=1000)
backbone = test_generator.add_month_to_backbone(shop_item_backbone = shop_item_backbone_back, 
                                  month_num=11)
test_features_df = test_generator.add_features_to_backbone(test_backbone=backbone, 
                                            feat_generator=feat_generator)
test_features_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,deals_cnt_roll_sum_12,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,7,4070,10,1960.0,2050.0,1539.0,0.0,3831.0,11750.0,21427.0,...,1208660.0,653.329888,947112.4,637.356965,1805427.0,921.136026,1805427.0,921.136026,1805427.0,921.136026
1,4,12143,10,1209.0,1239.0,1113.0,0.0,2450.0,7896.0,14785.0,...,935770.9,688.788252,182636.5,688.788252,1110282.0,918.34709,1110282.0,918.34709,1365703.0,918.34709
2,18,12077,10,1674.0,1769.0,2470.0,0.0,3229.0,11269.0,24001.0,...,1310901.0,877.704487,1310901.0,877.704487,2130369.0,1272.622116,2174952.0,1272.622116,2513204.0,1272.622116
3,48,19324,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23,20094,10,0.0,0.0,0.0,0.0,0.0,0.0,6963.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1395718.0,760.609444


So, features for a generated shop-item backbone and a given month are generated correctly. Let's now use this functionality could be used to generate features for the provided test index backbone.

In [3]:
# we are told that the test month is November, so following the last one in the train set, so 34-th 
test_month_num = 34 

# test_backbone in this case is the provided test dataset itself
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')

feat_generator = FeatureGenerator()
test_features_df = test_generator.add_features_to_backbone(test_backbone=test_backbone, 
                                            feat_generator=feat_generator)

In [4]:
test_features_df.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,0,5,5037,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5,5037,1,845.0,0.0,0.0,0.0,845.0,845.0,...,0.0,0.0,0.0,0.0,499735.96,591.403503,499735.96,591.403503,499735.96,591.403503
2,0,5,5037,2,1262.0,0.0,0.0,0.0,2107.0,2107.0,...,0.0,0.0,0.0,0.0,756773.413333,599.661976,756773.413333,599.661976,756773.413333,599.661976
3,0,5,5037,3,946.0,845.0,0.0,0.0,2208.0,3053.0,...,0.0,0.0,0.0,0.0,756773.413333,644.232294,756773.413333,644.232294,756773.413333,644.232294
4,0,5,5037,4,1060.0,1262.0,0.0,0.0,2006.0,4113.0,...,0.0,0.0,0.0,0.0,609443.75,644.232294,756773.413333,644.232294,756773.413333,644.232294


# Cross validation pathfinder

In here we will check if existing functionality is enough to do a round of crossvalidation.
Overall idea:
- data for all folds is generated using TestGenerator and FeatureGenerator classes
- data is split to train/test by order number of month using sklearn.TimeSeriesSplit

In [53]:
# TODO: consider introducing 1-2 months gap between train and test
# TODO: it should be possible to use sklearn cross validation functionality: 
# 1) build df with all features and offload it to data/processed 
# 2) build custom layer to sample for that dataset using sklearn.TimeSeriesSplit based on months

In [4]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
all_months = merged_df['date_block_num'].unique()
all_months = all_months[all_months > max(WINS_SHIFTS)] # leaving enough months for longest shift/window calculation
print(f'all months: {all_months}\n len: {len(all_months)}')

tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

all months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33]
 len: 21
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10]
  Test:  index=[11 12]
Fold 1:
  Train: index=[ 2  3  4  5  6  7  8  9 10 11 12]
  Test:  index=[13 14]
Fold 2:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16]
Fold 3:
  Train: index=[ 6  7  8  9 10 11 12 13 14 15 16]
  Test:  index=[17 18]
Fold 4:
  Train: index=[ 8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[19 20]


We see that functionality built is sufficient for a simple time series cross validation.

In [8]:
# initiating data generators
feat_generator = FeatureGenerator()
test_generator = TestGenerator(train=True)

# initiating toy model
model = ToyModel()

# initiating cv splitter
tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)

# generating all of the data which we will iterate over during CV
target_df = test_generator.generate_target_for_month(list(range(13, 34)))
features_df = test_generator.add_features_to_backbone(test_backbone=target_df, 
                                                      feat_generator=feat_generator)

# creating col lists for training
cols_di={
    'index': ['shop_id', 'item_id', 'date_block_num'],
    'target': ['target'],
    'feats': feat_generator.base_feat_cols + feat_generator.lag_cols + feat_generator.roll_cols
}

# iterating over CV folds
run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di)

Fold 0:
  Train: target months=[16 14 15 32 13 17 19 20 22 25]
  Test:  target months=[30 24]
  NRMSE:  1.0
  RMSE :  3.5

Fold 1:
  Train: target months=[14 15 32 13 17 19 20 22 25 30 24]
  Test:  target months=[26 27]
  NRMSE:  1.0
  RMSE :  7.7

Fold 2:
  Train: target months=[32 13 17 19 20 22 25 30 24 26 27]
  Test:  target months=[29 31]
  NRMSE:  1.0
  RMSE :  8.8

Fold 3:
  Train: target months=[17 19 20 22 25 30 24 26 27 29 31]
  Test:  target months=[23 18]
  NRMSE:  1.0
  RMSE :  6.8

Fold 4:
  Train: target months=[20 22 25 30 24 26 27 29 31 23 18]
  Test:  target months=[21 28]
  NRMSE:  1.0
  RMSE :  6.5



In [None]:

    # num_shops = int(np.sqrt(test_size))
    # ideal_num_items = int(test_size / num_shops)
    # test_shops = np.random.choice(range(shop_id_min_max[0], 
    #                                     shop_id_min_max[1]+1), 
    #                               num_shops, replace=False)
    # back = []
    # for val in test_shops:
    #     num_items_to_pick = np.random.choice(range(int(ideal_num_items * (1-num_items_variety)),
    #                                             int(ideal_num_items * (1+num_items_variety))))
    #     test_items = np.random.choice(range(item_id_min_max[0], 
    #                                         item_id_min_max[1]+1), 
    #                                 num_items_to_pick, replace=False)
    #     back.append(pd.DataFrame(product([val], test_items), columns = ['shop_id', 'item_id']))
    # back_df = pd.concat(back, ignore_index=True).drop_duplicates()
    # return back_df